1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include "qemu/bitops.h" 87 #include "fpu/softfloat.h" 88 89 /* We only need stdlib for abort() */ 90 91 /*---------------------------------------------------------------------------- 92 | Primitive arithmetic functions, including multi-word arithmetic, and 93 | division and square root approximations. (Can be specialized to target if 94 | desired.) 95 *----------------------------------------------------------------------------*/ 96 #include "softfloat-macros.h" 97 98 /*---------------------------------------------------------------------------- 99 | Functions and definitions to determine: (1) whether tininess for underflow 100 | is detected before or after rounding by default, (2) what (if anything) 101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 103 | are propagated from function inputs to output. These details are target- 104 | specific. 105 *----------------------------------------------------------------------------*/ 106 #include "softfloat-specialize.h" 107 108 /*---------------------------------------------------------------------------- 109 | Returns the fraction bits of the half-precision floating-point value `a'. 110 *----------------------------------------------------------------------------*/ 111 112 static inline uint32_t extractFloat16Frac(float16 a) 113 { 114 return float16_val(a) & 0x3ff; 115 } 116 117 /*---------------------------------------------------------------------------- 118 | Returns the exponent bits of the half-precision floating-point value `a'. 119 *----------------------------------------------------------------------------*/ 120 121 static inline int extractFloat16Exp(float16 a) 122 { 123 return (float16_val(a) >> 10) & 0x1f; 124 } 125 126 /*---------------------------------------------------------------------------- 127 | Returns the sign bit of the single-precision floating-point value `a'. 128 *----------------------------------------------------------------------------*/ 129 130 static inline flag extractFloat16Sign(float16 a) 131 { 132 return float16_val(a)>>15; 133 } 134 135 /*---------------------------------------------------------------------------- 136 | Returns the fraction bits of the single-precision floating-point value `a'. 137 *----------------------------------------------------------------------------*/ 138 139 static inline uint32_t extractFloat32Frac(float32 a) 140 { 141 return float32_val(a) & 0x007FFFFF; 142 } 143 144 /*---------------------------------------------------------------------------- 145 | Returns the exponent bits of the single-precision floating-point value `a'. 146 *----------------------------------------------------------------------------*/ 147 148 static inline int extractFloat32Exp(float32 a) 149 { 150 return (float32_val(a) >> 23) & 0xFF; 151 } 152 153 /*---------------------------------------------------------------------------- 154 | Returns the sign bit of the single-precision floating-point value `a'. 155 *----------------------------------------------------------------------------*/ 156 157 static inline flag extractFloat32Sign(float32 a) 158 { 159 return float32_val(a) >> 31; 160 } 161 162 /*---------------------------------------------------------------------------- 163 | Returns the fraction bits of the double-precision floating-point value `a'. 164 *----------------------------------------------------------------------------*/ 165 166 static inline uint64_t extractFloat64Frac(float64 a) 167 { 168 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF); 169 } 170 171 /*---------------------------------------------------------------------------- 172 | Returns the exponent bits of the double-precision floating-point value `a'. 173 *----------------------------------------------------------------------------*/ 174 175 static inline int extractFloat64Exp(float64 a) 176 { 177 return (float64_val(a) >> 52) & 0x7FF; 178 } 179 180 /*---------------------------------------------------------------------------- 181 | Returns the sign bit of the double-precision floating-point value `a'. 182 *----------------------------------------------------------------------------*/ 183 184 static inline flag extractFloat64Sign(float64 a) 185 { 186 return float64_val(a) >> 63; 187 } 188 189 /* 190 * Classify a floating point number. Everything above float_class_qnan 191 * is a NaN so cls >= float_class_qnan is any NaN. 192 */ 193 194 typedef enum __attribute__ ((__packed__)) { 195 float_class_unclassified, 196 float_class_zero, 197 float_class_normal, 198 float_class_inf, 199 float_class_qnan, /* all NaNs from here */ 200 float_class_snan, 201 float_class_dnan, 202 float_class_msnan, /* maybe silenced */ 203 } FloatClass; 204 205 /* 206 * Structure holding all of the decomposed parts of a float. The 207 * exponent is unbiased and the fraction is normalized. All 208 * calculations are done with a 64 bit fraction and then rounded as 209 * appropriate for the final format. 210 * 211 * Thanks to the packed FloatClass a decent compiler should be able to 212 * fit the whole structure into registers and avoid using the stack 213 * for parameter passing. 214 */ 215 216 typedef struct { 217 uint64_t frac; 218 int32_t exp; 219 FloatClass cls; 220 bool sign; 221 } FloatParts; 222 223 #define DECOMPOSED_BINARY_POINT (64 - 2) 224 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) 225 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1) 226 227 /* Structure holding all of the relevant parameters for a format. 228 * exp_size: the size of the exponent field 229 * exp_bias: the offset applied to the exponent field 230 * exp_max: the maximum normalised exponent 231 * frac_size: the size of the fraction field 232 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 233 * The following are computed based the size of fraction 234 * frac_lsb: least significant bit of fraction 235 * fram_lsbm1: the bit bellow the least significant bit (for rounding) 236 * round_mask/roundeven_mask: masks used for rounding 237 */ 238 typedef struct { 239 int exp_size; 240 int exp_bias; 241 int exp_max; 242 int frac_size; 243 int frac_shift; 244 uint64_t frac_lsb; 245 uint64_t frac_lsbm1; 246 uint64_t round_mask; 247 uint64_t roundeven_mask; 248 } FloatFmt; 249 250 /* Expand fields based on the size of exponent and fraction */ 251 #define FLOAT_PARAMS(E, F) \ 252 .exp_size = E, \ 253 .exp_bias = ((1 << E) - 1) >> 1, \ 254 .exp_max = (1 << E) - 1, \ 255 .frac_size = F, \ 256 .frac_shift = DECOMPOSED_BINARY_POINT - F, \ 257 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \ 258 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \ 259 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \ 260 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1 261 262 static const FloatFmt float16_params = { 263 FLOAT_PARAMS(5, 10) 264 }; 265 266 static const FloatFmt float32_params = { 267 FLOAT_PARAMS(8, 23) 268 }; 269 270 static const FloatFmt float64_params = { 271 FLOAT_PARAMS(11, 52) 272 }; 273 274 /* Unpack a float to parts, but do not canonicalize. */ 275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) 276 { 277 const int sign_pos = fmt.frac_size + fmt.exp_size; 278 279 return (FloatParts) { 280 .cls = float_class_unclassified, 281 .sign = extract64(raw, sign_pos, 1), 282 .exp = extract64(raw, fmt.frac_size, fmt.exp_size), 283 .frac = extract64(raw, 0, fmt.frac_size), 284 }; 285 } 286 287 static inline FloatParts float16_unpack_raw(float16 f) 288 { 289 return unpack_raw(float16_params, f); 290 } 291 292 static inline FloatParts float32_unpack_raw(float32 f) 293 { 294 return unpack_raw(float32_params, f); 295 } 296 297 static inline FloatParts float64_unpack_raw(float64 f) 298 { 299 return unpack_raw(float64_params, f); 300 } 301 302 /* Pack a float from parts, but do not canonicalize. */ 303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p) 304 { 305 const int sign_pos = fmt.frac_size + fmt.exp_size; 306 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp); 307 return deposit64(ret, sign_pos, 1, p.sign); 308 } 309 310 static inline float16 float16_pack_raw(FloatParts p) 311 { 312 return make_float16(pack_raw(float16_params, p)); 313 } 314 315 static inline float32 float32_pack_raw(FloatParts p) 316 { 317 return make_float32(pack_raw(float32_params, p)); 318 } 319 320 static inline float64 float64_pack_raw(FloatParts p) 321 { 322 return make_float64(pack_raw(float64_params, p)); 323 } 324 325 /* Canonicalize EXP and FRAC, setting CLS. */ 326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm, 327 float_status *status) 328 { 329 if (part.exp == parm->exp_max) { 330 if (part.frac == 0) { 331 part.cls = float_class_inf; 332 } else { 333 #ifdef NO_SIGNALING_NANS 334 part.cls = float_class_qnan; 335 #else 336 int64_t msb = part.frac << (parm->frac_shift + 2); 337 if ((msb < 0) == status->snan_bit_is_one) { 338 part.cls = float_class_snan; 339 } else { 340 part.cls = float_class_qnan; 341 } 342 #endif 343 } 344 } else if (part.exp == 0) { 345 if (likely(part.frac == 0)) { 346 part.cls = float_class_zero; 347 } else if (status->flush_inputs_to_zero) { 348 float_raise(float_flag_input_denormal, status); 349 part.cls = float_class_zero; 350 part.frac = 0; 351 } else { 352 int shift = clz64(part.frac) - 1; 353 part.cls = float_class_normal; 354 part.exp = parm->frac_shift - parm->exp_bias - shift + 1; 355 part.frac <<= shift; 356 } 357 } else { 358 part.cls = float_class_normal; 359 part.exp -= parm->exp_bias; 360 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift); 361 } 362 return part; 363 } 364 365 /* Round and uncanonicalize a floating-point number by parts. There 366 * are FRAC_SHIFT bits that may require rounding at the bottom of the 367 * fraction; these bits will be removed. The exponent will be biased 368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0]. 369 */ 370 371 static FloatParts round_canonical(FloatParts p, float_status *s, 372 const FloatFmt *parm) 373 { 374 const uint64_t frac_lsbm1 = parm->frac_lsbm1; 375 const uint64_t round_mask = parm->round_mask; 376 const uint64_t roundeven_mask = parm->roundeven_mask; 377 const int exp_max = parm->exp_max; 378 const int frac_shift = parm->frac_shift; 379 uint64_t frac, inc; 380 int exp, flags = 0; 381 bool overflow_norm; 382 383 frac = p.frac; 384 exp = p.exp; 385 386 switch (p.cls) { 387 case float_class_normal: 388 switch (s->float_rounding_mode) { 389 case float_round_nearest_even: 390 overflow_norm = false; 391 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 392 break; 393 case float_round_ties_away: 394 overflow_norm = false; 395 inc = frac_lsbm1; 396 break; 397 case float_round_to_zero: 398 overflow_norm = true; 399 inc = 0; 400 break; 401 case float_round_up: 402 inc = p.sign ? 0 : round_mask; 403 overflow_norm = p.sign; 404 break; 405 case float_round_down: 406 inc = p.sign ? round_mask : 0; 407 overflow_norm = !p.sign; 408 break; 409 default: 410 g_assert_not_reached(); 411 } 412 413 exp += parm->exp_bias; 414 if (likely(exp > 0)) { 415 if (frac & round_mask) { 416 flags |= float_flag_inexact; 417 frac += inc; 418 if (frac & DECOMPOSED_OVERFLOW_BIT) { 419 frac >>= 1; 420 exp++; 421 } 422 } 423 frac >>= frac_shift; 424 425 if (unlikely(exp >= exp_max)) { 426 flags |= float_flag_overflow | float_flag_inexact; 427 if (overflow_norm) { 428 exp = exp_max - 1; 429 frac = -1; 430 } else { 431 p.cls = float_class_inf; 432 goto do_inf; 433 } 434 } 435 } else if (s->flush_to_zero) { 436 flags |= float_flag_output_denormal; 437 p.cls = float_class_zero; 438 goto do_zero; 439 } else { 440 bool is_tiny = (s->float_detect_tininess 441 == float_tininess_before_rounding) 442 || (exp < 0) 443 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT); 444 445 shift64RightJamming(frac, 1 - exp, &frac); 446 if (frac & round_mask) { 447 /* Need to recompute round-to-even. */ 448 if (s->float_rounding_mode == float_round_nearest_even) { 449 inc = ((frac & roundeven_mask) != frac_lsbm1 450 ? frac_lsbm1 : 0); 451 } 452 flags |= float_flag_inexact; 453 frac += inc; 454 } 455 456 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0); 457 frac >>= frac_shift; 458 459 if (is_tiny && (flags & float_flag_inexact)) { 460 flags |= float_flag_underflow; 461 } 462 if (exp == 0 && frac == 0) { 463 p.cls = float_class_zero; 464 } 465 } 466 break; 467 468 case float_class_zero: 469 do_zero: 470 exp = 0; 471 frac = 0; 472 break; 473 474 case float_class_inf: 475 do_inf: 476 exp = exp_max; 477 frac = 0; 478 break; 479 480 case float_class_qnan: 481 case float_class_snan: 482 exp = exp_max; 483 break; 484 485 default: 486 g_assert_not_reached(); 487 } 488 489 float_raise(flags, s); 490 p.exp = exp; 491 p.frac = frac; 492 return p; 493 } 494 495 static FloatParts float16_unpack_canonical(float16 f, float_status *s) 496 { 497 return canonicalize(float16_unpack_raw(f), &float16_params, s); 498 } 499 500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s) 501 { 502 switch (p.cls) { 503 case float_class_dnan: 504 return float16_default_nan(s); 505 case float_class_msnan: 506 return float16_maybe_silence_nan(float16_pack_raw(p), s); 507 default: 508 p = round_canonical(p, s, &float16_params); 509 return float16_pack_raw(p); 510 } 511 } 512 513 static FloatParts float32_unpack_canonical(float32 f, float_status *s) 514 { 515 return canonicalize(float32_unpack_raw(f), &float32_params, s); 516 } 517 518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s) 519 { 520 switch (p.cls) { 521 case float_class_dnan: 522 return float32_default_nan(s); 523 case float_class_msnan: 524 return float32_maybe_silence_nan(float32_pack_raw(p), s); 525 default: 526 p = round_canonical(p, s, &float32_params); 527 return float32_pack_raw(p); 528 } 529 } 530 531 static FloatParts float64_unpack_canonical(float64 f, float_status *s) 532 { 533 return canonicalize(float64_unpack_raw(f), &float64_params, s); 534 } 535 536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s) 537 { 538 switch (p.cls) { 539 case float_class_dnan: 540 return float64_default_nan(s); 541 case float_class_msnan: 542 return float64_maybe_silence_nan(float64_pack_raw(p), s); 543 default: 544 p = round_canonical(p, s, &float64_params); 545 return float64_pack_raw(p); 546 } 547 } 548 549 /* Simple helpers for checking if what NaN we have */ 550 static bool is_nan(FloatClass c) 551 { 552 return unlikely(c >= float_class_qnan); 553 } 554 static bool is_snan(FloatClass c) 555 { 556 return c == float_class_snan; 557 } 558 static bool is_qnan(FloatClass c) 559 { 560 return c == float_class_qnan; 561 } 562 563 static FloatParts return_nan(FloatParts a, float_status *s) 564 { 565 switch (a.cls) { 566 case float_class_snan: 567 s->float_exception_flags |= float_flag_invalid; 568 a.cls = float_class_msnan; 569 /* fall through */ 570 case float_class_qnan: 571 if (s->default_nan_mode) { 572 a.cls = float_class_dnan; 573 } 574 break; 575 576 default: 577 g_assert_not_reached(); 578 } 579 return a; 580 } 581 582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s) 583 { 584 if (is_snan(a.cls) || is_snan(b.cls)) { 585 s->float_exception_flags |= float_flag_invalid; 586 } 587 588 if (s->default_nan_mode) { 589 a.cls = float_class_dnan; 590 } else { 591 if (pickNaN(is_qnan(a.cls), is_snan(a.cls), 592 is_qnan(b.cls), is_snan(b.cls), 593 a.frac > b.frac || 594 (a.frac == b.frac && a.sign < b.sign))) { 595 a = b; 596 } 597 a.cls = float_class_msnan; 598 } 599 return a; 600 } 601 602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c, 603 bool inf_zero, float_status *s) 604 { 605 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) { 606 s->float_exception_flags |= float_flag_invalid; 607 } 608 609 if (s->default_nan_mode) { 610 a.cls = float_class_dnan; 611 } else { 612 switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls), 613 is_qnan(b.cls), is_snan(b.cls), 614 is_qnan(c.cls), is_snan(c.cls), 615 inf_zero, s)) { 616 case 0: 617 break; 618 case 1: 619 a = b; 620 break; 621 case 2: 622 a = c; 623 break; 624 case 3: 625 a.cls = float_class_dnan; 626 return a; 627 default: 628 g_assert_not_reached(); 629 } 630 631 a.cls = float_class_msnan; 632 } 633 return a; 634 } 635 636 /* 637 * Returns the result of adding or subtracting the values of the 638 * floating-point values `a' and `b'. The operation is performed 639 * according to the IEC/IEEE Standard for Binary Floating-Point 640 * Arithmetic. 641 */ 642 643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract, 644 float_status *s) 645 { 646 bool a_sign = a.sign; 647 bool b_sign = b.sign ^ subtract; 648 649 if (a_sign != b_sign) { 650 /* Subtraction */ 651 652 if (a.cls == float_class_normal && b.cls == float_class_normal) { 653 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) { 654 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 655 a.frac = a.frac - b.frac; 656 } else { 657 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 658 a.frac = b.frac - a.frac; 659 a.exp = b.exp; 660 a_sign ^= 1; 661 } 662 663 if (a.frac == 0) { 664 a.cls = float_class_zero; 665 a.sign = s->float_rounding_mode == float_round_down; 666 } else { 667 int shift = clz64(a.frac) - 1; 668 a.frac = a.frac << shift; 669 a.exp = a.exp - shift; 670 a.sign = a_sign; 671 } 672 return a; 673 } 674 if (is_nan(a.cls) || is_nan(b.cls)) { 675 return pick_nan(a, b, s); 676 } 677 if (a.cls == float_class_inf) { 678 if (b.cls == float_class_inf) { 679 float_raise(float_flag_invalid, s); 680 a.cls = float_class_dnan; 681 } 682 return a; 683 } 684 if (a.cls == float_class_zero && b.cls == float_class_zero) { 685 a.sign = s->float_rounding_mode == float_round_down; 686 return a; 687 } 688 if (a.cls == float_class_zero || b.cls == float_class_inf) { 689 b.sign = a_sign ^ 1; 690 return b; 691 } 692 if (b.cls == float_class_zero) { 693 return a; 694 } 695 } else { 696 /* Addition */ 697 if (a.cls == float_class_normal && b.cls == float_class_normal) { 698 if (a.exp > b.exp) { 699 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 700 } else if (a.exp < b.exp) { 701 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 702 a.exp = b.exp; 703 } 704 a.frac += b.frac; 705 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 706 a.frac >>= 1; 707 a.exp += 1; 708 } 709 return a; 710 } 711 if (is_nan(a.cls) || is_nan(b.cls)) { 712 return pick_nan(a, b, s); 713 } 714 if (a.cls == float_class_inf || b.cls == float_class_zero) { 715 return a; 716 } 717 if (b.cls == float_class_inf || a.cls == float_class_zero) { 718 b.sign = b_sign; 719 return b; 720 } 721 } 722 g_assert_not_reached(); 723 } 724 725 /* 726 * Returns the result of adding or subtracting the floating-point 727 * values `a' and `b'. The operation is performed according to the 728 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 729 */ 730 731 float16 __attribute__((flatten)) float16_add(float16 a, float16 b, 732 float_status *status) 733 { 734 FloatParts pa = float16_unpack_canonical(a, status); 735 FloatParts pb = float16_unpack_canonical(b, status); 736 FloatParts pr = addsub_floats(pa, pb, false, status); 737 738 return float16_round_pack_canonical(pr, status); 739 } 740 741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b, 742 float_status *status) 743 { 744 FloatParts pa = float32_unpack_canonical(a, status); 745 FloatParts pb = float32_unpack_canonical(b, status); 746 FloatParts pr = addsub_floats(pa, pb, false, status); 747 748 return float32_round_pack_canonical(pr, status); 749 } 750 751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b, 752 float_status *status) 753 { 754 FloatParts pa = float64_unpack_canonical(a, status); 755 FloatParts pb = float64_unpack_canonical(b, status); 756 FloatParts pr = addsub_floats(pa, pb, false, status); 757 758 return float64_round_pack_canonical(pr, status); 759 } 760 761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b, 762 float_status *status) 763 { 764 FloatParts pa = float16_unpack_canonical(a, status); 765 FloatParts pb = float16_unpack_canonical(b, status); 766 FloatParts pr = addsub_floats(pa, pb, true, status); 767 768 return float16_round_pack_canonical(pr, status); 769 } 770 771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b, 772 float_status *status) 773 { 774 FloatParts pa = float32_unpack_canonical(a, status); 775 FloatParts pb = float32_unpack_canonical(b, status); 776 FloatParts pr = addsub_floats(pa, pb, true, status); 777 778 return float32_round_pack_canonical(pr, status); 779 } 780 781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b, 782 float_status *status) 783 { 784 FloatParts pa = float64_unpack_canonical(a, status); 785 FloatParts pb = float64_unpack_canonical(b, status); 786 FloatParts pr = addsub_floats(pa, pb, true, status); 787 788 return float64_round_pack_canonical(pr, status); 789 } 790 791 /* 792 * Returns the result of multiplying the floating-point values `a' and 793 * `b'. The operation is performed according to the IEC/IEEE Standard 794 * for Binary Floating-Point Arithmetic. 795 */ 796 797 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s) 798 { 799 bool sign = a.sign ^ b.sign; 800 801 if (a.cls == float_class_normal && b.cls == float_class_normal) { 802 uint64_t hi, lo; 803 int exp = a.exp + b.exp; 804 805 mul64To128(a.frac, b.frac, &hi, &lo); 806 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 807 if (lo & DECOMPOSED_OVERFLOW_BIT) { 808 shift64RightJamming(lo, 1, &lo); 809 exp += 1; 810 } 811 812 /* Re-use a */ 813 a.exp = exp; 814 a.sign = sign; 815 a.frac = lo; 816 return a; 817 } 818 /* handle all the NaN cases */ 819 if (is_nan(a.cls) || is_nan(b.cls)) { 820 return pick_nan(a, b, s); 821 } 822 /* Inf * Zero == NaN */ 823 if ((a.cls == float_class_inf && b.cls == float_class_zero) || 824 (a.cls == float_class_zero && b.cls == float_class_inf)) { 825 s->float_exception_flags |= float_flag_invalid; 826 a.cls = float_class_dnan; 827 a.sign = sign; 828 return a; 829 } 830 /* Multiply by 0 or Inf */ 831 if (a.cls == float_class_inf || a.cls == float_class_zero) { 832 a.sign = sign; 833 return a; 834 } 835 if (b.cls == float_class_inf || b.cls == float_class_zero) { 836 b.sign = sign; 837 return b; 838 } 839 g_assert_not_reached(); 840 } 841 842 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b, 843 float_status *status) 844 { 845 FloatParts pa = float16_unpack_canonical(a, status); 846 FloatParts pb = float16_unpack_canonical(b, status); 847 FloatParts pr = mul_floats(pa, pb, status); 848 849 return float16_round_pack_canonical(pr, status); 850 } 851 852 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b, 853 float_status *status) 854 { 855 FloatParts pa = float32_unpack_canonical(a, status); 856 FloatParts pb = float32_unpack_canonical(b, status); 857 FloatParts pr = mul_floats(pa, pb, status); 858 859 return float32_round_pack_canonical(pr, status); 860 } 861 862 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b, 863 float_status *status) 864 { 865 FloatParts pa = float64_unpack_canonical(a, status); 866 FloatParts pb = float64_unpack_canonical(b, status); 867 FloatParts pr = mul_floats(pa, pb, status); 868 869 return float64_round_pack_canonical(pr, status); 870 } 871 872 /* 873 * Returns the result of multiplying the floating-point values `a' and 874 * `b' then adding 'c', with no intermediate rounding step after the 875 * multiplication. The operation is performed according to the 876 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008. 877 * The flags argument allows the caller to select negation of the 878 * addend, the intermediate product, or the final result. (The 879 * difference between this and having the caller do a separate 880 * negation is that negating externally will flip the sign bit on 881 * NaNs.) 882 */ 883 884 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c, 885 int flags, float_status *s) 886 { 887 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) == 888 ((1 << float_class_inf) | (1 << float_class_zero)); 889 bool p_sign; 890 bool sign_flip = flags & float_muladd_negate_result; 891 FloatClass p_class; 892 uint64_t hi, lo; 893 int p_exp; 894 895 /* It is implementation-defined whether the cases of (0,inf,qnan) 896 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 897 * they return if they do), so we have to hand this information 898 * off to the target-specific pick-a-NaN routine. 899 */ 900 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) { 901 return pick_nan_muladd(a, b, c, inf_zero, s); 902 } 903 904 if (inf_zero) { 905 s->float_exception_flags |= float_flag_invalid; 906 a.cls = float_class_dnan; 907 return a; 908 } 909 910 if (flags & float_muladd_negate_c) { 911 c.sign ^= 1; 912 } 913 914 p_sign = a.sign ^ b.sign; 915 916 if (flags & float_muladd_negate_product) { 917 p_sign ^= 1; 918 } 919 920 if (a.cls == float_class_inf || b.cls == float_class_inf) { 921 p_class = float_class_inf; 922 } else if (a.cls == float_class_zero || b.cls == float_class_zero) { 923 p_class = float_class_zero; 924 } else { 925 p_class = float_class_normal; 926 } 927 928 if (c.cls == float_class_inf) { 929 if (p_class == float_class_inf && p_sign != c.sign) { 930 s->float_exception_flags |= float_flag_invalid; 931 a.cls = float_class_dnan; 932 } else { 933 a.cls = float_class_inf; 934 a.sign = c.sign ^ sign_flip; 935 } 936 return a; 937 } 938 939 if (p_class == float_class_inf) { 940 a.cls = float_class_inf; 941 a.sign = p_sign ^ sign_flip; 942 return a; 943 } 944 945 if (p_class == float_class_zero) { 946 if (c.cls == float_class_zero) { 947 if (p_sign != c.sign) { 948 p_sign = s->float_rounding_mode == float_round_down; 949 } 950 c.sign = p_sign; 951 } else if (flags & float_muladd_halve_result) { 952 c.exp -= 1; 953 } 954 c.sign ^= sign_flip; 955 return c; 956 } 957 958 /* a & b should be normals now... */ 959 assert(a.cls == float_class_normal && 960 b.cls == float_class_normal); 961 962 p_exp = a.exp + b.exp; 963 964 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit 965 * result. 966 */ 967 mul64To128(a.frac, b.frac, &hi, &lo); 968 /* binary point now at bit 124 */ 969 970 /* check for overflow */ 971 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) { 972 shift128RightJamming(hi, lo, 1, &hi, &lo); 973 p_exp += 1; 974 } 975 976 /* + add/sub */ 977 if (c.cls == float_class_zero) { 978 /* move binary point back to 62 */ 979 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 980 } else { 981 int exp_diff = p_exp - c.exp; 982 if (p_sign == c.sign) { 983 /* Addition */ 984 if (exp_diff <= 0) { 985 shift128RightJamming(hi, lo, 986 DECOMPOSED_BINARY_POINT - exp_diff, 987 &hi, &lo); 988 lo += c.frac; 989 p_exp = c.exp; 990 } else { 991 uint64_t c_hi, c_lo; 992 /* shift c to the same binary point as the product (124) */ 993 c_hi = c.frac >> 2; 994 c_lo = 0; 995 shift128RightJamming(c_hi, c_lo, 996 exp_diff, 997 &c_hi, &c_lo); 998 add128(hi, lo, c_hi, c_lo, &hi, &lo); 999 /* move binary point back to 62 */ 1000 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1001 } 1002 1003 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1004 shift64RightJamming(lo, 1, &lo); 1005 p_exp += 1; 1006 } 1007 1008 } else { 1009 /* Subtraction */ 1010 uint64_t c_hi, c_lo; 1011 /* make C binary point match product at bit 124 */ 1012 c_hi = c.frac >> 2; 1013 c_lo = 0; 1014 1015 if (exp_diff <= 0) { 1016 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo); 1017 if (exp_diff == 0 1018 && 1019 (hi > c_hi || (hi == c_hi && lo >= c_lo))) { 1020 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1021 } else { 1022 sub128(c_hi, c_lo, hi, lo, &hi, &lo); 1023 p_sign ^= 1; 1024 p_exp = c.exp; 1025 } 1026 } else { 1027 shift128RightJamming(c_hi, c_lo, 1028 exp_diff, 1029 &c_hi, &c_lo); 1030 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1031 } 1032 1033 if (hi == 0 && lo == 0) { 1034 a.cls = float_class_zero; 1035 a.sign = s->float_rounding_mode == float_round_down; 1036 a.sign ^= sign_flip; 1037 return a; 1038 } else { 1039 int shift; 1040 if (hi != 0) { 1041 shift = clz64(hi); 1042 } else { 1043 shift = clz64(lo) + 64; 1044 } 1045 /* Normalizing to a binary point of 124 is the 1046 correct adjust for the exponent. However since we're 1047 shifting, we might as well put the binary point back 1048 at 62 where we really want it. Therefore shift as 1049 if we're leaving 1 bit at the top of the word, but 1050 adjust the exponent as if we're leaving 3 bits. */ 1051 shift -= 1; 1052 if (shift >= 64) { 1053 lo = lo << (shift - 64); 1054 } else { 1055 hi = (hi << shift) | (lo >> (64 - shift)); 1056 lo = hi | ((lo << shift) != 0); 1057 } 1058 p_exp -= shift - 2; 1059 } 1060 } 1061 } 1062 1063 if (flags & float_muladd_halve_result) { 1064 p_exp -= 1; 1065 } 1066 1067 /* finally prepare our result */ 1068 a.cls = float_class_normal; 1069 a.sign = p_sign ^ sign_flip; 1070 a.exp = p_exp; 1071 a.frac = lo; 1072 1073 return a; 1074 } 1075 1076 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c, 1077 int flags, float_status *status) 1078 { 1079 FloatParts pa = float16_unpack_canonical(a, status); 1080 FloatParts pb = float16_unpack_canonical(b, status); 1081 FloatParts pc = float16_unpack_canonical(c, status); 1082 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1083 1084 return float16_round_pack_canonical(pr, status); 1085 } 1086 1087 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c, 1088 int flags, float_status *status) 1089 { 1090 FloatParts pa = float32_unpack_canonical(a, status); 1091 FloatParts pb = float32_unpack_canonical(b, status); 1092 FloatParts pc = float32_unpack_canonical(c, status); 1093 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1094 1095 return float32_round_pack_canonical(pr, status); 1096 } 1097 1098 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c, 1099 int flags, float_status *status) 1100 { 1101 FloatParts pa = float64_unpack_canonical(a, status); 1102 FloatParts pb = float64_unpack_canonical(b, status); 1103 FloatParts pc = float64_unpack_canonical(c, status); 1104 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1105 1106 return float64_round_pack_canonical(pr, status); 1107 } 1108 1109 /* 1110 * Returns the result of dividing the floating-point value `a' by the 1111 * corresponding value `b'. The operation is performed according to 1112 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1113 */ 1114 1115 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s) 1116 { 1117 bool sign = a.sign ^ b.sign; 1118 1119 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1120 uint64_t temp_lo, temp_hi; 1121 int exp = a.exp - b.exp; 1122 if (a.frac < b.frac) { 1123 exp -= 1; 1124 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, 1125 &temp_hi, &temp_lo); 1126 } else { 1127 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, 1128 &temp_hi, &temp_lo); 1129 } 1130 /* LSB of quot is set if inexact which roundandpack will use 1131 * to set flags. Yet again we re-use a for the result */ 1132 a.frac = div128To64(temp_lo, temp_hi, b.frac); 1133 a.sign = sign; 1134 a.exp = exp; 1135 return a; 1136 } 1137 /* handle all the NaN cases */ 1138 if (is_nan(a.cls) || is_nan(b.cls)) { 1139 return pick_nan(a, b, s); 1140 } 1141 /* 0/0 or Inf/Inf */ 1142 if (a.cls == b.cls 1143 && 1144 (a.cls == float_class_inf || a.cls == float_class_zero)) { 1145 s->float_exception_flags |= float_flag_invalid; 1146 a.cls = float_class_dnan; 1147 return a; 1148 } 1149 /* Div 0 => Inf */ 1150 if (b.cls == float_class_zero) { 1151 s->float_exception_flags |= float_flag_divbyzero; 1152 a.cls = float_class_inf; 1153 a.sign = sign; 1154 return a; 1155 } 1156 /* Inf / x or 0 / x */ 1157 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1158 a.sign = sign; 1159 return a; 1160 } 1161 /* Div by Inf */ 1162 if (b.cls == float_class_inf) { 1163 a.cls = float_class_zero; 1164 a.sign = sign; 1165 return a; 1166 } 1167 g_assert_not_reached(); 1168 } 1169 1170 float16 float16_div(float16 a, float16 b, float_status *status) 1171 { 1172 FloatParts pa = float16_unpack_canonical(a, status); 1173 FloatParts pb = float16_unpack_canonical(b, status); 1174 FloatParts pr = div_floats(pa, pb, status); 1175 1176 return float16_round_pack_canonical(pr, status); 1177 } 1178 1179 float32 float32_div(float32 a, float32 b, float_status *status) 1180 { 1181 FloatParts pa = float32_unpack_canonical(a, status); 1182 FloatParts pb = float32_unpack_canonical(b, status); 1183 FloatParts pr = div_floats(pa, pb, status); 1184 1185 return float32_round_pack_canonical(pr, status); 1186 } 1187 1188 float64 float64_div(float64 a, float64 b, float_status *status) 1189 { 1190 FloatParts pa = float64_unpack_canonical(a, status); 1191 FloatParts pb = float64_unpack_canonical(b, status); 1192 FloatParts pr = div_floats(pa, pb, status); 1193 1194 return float64_round_pack_canonical(pr, status); 1195 } 1196 1197 /* 1198 * Rounds the floating-point value `a' to an integer, and returns the 1199 * result as a floating-point value. The operation is performed 1200 * according to the IEC/IEEE Standard for Binary Floating-Point 1201 * Arithmetic. 1202 */ 1203 1204 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s) 1205 { 1206 if (is_nan(a.cls)) { 1207 return return_nan(a, s); 1208 } 1209 1210 switch (a.cls) { 1211 case float_class_zero: 1212 case float_class_inf: 1213 case float_class_qnan: 1214 /* already "integral" */ 1215 break; 1216 case float_class_normal: 1217 if (a.exp >= DECOMPOSED_BINARY_POINT) { 1218 /* already integral */ 1219 break; 1220 } 1221 if (a.exp < 0) { 1222 bool one; 1223 /* all fractional */ 1224 s->float_exception_flags |= float_flag_inexact; 1225 switch (rounding_mode) { 1226 case float_round_nearest_even: 1227 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT; 1228 break; 1229 case float_round_ties_away: 1230 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT; 1231 break; 1232 case float_round_to_zero: 1233 one = false; 1234 break; 1235 case float_round_up: 1236 one = !a.sign; 1237 break; 1238 case float_round_down: 1239 one = a.sign; 1240 break; 1241 default: 1242 g_assert_not_reached(); 1243 } 1244 1245 if (one) { 1246 a.frac = DECOMPOSED_IMPLICIT_BIT; 1247 a.exp = 0; 1248 } else { 1249 a.cls = float_class_zero; 1250 } 1251 } else { 1252 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp; 1253 uint64_t frac_lsbm1 = frac_lsb >> 1; 1254 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb; 1255 uint64_t rnd_mask = rnd_even_mask >> 1; 1256 uint64_t inc; 1257 1258 switch (rounding_mode) { 1259 case float_round_nearest_even: 1260 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 1261 break; 1262 case float_round_ties_away: 1263 inc = frac_lsbm1; 1264 break; 1265 case float_round_to_zero: 1266 inc = 0; 1267 break; 1268 case float_round_up: 1269 inc = a.sign ? 0 : rnd_mask; 1270 break; 1271 case float_round_down: 1272 inc = a.sign ? rnd_mask : 0; 1273 break; 1274 default: 1275 g_assert_not_reached(); 1276 } 1277 1278 if (a.frac & rnd_mask) { 1279 s->float_exception_flags |= float_flag_inexact; 1280 a.frac += inc; 1281 a.frac &= ~rnd_mask; 1282 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 1283 a.frac >>= 1; 1284 a.exp++; 1285 } 1286 } 1287 } 1288 break; 1289 default: 1290 g_assert_not_reached(); 1291 } 1292 return a; 1293 } 1294 1295 float16 float16_round_to_int(float16 a, float_status *s) 1296 { 1297 FloatParts pa = float16_unpack_canonical(a, s); 1298 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1299 return float16_round_pack_canonical(pr, s); 1300 } 1301 1302 float32 float32_round_to_int(float32 a, float_status *s) 1303 { 1304 FloatParts pa = float32_unpack_canonical(a, s); 1305 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1306 return float32_round_pack_canonical(pr, s); 1307 } 1308 1309 float64 float64_round_to_int(float64 a, float_status *s) 1310 { 1311 FloatParts pa = float64_unpack_canonical(a, s); 1312 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1313 return float64_round_pack_canonical(pr, s); 1314 } 1315 1316 float64 float64_trunc_to_int(float64 a, float_status *s) 1317 { 1318 FloatParts pa = float64_unpack_canonical(a, s); 1319 FloatParts pr = round_to_int(pa, float_round_to_zero, s); 1320 return float64_round_pack_canonical(pr, s); 1321 } 1322 1323 /* 1324 * Returns the result of converting the floating-point value `a' to 1325 * the two's complement integer format. The conversion is performed 1326 * according to the IEC/IEEE Standard for Binary Floating-Point 1327 * Arithmetic---which means in particular that the conversion is 1328 * rounded according to the current rounding mode. If `a' is a NaN, 1329 * the largest positive integer is returned. Otherwise, if the 1330 * conversion overflows, the largest integer with the same sign as `a' 1331 * is returned. 1332 */ 1333 1334 static int64_t round_to_int_and_pack(FloatParts in, int rmode, 1335 int64_t min, int64_t max, 1336 float_status *s) 1337 { 1338 uint64_t r; 1339 int orig_flags = get_float_exception_flags(s); 1340 FloatParts p = round_to_int(in, rmode, s); 1341 1342 switch (p.cls) { 1343 case float_class_snan: 1344 case float_class_qnan: 1345 return max; 1346 case float_class_inf: 1347 return p.sign ? min : max; 1348 case float_class_zero: 1349 return 0; 1350 case float_class_normal: 1351 if (p.exp < DECOMPOSED_BINARY_POINT) { 1352 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 1353 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 1354 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 1355 } else { 1356 r = UINT64_MAX; 1357 } 1358 if (p.sign) { 1359 if (r < -(uint64_t) min) { 1360 return -r; 1361 } else { 1362 s->float_exception_flags = orig_flags | float_flag_invalid; 1363 return min; 1364 } 1365 } else { 1366 if (r < max) { 1367 return r; 1368 } else { 1369 s->float_exception_flags = orig_flags | float_flag_invalid; 1370 return max; 1371 } 1372 } 1373 default: 1374 g_assert_not_reached(); 1375 } 1376 } 1377 1378 #define FLOAT_TO_INT(fsz, isz) \ 1379 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \ 1380 float_status *s) \ 1381 { \ 1382 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1383 return round_to_int_and_pack(p, s->float_rounding_mode, \ 1384 INT ## isz ## _MIN, INT ## isz ## _MAX,\ 1385 s); \ 1386 } \ 1387 \ 1388 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \ 1389 (float ## fsz a, float_status *s) \ 1390 { \ 1391 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1392 return round_to_int_and_pack(p, float_round_to_zero, \ 1393 INT ## isz ## _MIN, INT ## isz ## _MAX,\ 1394 s); \ 1395 } 1396 1397 FLOAT_TO_INT(16, 16) 1398 FLOAT_TO_INT(16, 32) 1399 FLOAT_TO_INT(16, 64) 1400 1401 FLOAT_TO_INT(32, 16) 1402 FLOAT_TO_INT(32, 32) 1403 FLOAT_TO_INT(32, 64) 1404 1405 FLOAT_TO_INT(64, 16) 1406 FLOAT_TO_INT(64, 32) 1407 FLOAT_TO_INT(64, 64) 1408 1409 #undef FLOAT_TO_INT 1410 1411 /* 1412 * Returns the result of converting the floating-point value `a' to 1413 * the unsigned integer format. The conversion is performed according 1414 * to the IEC/IEEE Standard for Binary Floating-Point 1415 * Arithmetic---which means in particular that the conversion is 1416 * rounded according to the current rounding mode. If `a' is a NaN, 1417 * the largest unsigned integer is returned. Otherwise, if the 1418 * conversion overflows, the largest unsigned integer is returned. If 1419 * the 'a' is negative, the result is rounded and zero is returned; 1420 * values that do not round to zero will raise the inexact exception 1421 * flag. 1422 */ 1423 1424 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max, 1425 float_status *s) 1426 { 1427 int orig_flags = get_float_exception_flags(s); 1428 FloatParts p = round_to_int(in, rmode, s); 1429 1430 switch (p.cls) { 1431 case float_class_snan: 1432 case float_class_qnan: 1433 s->float_exception_flags = orig_flags | float_flag_invalid; 1434 return max; 1435 case float_class_inf: 1436 return p.sign ? 0 : max; 1437 case float_class_zero: 1438 return 0; 1439 case float_class_normal: 1440 { 1441 uint64_t r; 1442 if (p.sign) { 1443 s->float_exception_flags = orig_flags | float_flag_invalid; 1444 return 0; 1445 } 1446 1447 if (p.exp < DECOMPOSED_BINARY_POINT) { 1448 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 1449 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 1450 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 1451 } else { 1452 s->float_exception_flags = orig_flags | float_flag_invalid; 1453 return max; 1454 } 1455 1456 /* For uint64 this will never trip, but if p.exp is too large 1457 * to shift a decomposed fraction we shall have exited via the 1458 * 3rd leg above. 1459 */ 1460 if (r > max) { 1461 s->float_exception_flags = orig_flags | float_flag_invalid; 1462 return max; 1463 } else { 1464 return r; 1465 } 1466 } 1467 default: 1468 g_assert_not_reached(); 1469 } 1470 } 1471 1472 #define FLOAT_TO_UINT(fsz, isz) \ 1473 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \ 1474 float_status *s) \ 1475 { \ 1476 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1477 return round_to_uint_and_pack(p, s->float_rounding_mode, \ 1478 UINT ## isz ## _MAX, s); \ 1479 } \ 1480 \ 1481 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \ 1482 (float ## fsz a, float_status *s) \ 1483 { \ 1484 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1485 return round_to_uint_and_pack(p, s->float_rounding_mode, \ 1486 UINT ## isz ## _MAX, s); \ 1487 } 1488 1489 FLOAT_TO_UINT(16, 16) 1490 FLOAT_TO_UINT(16, 32) 1491 FLOAT_TO_UINT(16, 64) 1492 1493 FLOAT_TO_UINT(32, 16) 1494 FLOAT_TO_UINT(32, 32) 1495 FLOAT_TO_UINT(32, 64) 1496 1497 FLOAT_TO_UINT(64, 16) 1498 FLOAT_TO_UINT(64, 32) 1499 FLOAT_TO_UINT(64, 64) 1500 1501 #undef FLOAT_TO_UINT 1502 1503 /* 1504 * Integer to float conversions 1505 * 1506 * Returns the result of converting the two's complement integer `a' 1507 * to the floating-point format. The conversion is performed according 1508 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1509 */ 1510 1511 static FloatParts int_to_float(int64_t a, float_status *status) 1512 { 1513 FloatParts r; 1514 if (a == 0) { 1515 r.cls = float_class_zero; 1516 r.sign = false; 1517 } else if (a == (1ULL << 63)) { 1518 r.cls = float_class_normal; 1519 r.sign = true; 1520 r.frac = DECOMPOSED_IMPLICIT_BIT; 1521 r.exp = 63; 1522 } else { 1523 uint64_t f; 1524 if (a < 0) { 1525 f = -a; 1526 r.sign = true; 1527 } else { 1528 f = a; 1529 r.sign = false; 1530 } 1531 int shift = clz64(f) - 1; 1532 r.cls = float_class_normal; 1533 r.exp = (DECOMPOSED_BINARY_POINT - shift); 1534 r.frac = f << shift; 1535 } 1536 1537 return r; 1538 } 1539 1540 float16 int64_to_float16(int64_t a, float_status *status) 1541 { 1542 FloatParts pa = int_to_float(a, status); 1543 return float16_round_pack_canonical(pa, status); 1544 } 1545 1546 float16 int32_to_float16(int32_t a, float_status *status) 1547 { 1548 return int64_to_float16(a, status); 1549 } 1550 1551 float16 int16_to_float16(int16_t a, float_status *status) 1552 { 1553 return int64_to_float16(a, status); 1554 } 1555 1556 float32 int64_to_float32(int64_t a, float_status *status) 1557 { 1558 FloatParts pa = int_to_float(a, status); 1559 return float32_round_pack_canonical(pa, status); 1560 } 1561 1562 float32 int32_to_float32(int32_t a, float_status *status) 1563 { 1564 return int64_to_float32(a, status); 1565 } 1566 1567 float32 int16_to_float32(int16_t a, float_status *status) 1568 { 1569 return int64_to_float32(a, status); 1570 } 1571 1572 float64 int64_to_float64(int64_t a, float_status *status) 1573 { 1574 FloatParts pa = int_to_float(a, status); 1575 return float64_round_pack_canonical(pa, status); 1576 } 1577 1578 float64 int32_to_float64(int32_t a, float_status *status) 1579 { 1580 return int64_to_float64(a, status); 1581 } 1582 1583 float64 int16_to_float64(int16_t a, float_status *status) 1584 { 1585 return int64_to_float64(a, status); 1586 } 1587 1588 1589 /* 1590 * Unsigned Integer to float conversions 1591 * 1592 * Returns the result of converting the unsigned integer `a' to the 1593 * floating-point format. The conversion is performed according to the 1594 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1595 */ 1596 1597 static FloatParts uint_to_float(uint64_t a, float_status *status) 1598 { 1599 FloatParts r = { .sign = false}; 1600 1601 if (a == 0) { 1602 r.cls = float_class_zero; 1603 } else { 1604 int spare_bits = clz64(a) - 1; 1605 r.cls = float_class_normal; 1606 r.exp = DECOMPOSED_BINARY_POINT - spare_bits; 1607 if (spare_bits < 0) { 1608 shift64RightJamming(a, -spare_bits, &a); 1609 r.frac = a; 1610 } else { 1611 r.frac = a << spare_bits; 1612 } 1613 } 1614 1615 return r; 1616 } 1617 1618 float16 uint64_to_float16(uint64_t a, float_status *status) 1619 { 1620 FloatParts pa = uint_to_float(a, status); 1621 return float16_round_pack_canonical(pa, status); 1622 } 1623 1624 float16 uint32_to_float16(uint32_t a, float_status *status) 1625 { 1626 return uint64_to_float16(a, status); 1627 } 1628 1629 float16 uint16_to_float16(uint16_t a, float_status *status) 1630 { 1631 return uint64_to_float16(a, status); 1632 } 1633 1634 float32 uint64_to_float32(uint64_t a, float_status *status) 1635 { 1636 FloatParts pa = uint_to_float(a, status); 1637 return float32_round_pack_canonical(pa, status); 1638 } 1639 1640 float32 uint32_to_float32(uint32_t a, float_status *status) 1641 { 1642 return uint64_to_float32(a, status); 1643 } 1644 1645 float32 uint16_to_float32(uint16_t a, float_status *status) 1646 { 1647 return uint64_to_float32(a, status); 1648 } 1649 1650 float64 uint64_to_float64(uint64_t a, float_status *status) 1651 { 1652 FloatParts pa = uint_to_float(a, status); 1653 return float64_round_pack_canonical(pa, status); 1654 } 1655 1656 float64 uint32_to_float64(uint32_t a, float_status *status) 1657 { 1658 return uint64_to_float64(a, status); 1659 } 1660 1661 float64 uint16_to_float64(uint16_t a, float_status *status) 1662 { 1663 return uint64_to_float64(a, status); 1664 } 1665 1666 /* Float Min/Max */ 1667 /* min() and max() functions. These can't be implemented as 1668 * 'compare and pick one input' because that would mishandle 1669 * NaNs and +0 vs -0. 1670 * 1671 * minnum() and maxnum() functions. These are similar to the min() 1672 * and max() functions but if one of the arguments is a QNaN and 1673 * the other is numerical then the numerical argument is returned. 1674 * SNaNs will get quietened before being returned. 1675 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 1676 * and maxNum() operations. min() and max() are the typical min/max 1677 * semantics provided by many CPUs which predate that specification. 1678 * 1679 * minnummag() and maxnummag() functions correspond to minNumMag() 1680 * and minNumMag() from the IEEE-754 2008. 1681 */ 1682 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin, 1683 bool ieee, bool ismag, float_status *s) 1684 { 1685 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 1686 if (ieee) { 1687 /* Takes two floating-point values `a' and `b', one of 1688 * which is a NaN, and returns the appropriate NaN 1689 * result. If either `a' or `b' is a signaling NaN, 1690 * the invalid exception is raised. 1691 */ 1692 if (is_snan(a.cls) || is_snan(b.cls)) { 1693 return pick_nan(a, b, s); 1694 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 1695 return b; 1696 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 1697 return a; 1698 } 1699 } 1700 return pick_nan(a, b, s); 1701 } else { 1702 int a_exp, b_exp; 1703 bool a_sign, b_sign; 1704 1705 switch (a.cls) { 1706 case float_class_normal: 1707 a_exp = a.exp; 1708 break; 1709 case float_class_inf: 1710 a_exp = INT_MAX; 1711 break; 1712 case float_class_zero: 1713 a_exp = INT_MIN; 1714 break; 1715 default: 1716 g_assert_not_reached(); 1717 break; 1718 } 1719 switch (b.cls) { 1720 case float_class_normal: 1721 b_exp = b.exp; 1722 break; 1723 case float_class_inf: 1724 b_exp = INT_MAX; 1725 break; 1726 case float_class_zero: 1727 b_exp = INT_MIN; 1728 break; 1729 default: 1730 g_assert_not_reached(); 1731 break; 1732 } 1733 1734 a_sign = a.sign; 1735 b_sign = b.sign; 1736 if (ismag) { 1737 a_sign = b_sign = 0; 1738 } 1739 1740 if (a_sign == b_sign) { 1741 bool a_less = a_exp < b_exp; 1742 if (a_exp == b_exp) { 1743 a_less = a.frac < b.frac; 1744 } 1745 return a_sign ^ a_less ^ ismin ? b : a; 1746 } else { 1747 return a_sign ^ ismin ? b : a; 1748 } 1749 } 1750 } 1751 1752 #define MINMAX(sz, name, ismin, isiee, ismag) \ 1753 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 1754 float_status *s) \ 1755 { \ 1756 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 1757 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 1758 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 1759 \ 1760 return float ## sz ## _round_pack_canonical(pr, s); \ 1761 } 1762 1763 MINMAX(16, min, true, false, false) 1764 MINMAX(16, minnum, true, true, false) 1765 MINMAX(16, minnummag, true, true, true) 1766 MINMAX(16, max, false, false, false) 1767 MINMAX(16, maxnum, false, true, false) 1768 MINMAX(16, maxnummag, false, true, true) 1769 1770 MINMAX(32, min, true, false, false) 1771 MINMAX(32, minnum, true, true, false) 1772 MINMAX(32, minnummag, true, true, true) 1773 MINMAX(32, max, false, false, false) 1774 MINMAX(32, maxnum, false, true, false) 1775 MINMAX(32, maxnummag, false, true, true) 1776 1777 MINMAX(64, min, true, false, false) 1778 MINMAX(64, minnum, true, true, false) 1779 MINMAX(64, minnummag, true, true, true) 1780 MINMAX(64, max, false, false, false) 1781 MINMAX(64, maxnum, false, true, false) 1782 MINMAX(64, maxnummag, false, true, true) 1783 1784 #undef MINMAX 1785 1786 /* Multiply A by 2 raised to the power N. */ 1787 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s) 1788 { 1789 if (unlikely(is_nan(a.cls))) { 1790 return return_nan(a, s); 1791 } 1792 if (a.cls == float_class_normal) { 1793 a.exp += n; 1794 } 1795 return a; 1796 } 1797 1798 float16 float16_scalbn(float16 a, int n, float_status *status) 1799 { 1800 FloatParts pa = float16_unpack_canonical(a, status); 1801 FloatParts pr = scalbn_decomposed(pa, n, status); 1802 return float16_round_pack_canonical(pr, status); 1803 } 1804 1805 float32 float32_scalbn(float32 a, int n, float_status *status) 1806 { 1807 FloatParts pa = float32_unpack_canonical(a, status); 1808 FloatParts pr = scalbn_decomposed(pa, n, status); 1809 return float32_round_pack_canonical(pr, status); 1810 } 1811 1812 float64 float64_scalbn(float64 a, int n, float_status *status) 1813 { 1814 FloatParts pa = float64_unpack_canonical(a, status); 1815 FloatParts pr = scalbn_decomposed(pa, n, status); 1816 return float64_round_pack_canonical(pr, status); 1817 } 1818 1819 /*---------------------------------------------------------------------------- 1820 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 1821 | and 7, and returns the properly rounded 32-bit integer corresponding to the 1822 | input. If `zSign' is 1, the input is negated before being converted to an 1823 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 1824 | is simply rounded to an integer, with the inexact exception raised if the 1825 | input cannot be represented exactly as an integer. However, if the fixed- 1826 | point input is too large, the invalid exception is raised and the largest 1827 | positive or negative integer is returned. 1828 *----------------------------------------------------------------------------*/ 1829 1830 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status) 1831 { 1832 int8_t roundingMode; 1833 flag roundNearestEven; 1834 int8_t roundIncrement, roundBits; 1835 int32_t z; 1836 1837 roundingMode = status->float_rounding_mode; 1838 roundNearestEven = ( roundingMode == float_round_nearest_even ); 1839 switch (roundingMode) { 1840 case float_round_nearest_even: 1841 case float_round_ties_away: 1842 roundIncrement = 0x40; 1843 break; 1844 case float_round_to_zero: 1845 roundIncrement = 0; 1846 break; 1847 case float_round_up: 1848 roundIncrement = zSign ? 0 : 0x7f; 1849 break; 1850 case float_round_down: 1851 roundIncrement = zSign ? 0x7f : 0; 1852 break; 1853 default: 1854 abort(); 1855 } 1856 roundBits = absZ & 0x7F; 1857 absZ = ( absZ + roundIncrement )>>7; 1858 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 1859 z = absZ; 1860 if ( zSign ) z = - z; 1861 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 1862 float_raise(float_flag_invalid, status); 1863 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 1864 } 1865 if (roundBits) { 1866 status->float_exception_flags |= float_flag_inexact; 1867 } 1868 return z; 1869 1870 } 1871 1872 /*---------------------------------------------------------------------------- 1873 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 1874 | `absZ1', with binary point between bits 63 and 64 (between the input words), 1875 | and returns the properly rounded 64-bit integer corresponding to the input. 1876 | If `zSign' is 1, the input is negated before being converted to an integer. 1877 | Ordinarily, the fixed-point input is simply rounded to an integer, with 1878 | the inexact exception raised if the input cannot be represented exactly as 1879 | an integer. However, if the fixed-point input is too large, the invalid 1880 | exception is raised and the largest positive or negative integer is 1881 | returned. 1882 *----------------------------------------------------------------------------*/ 1883 1884 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 1885 float_status *status) 1886 { 1887 int8_t roundingMode; 1888 flag roundNearestEven, increment; 1889 int64_t z; 1890 1891 roundingMode = status->float_rounding_mode; 1892 roundNearestEven = ( roundingMode == float_round_nearest_even ); 1893 switch (roundingMode) { 1894 case float_round_nearest_even: 1895 case float_round_ties_away: 1896 increment = ((int64_t) absZ1 < 0); 1897 break; 1898 case float_round_to_zero: 1899 increment = 0; 1900 break; 1901 case float_round_up: 1902 increment = !zSign && absZ1; 1903 break; 1904 case float_round_down: 1905 increment = zSign && absZ1; 1906 break; 1907 default: 1908 abort(); 1909 } 1910 if ( increment ) { 1911 ++absZ0; 1912 if ( absZ0 == 0 ) goto overflow; 1913 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 1914 } 1915 z = absZ0; 1916 if ( zSign ) z = - z; 1917 if ( z && ( ( z < 0 ) ^ zSign ) ) { 1918 overflow: 1919 float_raise(float_flag_invalid, status); 1920 return 1921 zSign ? (int64_t) LIT64( 0x8000000000000000 ) 1922 : LIT64( 0x7FFFFFFFFFFFFFFF ); 1923 } 1924 if (absZ1) { 1925 status->float_exception_flags |= float_flag_inexact; 1926 } 1927 return z; 1928 1929 } 1930 1931 /*---------------------------------------------------------------------------- 1932 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 1933 | `absZ1', with binary point between bits 63 and 64 (between the input words), 1934 | and returns the properly rounded 64-bit unsigned integer corresponding to the 1935 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 1936 | with the inexact exception raised if the input cannot be represented exactly 1937 | as an integer. However, if the fixed-point input is too large, the invalid 1938 | exception is raised and the largest unsigned integer is returned. 1939 *----------------------------------------------------------------------------*/ 1940 1941 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0, 1942 uint64_t absZ1, float_status *status) 1943 { 1944 int8_t roundingMode; 1945 flag roundNearestEven, increment; 1946 1947 roundingMode = status->float_rounding_mode; 1948 roundNearestEven = (roundingMode == float_round_nearest_even); 1949 switch (roundingMode) { 1950 case float_round_nearest_even: 1951 case float_round_ties_away: 1952 increment = ((int64_t)absZ1 < 0); 1953 break; 1954 case float_round_to_zero: 1955 increment = 0; 1956 break; 1957 case float_round_up: 1958 increment = !zSign && absZ1; 1959 break; 1960 case float_round_down: 1961 increment = zSign && absZ1; 1962 break; 1963 default: 1964 abort(); 1965 } 1966 if (increment) { 1967 ++absZ0; 1968 if (absZ0 == 0) { 1969 float_raise(float_flag_invalid, status); 1970 return LIT64(0xFFFFFFFFFFFFFFFF); 1971 } 1972 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 1973 } 1974 1975 if (zSign && absZ0) { 1976 float_raise(float_flag_invalid, status); 1977 return 0; 1978 } 1979 1980 if (absZ1) { 1981 status->float_exception_flags |= float_flag_inexact; 1982 } 1983 return absZ0; 1984 } 1985 1986 /*---------------------------------------------------------------------------- 1987 | If `a' is denormal and we are in flush-to-zero mode then set the 1988 | input-denormal exception and return zero. Otherwise just return the value. 1989 *----------------------------------------------------------------------------*/ 1990 float32 float32_squash_input_denormal(float32 a, float_status *status) 1991 { 1992 if (status->flush_inputs_to_zero) { 1993 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 1994 float_raise(float_flag_input_denormal, status); 1995 return make_float32(float32_val(a) & 0x80000000); 1996 } 1997 } 1998 return a; 1999 } 2000 2001 /*---------------------------------------------------------------------------- 2002 | Normalizes the subnormal single-precision floating-point value represented 2003 | by the denormalized significand `aSig'. The normalized exponent and 2004 | significand are stored at the locations pointed to by `zExpPtr' and 2005 | `zSigPtr', respectively. 2006 *----------------------------------------------------------------------------*/ 2007 2008 static void 2009 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 2010 { 2011 int8_t shiftCount; 2012 2013 shiftCount = countLeadingZeros32( aSig ) - 8; 2014 *zSigPtr = aSig<<shiftCount; 2015 *zExpPtr = 1 - shiftCount; 2016 2017 } 2018 2019 /*---------------------------------------------------------------------------- 2020 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 2021 | single-precision floating-point value, returning the result. After being 2022 | shifted into the proper positions, the three fields are simply added 2023 | together to form the result. This means that any integer portion of `zSig' 2024 | will be added into the exponent. Since a properly normalized significand 2025 | will have an integer portion equal to 1, the `zExp' input should be 1 less 2026 | than the desired result exponent whenever `zSig' is a complete, normalized 2027 | significand. 2028 *----------------------------------------------------------------------------*/ 2029 2030 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig) 2031 { 2032 2033 return make_float32( 2034 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig); 2035 2036 } 2037 2038 /*---------------------------------------------------------------------------- 2039 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2040 | and significand `zSig', and returns the proper single-precision floating- 2041 | point value corresponding to the abstract input. Ordinarily, the abstract 2042 | value is simply rounded and packed into the single-precision format, with 2043 | the inexact exception raised if the abstract input cannot be represented 2044 | exactly. However, if the abstract value is too large, the overflow and 2045 | inexact exceptions are raised and an infinity or maximal finite value is 2046 | returned. If the abstract value is too small, the input value is rounded to 2047 | a subnormal number, and the underflow and inexact exceptions are raised if 2048 | the abstract input cannot be represented exactly as a subnormal single- 2049 | precision floating-point number. 2050 | The input significand `zSig' has its binary point between bits 30 2051 | and 29, which is 7 bits to the left of the usual location. This shifted 2052 | significand must be normalized or smaller. If `zSig' is not normalized, 2053 | `zExp' must be 0; in that case, the result returned is a subnormal number, 2054 | and it must not require rounding. In the usual case that `zSig' is 2055 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 2056 | The handling of underflow and overflow follows the IEC/IEEE Standard for 2057 | Binary Floating-Point Arithmetic. 2058 *----------------------------------------------------------------------------*/ 2059 2060 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 2061 float_status *status) 2062 { 2063 int8_t roundingMode; 2064 flag roundNearestEven; 2065 int8_t roundIncrement, roundBits; 2066 flag isTiny; 2067 2068 roundingMode = status->float_rounding_mode; 2069 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2070 switch (roundingMode) { 2071 case float_round_nearest_even: 2072 case float_round_ties_away: 2073 roundIncrement = 0x40; 2074 break; 2075 case float_round_to_zero: 2076 roundIncrement = 0; 2077 break; 2078 case float_round_up: 2079 roundIncrement = zSign ? 0 : 0x7f; 2080 break; 2081 case float_round_down: 2082 roundIncrement = zSign ? 0x7f : 0; 2083 break; 2084 default: 2085 abort(); 2086 break; 2087 } 2088 roundBits = zSig & 0x7F; 2089 if ( 0xFD <= (uint16_t) zExp ) { 2090 if ( ( 0xFD < zExp ) 2091 || ( ( zExp == 0xFD ) 2092 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 2093 ) { 2094 float_raise(float_flag_overflow | float_flag_inexact, status); 2095 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 2096 } 2097 if ( zExp < 0 ) { 2098 if (status->flush_to_zero) { 2099 float_raise(float_flag_output_denormal, status); 2100 return packFloat32(zSign, 0, 0); 2101 } 2102 isTiny = 2103 (status->float_detect_tininess 2104 == float_tininess_before_rounding) 2105 || ( zExp < -1 ) 2106 || ( zSig + roundIncrement < 0x80000000 ); 2107 shift32RightJamming( zSig, - zExp, &zSig ); 2108 zExp = 0; 2109 roundBits = zSig & 0x7F; 2110 if (isTiny && roundBits) { 2111 float_raise(float_flag_underflow, status); 2112 } 2113 } 2114 } 2115 if (roundBits) { 2116 status->float_exception_flags |= float_flag_inexact; 2117 } 2118 zSig = ( zSig + roundIncrement )>>7; 2119 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 2120 if ( zSig == 0 ) zExp = 0; 2121 return packFloat32( zSign, zExp, zSig ); 2122 2123 } 2124 2125 /*---------------------------------------------------------------------------- 2126 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2127 | and significand `zSig', and returns the proper single-precision floating- 2128 | point value corresponding to the abstract input. This routine is just like 2129 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 2130 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 2131 | floating-point exponent. 2132 *----------------------------------------------------------------------------*/ 2133 2134 static float32 2135 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 2136 float_status *status) 2137 { 2138 int8_t shiftCount; 2139 2140 shiftCount = countLeadingZeros32( zSig ) - 1; 2141 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 2142 status); 2143 2144 } 2145 2146 /*---------------------------------------------------------------------------- 2147 | If `a' is denormal and we are in flush-to-zero mode then set the 2148 | input-denormal exception and return zero. Otherwise just return the value. 2149 *----------------------------------------------------------------------------*/ 2150 float64 float64_squash_input_denormal(float64 a, float_status *status) 2151 { 2152 if (status->flush_inputs_to_zero) { 2153 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 2154 float_raise(float_flag_input_denormal, status); 2155 return make_float64(float64_val(a) & (1ULL << 63)); 2156 } 2157 } 2158 return a; 2159 } 2160 2161 /*---------------------------------------------------------------------------- 2162 | Normalizes the subnormal double-precision floating-point value represented 2163 | by the denormalized significand `aSig'. The normalized exponent and 2164 | significand are stored at the locations pointed to by `zExpPtr' and 2165 | `zSigPtr', respectively. 2166 *----------------------------------------------------------------------------*/ 2167 2168 static void 2169 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 2170 { 2171 int8_t shiftCount; 2172 2173 shiftCount = countLeadingZeros64( aSig ) - 11; 2174 *zSigPtr = aSig<<shiftCount; 2175 *zExpPtr = 1 - shiftCount; 2176 2177 } 2178 2179 /*---------------------------------------------------------------------------- 2180 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 2181 | double-precision floating-point value, returning the result. After being 2182 | shifted into the proper positions, the three fields are simply added 2183 | together to form the result. This means that any integer portion of `zSig' 2184 | will be added into the exponent. Since a properly normalized significand 2185 | will have an integer portion equal to 1, the `zExp' input should be 1 less 2186 | than the desired result exponent whenever `zSig' is a complete, normalized 2187 | significand. 2188 *----------------------------------------------------------------------------*/ 2189 2190 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 2191 { 2192 2193 return make_float64( 2194 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 2195 2196 } 2197 2198 /*---------------------------------------------------------------------------- 2199 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2200 | and significand `zSig', and returns the proper double-precision floating- 2201 | point value corresponding to the abstract input. Ordinarily, the abstract 2202 | value is simply rounded and packed into the double-precision format, with 2203 | the inexact exception raised if the abstract input cannot be represented 2204 | exactly. However, if the abstract value is too large, the overflow and 2205 | inexact exceptions are raised and an infinity or maximal finite value is 2206 | returned. If the abstract value is too small, the input value is rounded to 2207 | a subnormal number, and the underflow and inexact exceptions are raised if 2208 | the abstract input cannot be represented exactly as a subnormal double- 2209 | precision floating-point number. 2210 | The input significand `zSig' has its binary point between bits 62 2211 | and 61, which is 10 bits to the left of the usual location. This shifted 2212 | significand must be normalized or smaller. If `zSig' is not normalized, 2213 | `zExp' must be 0; in that case, the result returned is a subnormal number, 2214 | and it must not require rounding. In the usual case that `zSig' is 2215 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 2216 | The handling of underflow and overflow follows the IEC/IEEE Standard for 2217 | Binary Floating-Point Arithmetic. 2218 *----------------------------------------------------------------------------*/ 2219 2220 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 2221 float_status *status) 2222 { 2223 int8_t roundingMode; 2224 flag roundNearestEven; 2225 int roundIncrement, roundBits; 2226 flag isTiny; 2227 2228 roundingMode = status->float_rounding_mode; 2229 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2230 switch (roundingMode) { 2231 case float_round_nearest_even: 2232 case float_round_ties_away: 2233 roundIncrement = 0x200; 2234 break; 2235 case float_round_to_zero: 2236 roundIncrement = 0; 2237 break; 2238 case float_round_up: 2239 roundIncrement = zSign ? 0 : 0x3ff; 2240 break; 2241 case float_round_down: 2242 roundIncrement = zSign ? 0x3ff : 0; 2243 break; 2244 case float_round_to_odd: 2245 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 2246 break; 2247 default: 2248 abort(); 2249 } 2250 roundBits = zSig & 0x3FF; 2251 if ( 0x7FD <= (uint16_t) zExp ) { 2252 if ( ( 0x7FD < zExp ) 2253 || ( ( zExp == 0x7FD ) 2254 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 2255 ) { 2256 bool overflow_to_inf = roundingMode != float_round_to_odd && 2257 roundIncrement != 0; 2258 float_raise(float_flag_overflow | float_flag_inexact, status); 2259 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 2260 } 2261 if ( zExp < 0 ) { 2262 if (status->flush_to_zero) { 2263 float_raise(float_flag_output_denormal, status); 2264 return packFloat64(zSign, 0, 0); 2265 } 2266 isTiny = 2267 (status->float_detect_tininess 2268 == float_tininess_before_rounding) 2269 || ( zExp < -1 ) 2270 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 2271 shift64RightJamming( zSig, - zExp, &zSig ); 2272 zExp = 0; 2273 roundBits = zSig & 0x3FF; 2274 if (isTiny && roundBits) { 2275 float_raise(float_flag_underflow, status); 2276 } 2277 if (roundingMode == float_round_to_odd) { 2278 /* 2279 * For round-to-odd case, the roundIncrement depends on 2280 * zSig which just changed. 2281 */ 2282 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 2283 } 2284 } 2285 } 2286 if (roundBits) { 2287 status->float_exception_flags |= float_flag_inexact; 2288 } 2289 zSig = ( zSig + roundIncrement )>>10; 2290 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 2291 if ( zSig == 0 ) zExp = 0; 2292 return packFloat64( zSign, zExp, zSig ); 2293 2294 } 2295 2296 /*---------------------------------------------------------------------------- 2297 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2298 | and significand `zSig', and returns the proper double-precision floating- 2299 | point value corresponding to the abstract input. This routine is just like 2300 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 2301 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 2302 | floating-point exponent. 2303 *----------------------------------------------------------------------------*/ 2304 2305 static float64 2306 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 2307 float_status *status) 2308 { 2309 int8_t shiftCount; 2310 2311 shiftCount = countLeadingZeros64( zSig ) - 1; 2312 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 2313 status); 2314 2315 } 2316 2317 /*---------------------------------------------------------------------------- 2318 | Returns the fraction bits of the extended double-precision floating-point 2319 | value `a'. 2320 *----------------------------------------------------------------------------*/ 2321 2322 static inline uint64_t extractFloatx80Frac( floatx80 a ) 2323 { 2324 2325 return a.low; 2326 2327 } 2328 2329 /*---------------------------------------------------------------------------- 2330 | Returns the exponent bits of the extended double-precision floating-point 2331 | value `a'. 2332 *----------------------------------------------------------------------------*/ 2333 2334 static inline int32_t extractFloatx80Exp( floatx80 a ) 2335 { 2336 2337 return a.high & 0x7FFF; 2338 2339 } 2340 2341 /*---------------------------------------------------------------------------- 2342 | Returns the sign bit of the extended double-precision floating-point value 2343 | `a'. 2344 *----------------------------------------------------------------------------*/ 2345 2346 static inline flag extractFloatx80Sign( floatx80 a ) 2347 { 2348 2349 return a.high>>15; 2350 2351 } 2352 2353 /*---------------------------------------------------------------------------- 2354 | Normalizes the subnormal extended double-precision floating-point value 2355 | represented by the denormalized significand `aSig'. The normalized exponent 2356 | and significand are stored at the locations pointed to by `zExpPtr' and 2357 | `zSigPtr', respectively. 2358 *----------------------------------------------------------------------------*/ 2359 2360 static void 2361 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr ) 2362 { 2363 int8_t shiftCount; 2364 2365 shiftCount = countLeadingZeros64( aSig ); 2366 *zSigPtr = aSig<<shiftCount; 2367 *zExpPtr = 1 - shiftCount; 2368 2369 } 2370 2371 /*---------------------------------------------------------------------------- 2372 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 2373 | extended double-precision floating-point value, returning the result. 2374 *----------------------------------------------------------------------------*/ 2375 2376 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig ) 2377 { 2378 floatx80 z; 2379 2380 z.low = zSig; 2381 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp; 2382 return z; 2383 2384 } 2385 2386 /*---------------------------------------------------------------------------- 2387 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2388 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 2389 | and returns the proper extended double-precision floating-point value 2390 | corresponding to the abstract input. Ordinarily, the abstract value is 2391 | rounded and packed into the extended double-precision format, with the 2392 | inexact exception raised if the abstract input cannot be represented 2393 | exactly. However, if the abstract value is too large, the overflow and 2394 | inexact exceptions are raised and an infinity or maximal finite value is 2395 | returned. If the abstract value is too small, the input value is rounded to 2396 | a subnormal number, and the underflow and inexact exceptions are raised if 2397 | the abstract input cannot be represented exactly as a subnormal extended 2398 | double-precision floating-point number. 2399 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 2400 | number of bits as single or double precision, respectively. Otherwise, the 2401 | result is rounded to the full precision of the extended double-precision 2402 | format. 2403 | The input significand must be normalized or smaller. If the input 2404 | significand is not normalized, `zExp' must be 0; in that case, the result 2405 | returned is a subnormal number, and it must not require rounding. The 2406 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 2407 | Floating-Point Arithmetic. 2408 *----------------------------------------------------------------------------*/ 2409 2410 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign, 2411 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 2412 float_status *status) 2413 { 2414 int8_t roundingMode; 2415 flag roundNearestEven, increment, isTiny; 2416 int64_t roundIncrement, roundMask, roundBits; 2417 2418 roundingMode = status->float_rounding_mode; 2419 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2420 if ( roundingPrecision == 80 ) goto precision80; 2421 if ( roundingPrecision == 64 ) { 2422 roundIncrement = LIT64( 0x0000000000000400 ); 2423 roundMask = LIT64( 0x00000000000007FF ); 2424 } 2425 else if ( roundingPrecision == 32 ) { 2426 roundIncrement = LIT64( 0x0000008000000000 ); 2427 roundMask = LIT64( 0x000000FFFFFFFFFF ); 2428 } 2429 else { 2430 goto precision80; 2431 } 2432 zSig0 |= ( zSig1 != 0 ); 2433 switch (roundingMode) { 2434 case float_round_nearest_even: 2435 case float_round_ties_away: 2436 break; 2437 case float_round_to_zero: 2438 roundIncrement = 0; 2439 break; 2440 case float_round_up: 2441 roundIncrement = zSign ? 0 : roundMask; 2442 break; 2443 case float_round_down: 2444 roundIncrement = zSign ? roundMask : 0; 2445 break; 2446 default: 2447 abort(); 2448 } 2449 roundBits = zSig0 & roundMask; 2450 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 2451 if ( ( 0x7FFE < zExp ) 2452 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 2453 ) { 2454 goto overflow; 2455 } 2456 if ( zExp <= 0 ) { 2457 if (status->flush_to_zero) { 2458 float_raise(float_flag_output_denormal, status); 2459 return packFloatx80(zSign, 0, 0); 2460 } 2461 isTiny = 2462 (status->float_detect_tininess 2463 == float_tininess_before_rounding) 2464 || ( zExp < 0 ) 2465 || ( zSig0 <= zSig0 + roundIncrement ); 2466 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 2467 zExp = 0; 2468 roundBits = zSig0 & roundMask; 2469 if (isTiny && roundBits) { 2470 float_raise(float_flag_underflow, status); 2471 } 2472 if (roundBits) { 2473 status->float_exception_flags |= float_flag_inexact; 2474 } 2475 zSig0 += roundIncrement; 2476 if ( (int64_t) zSig0 < 0 ) zExp = 1; 2477 roundIncrement = roundMask + 1; 2478 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 2479 roundMask |= roundIncrement; 2480 } 2481 zSig0 &= ~ roundMask; 2482 return packFloatx80( zSign, zExp, zSig0 ); 2483 } 2484 } 2485 if (roundBits) { 2486 status->float_exception_flags |= float_flag_inexact; 2487 } 2488 zSig0 += roundIncrement; 2489 if ( zSig0 < roundIncrement ) { 2490 ++zExp; 2491 zSig0 = LIT64( 0x8000000000000000 ); 2492 } 2493 roundIncrement = roundMask + 1; 2494 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 2495 roundMask |= roundIncrement; 2496 } 2497 zSig0 &= ~ roundMask; 2498 if ( zSig0 == 0 ) zExp = 0; 2499 return packFloatx80( zSign, zExp, zSig0 ); 2500 precision80: 2501 switch (roundingMode) { 2502 case float_round_nearest_even: 2503 case float_round_ties_away: 2504 increment = ((int64_t)zSig1 < 0); 2505 break; 2506 case float_round_to_zero: 2507 increment = 0; 2508 break; 2509 case float_round_up: 2510 increment = !zSign && zSig1; 2511 break; 2512 case float_round_down: 2513 increment = zSign && zSig1; 2514 break; 2515 default: 2516 abort(); 2517 } 2518 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 2519 if ( ( 0x7FFE < zExp ) 2520 || ( ( zExp == 0x7FFE ) 2521 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 2522 && increment 2523 ) 2524 ) { 2525 roundMask = 0; 2526 overflow: 2527 float_raise(float_flag_overflow | float_flag_inexact, status); 2528 if ( ( roundingMode == float_round_to_zero ) 2529 || ( zSign && ( roundingMode == float_round_up ) ) 2530 || ( ! zSign && ( roundingMode == float_round_down ) ) 2531 ) { 2532 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 2533 } 2534 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2535 } 2536 if ( zExp <= 0 ) { 2537 isTiny = 2538 (status->float_detect_tininess 2539 == float_tininess_before_rounding) 2540 || ( zExp < 0 ) 2541 || ! increment 2542 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 2543 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 2544 zExp = 0; 2545 if (isTiny && zSig1) { 2546 float_raise(float_flag_underflow, status); 2547 } 2548 if (zSig1) { 2549 status->float_exception_flags |= float_flag_inexact; 2550 } 2551 switch (roundingMode) { 2552 case float_round_nearest_even: 2553 case float_round_ties_away: 2554 increment = ((int64_t)zSig1 < 0); 2555 break; 2556 case float_round_to_zero: 2557 increment = 0; 2558 break; 2559 case float_round_up: 2560 increment = !zSign && zSig1; 2561 break; 2562 case float_round_down: 2563 increment = zSign && zSig1; 2564 break; 2565 default: 2566 abort(); 2567 } 2568 if ( increment ) { 2569 ++zSig0; 2570 zSig0 &= 2571 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 2572 if ( (int64_t) zSig0 < 0 ) zExp = 1; 2573 } 2574 return packFloatx80( zSign, zExp, zSig0 ); 2575 } 2576 } 2577 if (zSig1) { 2578 status->float_exception_flags |= float_flag_inexact; 2579 } 2580 if ( increment ) { 2581 ++zSig0; 2582 if ( zSig0 == 0 ) { 2583 ++zExp; 2584 zSig0 = LIT64( 0x8000000000000000 ); 2585 } 2586 else { 2587 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 2588 } 2589 } 2590 else { 2591 if ( zSig0 == 0 ) zExp = 0; 2592 } 2593 return packFloatx80( zSign, zExp, zSig0 ); 2594 2595 } 2596 2597 /*---------------------------------------------------------------------------- 2598 | Takes an abstract floating-point value having sign `zSign', exponent 2599 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 2600 | and returns the proper extended double-precision floating-point value 2601 | corresponding to the abstract input. This routine is just like 2602 | `roundAndPackFloatx80' except that the input significand does not have to be 2603 | normalized. 2604 *----------------------------------------------------------------------------*/ 2605 2606 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 2607 flag zSign, int32_t zExp, 2608 uint64_t zSig0, uint64_t zSig1, 2609 float_status *status) 2610 { 2611 int8_t shiftCount; 2612 2613 if ( zSig0 == 0 ) { 2614 zSig0 = zSig1; 2615 zSig1 = 0; 2616 zExp -= 64; 2617 } 2618 shiftCount = countLeadingZeros64( zSig0 ); 2619 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 2620 zExp -= shiftCount; 2621 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 2622 zSig0, zSig1, status); 2623 2624 } 2625 2626 /*---------------------------------------------------------------------------- 2627 | Returns the least-significant 64 fraction bits of the quadruple-precision 2628 | floating-point value `a'. 2629 *----------------------------------------------------------------------------*/ 2630 2631 static inline uint64_t extractFloat128Frac1( float128 a ) 2632 { 2633 2634 return a.low; 2635 2636 } 2637 2638 /*---------------------------------------------------------------------------- 2639 | Returns the most-significant 48 fraction bits of the quadruple-precision 2640 | floating-point value `a'. 2641 *----------------------------------------------------------------------------*/ 2642 2643 static inline uint64_t extractFloat128Frac0( float128 a ) 2644 { 2645 2646 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 2647 2648 } 2649 2650 /*---------------------------------------------------------------------------- 2651 | Returns the exponent bits of the quadruple-precision floating-point value 2652 | `a'. 2653 *----------------------------------------------------------------------------*/ 2654 2655 static inline int32_t extractFloat128Exp( float128 a ) 2656 { 2657 2658 return ( a.high>>48 ) & 0x7FFF; 2659 2660 } 2661 2662 /*---------------------------------------------------------------------------- 2663 | Returns the sign bit of the quadruple-precision floating-point value `a'. 2664 *----------------------------------------------------------------------------*/ 2665 2666 static inline flag extractFloat128Sign( float128 a ) 2667 { 2668 2669 return a.high>>63; 2670 2671 } 2672 2673 /*---------------------------------------------------------------------------- 2674 | Normalizes the subnormal quadruple-precision floating-point value 2675 | represented by the denormalized significand formed by the concatenation of 2676 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 2677 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 2678 | significand are stored at the location pointed to by `zSig0Ptr', and the 2679 | least significant 64 bits of the normalized significand are stored at the 2680 | location pointed to by `zSig1Ptr'. 2681 *----------------------------------------------------------------------------*/ 2682 2683 static void 2684 normalizeFloat128Subnormal( 2685 uint64_t aSig0, 2686 uint64_t aSig1, 2687 int32_t *zExpPtr, 2688 uint64_t *zSig0Ptr, 2689 uint64_t *zSig1Ptr 2690 ) 2691 { 2692 int8_t shiftCount; 2693 2694 if ( aSig0 == 0 ) { 2695 shiftCount = countLeadingZeros64( aSig1 ) - 15; 2696 if ( shiftCount < 0 ) { 2697 *zSig0Ptr = aSig1>>( - shiftCount ); 2698 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 2699 } 2700 else { 2701 *zSig0Ptr = aSig1<<shiftCount; 2702 *zSig1Ptr = 0; 2703 } 2704 *zExpPtr = - shiftCount - 63; 2705 } 2706 else { 2707 shiftCount = countLeadingZeros64( aSig0 ) - 15; 2708 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 2709 *zExpPtr = 1 - shiftCount; 2710 } 2711 2712 } 2713 2714 /*---------------------------------------------------------------------------- 2715 | Packs the sign `zSign', the exponent `zExp', and the significand formed 2716 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 2717 | floating-point value, returning the result. After being shifted into the 2718 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 2719 | added together to form the most significant 32 bits of the result. This 2720 | means that any integer portion of `zSig0' will be added into the exponent. 2721 | Since a properly normalized significand will have an integer portion equal 2722 | to 1, the `zExp' input should be 1 less than the desired result exponent 2723 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 2724 | significand. 2725 *----------------------------------------------------------------------------*/ 2726 2727 static inline float128 2728 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 2729 { 2730 float128 z; 2731 2732 z.low = zSig1; 2733 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 2734 return z; 2735 2736 } 2737 2738 /*---------------------------------------------------------------------------- 2739 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2740 | and extended significand formed by the concatenation of `zSig0', `zSig1', 2741 | and `zSig2', and returns the proper quadruple-precision floating-point value 2742 | corresponding to the abstract input. Ordinarily, the abstract value is 2743 | simply rounded and packed into the quadruple-precision format, with the 2744 | inexact exception raised if the abstract input cannot be represented 2745 | exactly. However, if the abstract value is too large, the overflow and 2746 | inexact exceptions are raised and an infinity or maximal finite value is 2747 | returned. If the abstract value is too small, the input value is rounded to 2748 | a subnormal number, and the underflow and inexact exceptions are raised if 2749 | the abstract input cannot be represented exactly as a subnormal quadruple- 2750 | precision floating-point number. 2751 | The input significand must be normalized or smaller. If the input 2752 | significand is not normalized, `zExp' must be 0; in that case, the result 2753 | returned is a subnormal number, and it must not require rounding. In the 2754 | usual case that the input significand is normalized, `zExp' must be 1 less 2755 | than the ``true'' floating-point exponent. The handling of underflow and 2756 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2757 *----------------------------------------------------------------------------*/ 2758 2759 static float128 roundAndPackFloat128(flag zSign, int32_t zExp, 2760 uint64_t zSig0, uint64_t zSig1, 2761 uint64_t zSig2, float_status *status) 2762 { 2763 int8_t roundingMode; 2764 flag roundNearestEven, increment, isTiny; 2765 2766 roundingMode = status->float_rounding_mode; 2767 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2768 switch (roundingMode) { 2769 case float_round_nearest_even: 2770 case float_round_ties_away: 2771 increment = ((int64_t)zSig2 < 0); 2772 break; 2773 case float_round_to_zero: 2774 increment = 0; 2775 break; 2776 case float_round_up: 2777 increment = !zSign && zSig2; 2778 break; 2779 case float_round_down: 2780 increment = zSign && zSig2; 2781 break; 2782 case float_round_to_odd: 2783 increment = !(zSig1 & 0x1) && zSig2; 2784 break; 2785 default: 2786 abort(); 2787 } 2788 if ( 0x7FFD <= (uint32_t) zExp ) { 2789 if ( ( 0x7FFD < zExp ) 2790 || ( ( zExp == 0x7FFD ) 2791 && eq128( 2792 LIT64( 0x0001FFFFFFFFFFFF ), 2793 LIT64( 0xFFFFFFFFFFFFFFFF ), 2794 zSig0, 2795 zSig1 2796 ) 2797 && increment 2798 ) 2799 ) { 2800 float_raise(float_flag_overflow | float_flag_inexact, status); 2801 if ( ( roundingMode == float_round_to_zero ) 2802 || ( zSign && ( roundingMode == float_round_up ) ) 2803 || ( ! zSign && ( roundingMode == float_round_down ) ) 2804 || (roundingMode == float_round_to_odd) 2805 ) { 2806 return 2807 packFloat128( 2808 zSign, 2809 0x7FFE, 2810 LIT64( 0x0000FFFFFFFFFFFF ), 2811 LIT64( 0xFFFFFFFFFFFFFFFF ) 2812 ); 2813 } 2814 return packFloat128( zSign, 0x7FFF, 0, 0 ); 2815 } 2816 if ( zExp < 0 ) { 2817 if (status->flush_to_zero) { 2818 float_raise(float_flag_output_denormal, status); 2819 return packFloat128(zSign, 0, 0, 0); 2820 } 2821 isTiny = 2822 (status->float_detect_tininess 2823 == float_tininess_before_rounding) 2824 || ( zExp < -1 ) 2825 || ! increment 2826 || lt128( 2827 zSig0, 2828 zSig1, 2829 LIT64( 0x0001FFFFFFFFFFFF ), 2830 LIT64( 0xFFFFFFFFFFFFFFFF ) 2831 ); 2832 shift128ExtraRightJamming( 2833 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 2834 zExp = 0; 2835 if (isTiny && zSig2) { 2836 float_raise(float_flag_underflow, status); 2837 } 2838 switch (roundingMode) { 2839 case float_round_nearest_even: 2840 case float_round_ties_away: 2841 increment = ((int64_t)zSig2 < 0); 2842 break; 2843 case float_round_to_zero: 2844 increment = 0; 2845 break; 2846 case float_round_up: 2847 increment = !zSign && zSig2; 2848 break; 2849 case float_round_down: 2850 increment = zSign && zSig2; 2851 break; 2852 case float_round_to_odd: 2853 increment = !(zSig1 & 0x1) && zSig2; 2854 break; 2855 default: 2856 abort(); 2857 } 2858 } 2859 } 2860 if (zSig2) { 2861 status->float_exception_flags |= float_flag_inexact; 2862 } 2863 if ( increment ) { 2864 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 2865 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 2866 } 2867 else { 2868 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 2869 } 2870 return packFloat128( zSign, zExp, zSig0, zSig1 ); 2871 2872 } 2873 2874 /*---------------------------------------------------------------------------- 2875 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2876 | and significand formed by the concatenation of `zSig0' and `zSig1', and 2877 | returns the proper quadruple-precision floating-point value corresponding 2878 | to the abstract input. This routine is just like `roundAndPackFloat128' 2879 | except that the input significand has fewer bits and does not have to be 2880 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 2881 | point exponent. 2882 *----------------------------------------------------------------------------*/ 2883 2884 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 2885 uint64_t zSig0, uint64_t zSig1, 2886 float_status *status) 2887 { 2888 int8_t shiftCount; 2889 uint64_t zSig2; 2890 2891 if ( zSig0 == 0 ) { 2892 zSig0 = zSig1; 2893 zSig1 = 0; 2894 zExp -= 64; 2895 } 2896 shiftCount = countLeadingZeros64( zSig0 ) - 15; 2897 if ( 0 <= shiftCount ) { 2898 zSig2 = 0; 2899 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 2900 } 2901 else { 2902 shift128ExtraRightJamming( 2903 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 2904 } 2905 zExp -= shiftCount; 2906 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 2907 2908 } 2909 2910 2911 /*---------------------------------------------------------------------------- 2912 | Returns the result of converting the 32-bit two's complement integer `a' 2913 | to the extended double-precision floating-point format. The conversion 2914 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 2915 | Arithmetic. 2916 *----------------------------------------------------------------------------*/ 2917 2918 floatx80 int32_to_floatx80(int32_t a, float_status *status) 2919 { 2920 flag zSign; 2921 uint32_t absA; 2922 int8_t shiftCount; 2923 uint64_t zSig; 2924 2925 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 2926 zSign = ( a < 0 ); 2927 absA = zSign ? - a : a; 2928 shiftCount = countLeadingZeros32( absA ) + 32; 2929 zSig = absA; 2930 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 2931 2932 } 2933 2934 /*---------------------------------------------------------------------------- 2935 | Returns the result of converting the 32-bit two's complement integer `a' to 2936 | the quadruple-precision floating-point format. The conversion is performed 2937 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2938 *----------------------------------------------------------------------------*/ 2939 2940 float128 int32_to_float128(int32_t a, float_status *status) 2941 { 2942 flag zSign; 2943 uint32_t absA; 2944 int8_t shiftCount; 2945 uint64_t zSig0; 2946 2947 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 2948 zSign = ( a < 0 ); 2949 absA = zSign ? - a : a; 2950 shiftCount = countLeadingZeros32( absA ) + 17; 2951 zSig0 = absA; 2952 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 2953 2954 } 2955 2956 /*---------------------------------------------------------------------------- 2957 | Returns the result of converting the 64-bit two's complement integer `a' 2958 | to the extended double-precision floating-point format. The conversion 2959 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 2960 | Arithmetic. 2961 *----------------------------------------------------------------------------*/ 2962 2963 floatx80 int64_to_floatx80(int64_t a, float_status *status) 2964 { 2965 flag zSign; 2966 uint64_t absA; 2967 int8_t shiftCount; 2968 2969 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 2970 zSign = ( a < 0 ); 2971 absA = zSign ? - a : a; 2972 shiftCount = countLeadingZeros64( absA ); 2973 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 2974 2975 } 2976 2977 /*---------------------------------------------------------------------------- 2978 | Returns the result of converting the 64-bit two's complement integer `a' to 2979 | the quadruple-precision floating-point format. The conversion is performed 2980 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2981 *----------------------------------------------------------------------------*/ 2982 2983 float128 int64_to_float128(int64_t a, float_status *status) 2984 { 2985 flag zSign; 2986 uint64_t absA; 2987 int8_t shiftCount; 2988 int32_t zExp; 2989 uint64_t zSig0, zSig1; 2990 2991 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 2992 zSign = ( a < 0 ); 2993 absA = zSign ? - a : a; 2994 shiftCount = countLeadingZeros64( absA ) + 49; 2995 zExp = 0x406E - shiftCount; 2996 if ( 64 <= shiftCount ) { 2997 zSig1 = 0; 2998 zSig0 = absA; 2999 shiftCount -= 64; 3000 } 3001 else { 3002 zSig1 = absA; 3003 zSig0 = 0; 3004 } 3005 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 3006 return packFloat128( zSign, zExp, zSig0, zSig1 ); 3007 3008 } 3009 3010 /*---------------------------------------------------------------------------- 3011 | Returns the result of converting the 64-bit unsigned integer `a' 3012 | to the quadruple-precision floating-point format. The conversion is performed 3013 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3014 *----------------------------------------------------------------------------*/ 3015 3016 float128 uint64_to_float128(uint64_t a, float_status *status) 3017 { 3018 if (a == 0) { 3019 return float128_zero; 3020 } 3021 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status); 3022 } 3023 3024 3025 3026 3027 /*---------------------------------------------------------------------------- 3028 | Returns the result of converting the single-precision floating-point value 3029 | `a' to the double-precision floating-point format. The conversion is 3030 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3031 | Arithmetic. 3032 *----------------------------------------------------------------------------*/ 3033 3034 float64 float32_to_float64(float32 a, float_status *status) 3035 { 3036 flag aSign; 3037 int aExp; 3038 uint32_t aSig; 3039 a = float32_squash_input_denormal(a, status); 3040 3041 aSig = extractFloat32Frac( a ); 3042 aExp = extractFloat32Exp( a ); 3043 aSign = extractFloat32Sign( a ); 3044 if ( aExp == 0xFF ) { 3045 if (aSig) { 3046 return commonNaNToFloat64(float32ToCommonNaN(a, status), status); 3047 } 3048 return packFloat64( aSign, 0x7FF, 0 ); 3049 } 3050 if ( aExp == 0 ) { 3051 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 3052 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3053 --aExp; 3054 } 3055 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 ); 3056 3057 } 3058 3059 /*---------------------------------------------------------------------------- 3060 | Returns the result of converting the single-precision floating-point value 3061 | `a' to the extended double-precision floating-point format. The conversion 3062 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3063 | Arithmetic. 3064 *----------------------------------------------------------------------------*/ 3065 3066 floatx80 float32_to_floatx80(float32 a, float_status *status) 3067 { 3068 flag aSign; 3069 int aExp; 3070 uint32_t aSig; 3071 3072 a = float32_squash_input_denormal(a, status); 3073 aSig = extractFloat32Frac( a ); 3074 aExp = extractFloat32Exp( a ); 3075 aSign = extractFloat32Sign( a ); 3076 if ( aExp == 0xFF ) { 3077 if (aSig) { 3078 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 3079 } 3080 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3081 } 3082 if ( aExp == 0 ) { 3083 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 3084 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3085 } 3086 aSig |= 0x00800000; 3087 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 3088 3089 } 3090 3091 /*---------------------------------------------------------------------------- 3092 | Returns the result of converting the single-precision floating-point value 3093 | `a' to the double-precision floating-point format. The conversion is 3094 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3095 | Arithmetic. 3096 *----------------------------------------------------------------------------*/ 3097 3098 float128 float32_to_float128(float32 a, float_status *status) 3099 { 3100 flag aSign; 3101 int aExp; 3102 uint32_t aSig; 3103 3104 a = float32_squash_input_denormal(a, status); 3105 aSig = extractFloat32Frac( a ); 3106 aExp = extractFloat32Exp( a ); 3107 aSign = extractFloat32Sign( a ); 3108 if ( aExp == 0xFF ) { 3109 if (aSig) { 3110 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 3111 } 3112 return packFloat128( aSign, 0x7FFF, 0, 0 ); 3113 } 3114 if ( aExp == 0 ) { 3115 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 3116 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3117 --aExp; 3118 } 3119 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 3120 3121 } 3122 3123 /*---------------------------------------------------------------------------- 3124 | Returns the remainder of the single-precision floating-point value `a' 3125 | with respect to the corresponding value `b'. The operation is performed 3126 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3127 *----------------------------------------------------------------------------*/ 3128 3129 float32 float32_rem(float32 a, float32 b, float_status *status) 3130 { 3131 flag aSign, zSign; 3132 int aExp, bExp, expDiff; 3133 uint32_t aSig, bSig; 3134 uint32_t q; 3135 uint64_t aSig64, bSig64, q64; 3136 uint32_t alternateASig; 3137 int32_t sigMean; 3138 a = float32_squash_input_denormal(a, status); 3139 b = float32_squash_input_denormal(b, status); 3140 3141 aSig = extractFloat32Frac( a ); 3142 aExp = extractFloat32Exp( a ); 3143 aSign = extractFloat32Sign( a ); 3144 bSig = extractFloat32Frac( b ); 3145 bExp = extractFloat32Exp( b ); 3146 if ( aExp == 0xFF ) { 3147 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 3148 return propagateFloat32NaN(a, b, status); 3149 } 3150 float_raise(float_flag_invalid, status); 3151 return float32_default_nan(status); 3152 } 3153 if ( bExp == 0xFF ) { 3154 if (bSig) { 3155 return propagateFloat32NaN(a, b, status); 3156 } 3157 return a; 3158 } 3159 if ( bExp == 0 ) { 3160 if ( bSig == 0 ) { 3161 float_raise(float_flag_invalid, status); 3162 return float32_default_nan(status); 3163 } 3164 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 3165 } 3166 if ( aExp == 0 ) { 3167 if ( aSig == 0 ) return a; 3168 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3169 } 3170 expDiff = aExp - bExp; 3171 aSig |= 0x00800000; 3172 bSig |= 0x00800000; 3173 if ( expDiff < 32 ) { 3174 aSig <<= 8; 3175 bSig <<= 8; 3176 if ( expDiff < 0 ) { 3177 if ( expDiff < -1 ) return a; 3178 aSig >>= 1; 3179 } 3180 q = ( bSig <= aSig ); 3181 if ( q ) aSig -= bSig; 3182 if ( 0 < expDiff ) { 3183 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 3184 q >>= 32 - expDiff; 3185 bSig >>= 2; 3186 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3187 } 3188 else { 3189 aSig >>= 2; 3190 bSig >>= 2; 3191 } 3192 } 3193 else { 3194 if ( bSig <= aSig ) aSig -= bSig; 3195 aSig64 = ( (uint64_t) aSig )<<40; 3196 bSig64 = ( (uint64_t) bSig )<<40; 3197 expDiff -= 64; 3198 while ( 0 < expDiff ) { 3199 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 3200 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 3201 aSig64 = - ( ( bSig * q64 )<<38 ); 3202 expDiff -= 62; 3203 } 3204 expDiff += 64; 3205 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 3206 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 3207 q = q64>>( 64 - expDiff ); 3208 bSig <<= 6; 3209 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 3210 } 3211 do { 3212 alternateASig = aSig; 3213 ++q; 3214 aSig -= bSig; 3215 } while ( 0 <= (int32_t) aSig ); 3216 sigMean = aSig + alternateASig; 3217 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3218 aSig = alternateASig; 3219 } 3220 zSign = ( (int32_t) aSig < 0 ); 3221 if ( zSign ) aSig = - aSig; 3222 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 3223 } 3224 3225 3226 /*---------------------------------------------------------------------------- 3227 | Returns the square root of the single-precision floating-point value `a'. 3228 | The operation is performed according to the IEC/IEEE Standard for Binary 3229 | Floating-Point Arithmetic. 3230 *----------------------------------------------------------------------------*/ 3231 3232 float32 float32_sqrt(float32 a, float_status *status) 3233 { 3234 flag aSign; 3235 int aExp, zExp; 3236 uint32_t aSig, zSig; 3237 uint64_t rem, term; 3238 a = float32_squash_input_denormal(a, status); 3239 3240 aSig = extractFloat32Frac( a ); 3241 aExp = extractFloat32Exp( a ); 3242 aSign = extractFloat32Sign( a ); 3243 if ( aExp == 0xFF ) { 3244 if (aSig) { 3245 return propagateFloat32NaN(a, float32_zero, status); 3246 } 3247 if ( ! aSign ) return a; 3248 float_raise(float_flag_invalid, status); 3249 return float32_default_nan(status); 3250 } 3251 if ( aSign ) { 3252 if ( ( aExp | aSig ) == 0 ) return a; 3253 float_raise(float_flag_invalid, status); 3254 return float32_default_nan(status); 3255 } 3256 if ( aExp == 0 ) { 3257 if ( aSig == 0 ) return float32_zero; 3258 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3259 } 3260 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 3261 aSig = ( aSig | 0x00800000 )<<8; 3262 zSig = estimateSqrt32( aExp, aSig ) + 2; 3263 if ( ( zSig & 0x7F ) <= 5 ) { 3264 if ( zSig < 2 ) { 3265 zSig = 0x7FFFFFFF; 3266 goto roundAndPack; 3267 } 3268 aSig >>= aExp & 1; 3269 term = ( (uint64_t) zSig ) * zSig; 3270 rem = ( ( (uint64_t) aSig )<<32 ) - term; 3271 while ( (int64_t) rem < 0 ) { 3272 --zSig; 3273 rem += ( ( (uint64_t) zSig )<<1 ) | 1; 3274 } 3275 zSig |= ( rem != 0 ); 3276 } 3277 shift32RightJamming( zSig, 1, &zSig ); 3278 roundAndPack: 3279 return roundAndPackFloat32(0, zExp, zSig, status); 3280 3281 } 3282 3283 /*---------------------------------------------------------------------------- 3284 | Returns the binary exponential of the single-precision floating-point value 3285 | `a'. The operation is performed according to the IEC/IEEE Standard for 3286 | Binary Floating-Point Arithmetic. 3287 | 3288 | Uses the following identities: 3289 | 3290 | 1. ------------------------------------------------------------------------- 3291 | x x*ln(2) 3292 | 2 = e 3293 | 3294 | 2. ------------------------------------------------------------------------- 3295 | 2 3 4 5 n 3296 | x x x x x x x 3297 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 3298 | 1! 2! 3! 4! 5! n! 3299 *----------------------------------------------------------------------------*/ 3300 3301 static const float64 float32_exp2_coefficients[15] = 3302 { 3303 const_float64( 0x3ff0000000000000ll ), /* 1 */ 3304 const_float64( 0x3fe0000000000000ll ), /* 2 */ 3305 const_float64( 0x3fc5555555555555ll ), /* 3 */ 3306 const_float64( 0x3fa5555555555555ll ), /* 4 */ 3307 const_float64( 0x3f81111111111111ll ), /* 5 */ 3308 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 3309 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 3310 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 3311 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 3312 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 3313 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 3314 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 3315 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 3316 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 3317 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 3318 }; 3319 3320 float32 float32_exp2(float32 a, float_status *status) 3321 { 3322 flag aSign; 3323 int aExp; 3324 uint32_t aSig; 3325 float64 r, x, xn; 3326 int i; 3327 a = float32_squash_input_denormal(a, status); 3328 3329 aSig = extractFloat32Frac( a ); 3330 aExp = extractFloat32Exp( a ); 3331 aSign = extractFloat32Sign( a ); 3332 3333 if ( aExp == 0xFF) { 3334 if (aSig) { 3335 return propagateFloat32NaN(a, float32_zero, status); 3336 } 3337 return (aSign) ? float32_zero : a; 3338 } 3339 if (aExp == 0) { 3340 if (aSig == 0) return float32_one; 3341 } 3342 3343 float_raise(float_flag_inexact, status); 3344 3345 /* ******************************* */ 3346 /* using float64 for approximation */ 3347 /* ******************************* */ 3348 x = float32_to_float64(a, status); 3349 x = float64_mul(x, float64_ln2, status); 3350 3351 xn = x; 3352 r = float64_one; 3353 for (i = 0 ; i < 15 ; i++) { 3354 float64 f; 3355 3356 f = float64_mul(xn, float32_exp2_coefficients[i], status); 3357 r = float64_add(r, f, status); 3358 3359 xn = float64_mul(xn, x, status); 3360 } 3361 3362 return float64_to_float32(r, status); 3363 } 3364 3365 /*---------------------------------------------------------------------------- 3366 | Returns the binary log of the single-precision floating-point value `a'. 3367 | The operation is performed according to the IEC/IEEE Standard for Binary 3368 | Floating-Point Arithmetic. 3369 *----------------------------------------------------------------------------*/ 3370 float32 float32_log2(float32 a, float_status *status) 3371 { 3372 flag aSign, zSign; 3373 int aExp; 3374 uint32_t aSig, zSig, i; 3375 3376 a = float32_squash_input_denormal(a, status); 3377 aSig = extractFloat32Frac( a ); 3378 aExp = extractFloat32Exp( a ); 3379 aSign = extractFloat32Sign( a ); 3380 3381 if ( aExp == 0 ) { 3382 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 3383 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3384 } 3385 if ( aSign ) { 3386 float_raise(float_flag_invalid, status); 3387 return float32_default_nan(status); 3388 } 3389 if ( aExp == 0xFF ) { 3390 if (aSig) { 3391 return propagateFloat32NaN(a, float32_zero, status); 3392 } 3393 return a; 3394 } 3395 3396 aExp -= 0x7F; 3397 aSig |= 0x00800000; 3398 zSign = aExp < 0; 3399 zSig = aExp << 23; 3400 3401 for (i = 1 << 22; i > 0; i >>= 1) { 3402 aSig = ( (uint64_t)aSig * aSig ) >> 23; 3403 if ( aSig & 0x01000000 ) { 3404 aSig >>= 1; 3405 zSig |= i; 3406 } 3407 } 3408 3409 if ( zSign ) 3410 zSig = -zSig; 3411 3412 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 3413 } 3414 3415 /*---------------------------------------------------------------------------- 3416 | Returns 1 if the single-precision floating-point value `a' is equal to 3417 | the corresponding value `b', and 0 otherwise. The invalid exception is 3418 | raised if either operand is a NaN. Otherwise, the comparison is performed 3419 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3420 *----------------------------------------------------------------------------*/ 3421 3422 int float32_eq(float32 a, float32 b, float_status *status) 3423 { 3424 uint32_t av, bv; 3425 a = float32_squash_input_denormal(a, status); 3426 b = float32_squash_input_denormal(b, status); 3427 3428 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3429 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3430 ) { 3431 float_raise(float_flag_invalid, status); 3432 return 0; 3433 } 3434 av = float32_val(a); 3435 bv = float32_val(b); 3436 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3437 } 3438 3439 /*---------------------------------------------------------------------------- 3440 | Returns 1 if the single-precision floating-point value `a' is less than 3441 | or equal to the corresponding value `b', and 0 otherwise. The invalid 3442 | exception is raised if either operand is a NaN. The comparison is performed 3443 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3444 *----------------------------------------------------------------------------*/ 3445 3446 int float32_le(float32 a, float32 b, float_status *status) 3447 { 3448 flag aSign, bSign; 3449 uint32_t av, bv; 3450 a = float32_squash_input_denormal(a, status); 3451 b = float32_squash_input_denormal(b, status); 3452 3453 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3454 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3455 ) { 3456 float_raise(float_flag_invalid, status); 3457 return 0; 3458 } 3459 aSign = extractFloat32Sign( a ); 3460 bSign = extractFloat32Sign( b ); 3461 av = float32_val(a); 3462 bv = float32_val(b); 3463 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3464 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3465 3466 } 3467 3468 /*---------------------------------------------------------------------------- 3469 | Returns 1 if the single-precision floating-point value `a' is less than 3470 | the corresponding value `b', and 0 otherwise. The invalid exception is 3471 | raised if either operand is a NaN. The comparison is performed according 3472 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3473 *----------------------------------------------------------------------------*/ 3474 3475 int float32_lt(float32 a, float32 b, float_status *status) 3476 { 3477 flag aSign, bSign; 3478 uint32_t av, bv; 3479 a = float32_squash_input_denormal(a, status); 3480 b = float32_squash_input_denormal(b, status); 3481 3482 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3483 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3484 ) { 3485 float_raise(float_flag_invalid, status); 3486 return 0; 3487 } 3488 aSign = extractFloat32Sign( a ); 3489 bSign = extractFloat32Sign( b ); 3490 av = float32_val(a); 3491 bv = float32_val(b); 3492 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3493 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3494 3495 } 3496 3497 /*---------------------------------------------------------------------------- 3498 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3499 | be compared, and 0 otherwise. The invalid exception is raised if either 3500 | operand is a NaN. The comparison is performed according to the IEC/IEEE 3501 | Standard for Binary Floating-Point Arithmetic. 3502 *----------------------------------------------------------------------------*/ 3503 3504 int float32_unordered(float32 a, float32 b, float_status *status) 3505 { 3506 a = float32_squash_input_denormal(a, status); 3507 b = float32_squash_input_denormal(b, status); 3508 3509 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3510 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3511 ) { 3512 float_raise(float_flag_invalid, status); 3513 return 1; 3514 } 3515 return 0; 3516 } 3517 3518 /*---------------------------------------------------------------------------- 3519 | Returns 1 if the single-precision floating-point value `a' is equal to 3520 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3521 | exception. The comparison is performed according to the IEC/IEEE Standard 3522 | for Binary Floating-Point Arithmetic. 3523 *----------------------------------------------------------------------------*/ 3524 3525 int float32_eq_quiet(float32 a, float32 b, float_status *status) 3526 { 3527 a = float32_squash_input_denormal(a, status); 3528 b = float32_squash_input_denormal(b, status); 3529 3530 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3531 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3532 ) { 3533 if (float32_is_signaling_nan(a, status) 3534 || float32_is_signaling_nan(b, status)) { 3535 float_raise(float_flag_invalid, status); 3536 } 3537 return 0; 3538 } 3539 return ( float32_val(a) == float32_val(b) ) || 3540 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 3541 } 3542 3543 /*---------------------------------------------------------------------------- 3544 | Returns 1 if the single-precision floating-point value `a' is less than or 3545 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3546 | cause an exception. Otherwise, the comparison is performed according to the 3547 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3548 *----------------------------------------------------------------------------*/ 3549 3550 int float32_le_quiet(float32 a, float32 b, float_status *status) 3551 { 3552 flag aSign, bSign; 3553 uint32_t av, bv; 3554 a = float32_squash_input_denormal(a, status); 3555 b = float32_squash_input_denormal(b, status); 3556 3557 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3558 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3559 ) { 3560 if (float32_is_signaling_nan(a, status) 3561 || float32_is_signaling_nan(b, status)) { 3562 float_raise(float_flag_invalid, status); 3563 } 3564 return 0; 3565 } 3566 aSign = extractFloat32Sign( a ); 3567 bSign = extractFloat32Sign( b ); 3568 av = float32_val(a); 3569 bv = float32_val(b); 3570 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3571 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3572 3573 } 3574 3575 /*---------------------------------------------------------------------------- 3576 | Returns 1 if the single-precision floating-point value `a' is less than 3577 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3578 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 3579 | Standard for Binary Floating-Point Arithmetic. 3580 *----------------------------------------------------------------------------*/ 3581 3582 int float32_lt_quiet(float32 a, float32 b, float_status *status) 3583 { 3584 flag aSign, bSign; 3585 uint32_t av, bv; 3586 a = float32_squash_input_denormal(a, status); 3587 b = float32_squash_input_denormal(b, status); 3588 3589 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3590 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3591 ) { 3592 if (float32_is_signaling_nan(a, status) 3593 || float32_is_signaling_nan(b, status)) { 3594 float_raise(float_flag_invalid, status); 3595 } 3596 return 0; 3597 } 3598 aSign = extractFloat32Sign( a ); 3599 bSign = extractFloat32Sign( b ); 3600 av = float32_val(a); 3601 bv = float32_val(b); 3602 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3603 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3604 3605 } 3606 3607 /*---------------------------------------------------------------------------- 3608 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3609 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 3610 | comparison is performed according to the IEC/IEEE Standard for Binary 3611 | Floating-Point Arithmetic. 3612 *----------------------------------------------------------------------------*/ 3613 3614 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 3615 { 3616 a = float32_squash_input_denormal(a, status); 3617 b = float32_squash_input_denormal(b, status); 3618 3619 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3620 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3621 ) { 3622 if (float32_is_signaling_nan(a, status) 3623 || float32_is_signaling_nan(b, status)) { 3624 float_raise(float_flag_invalid, status); 3625 } 3626 return 1; 3627 } 3628 return 0; 3629 } 3630 3631 3632 /*---------------------------------------------------------------------------- 3633 | Returns the result of converting the double-precision floating-point value 3634 | `a' to the single-precision floating-point format. The conversion is 3635 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3636 | Arithmetic. 3637 *----------------------------------------------------------------------------*/ 3638 3639 float32 float64_to_float32(float64 a, float_status *status) 3640 { 3641 flag aSign; 3642 int aExp; 3643 uint64_t aSig; 3644 uint32_t zSig; 3645 a = float64_squash_input_denormal(a, status); 3646 3647 aSig = extractFloat64Frac( a ); 3648 aExp = extractFloat64Exp( a ); 3649 aSign = extractFloat64Sign( a ); 3650 if ( aExp == 0x7FF ) { 3651 if (aSig) { 3652 return commonNaNToFloat32(float64ToCommonNaN(a, status), status); 3653 } 3654 return packFloat32( aSign, 0xFF, 0 ); 3655 } 3656 shift64RightJamming( aSig, 22, &aSig ); 3657 zSig = aSig; 3658 if ( aExp || zSig ) { 3659 zSig |= 0x40000000; 3660 aExp -= 0x381; 3661 } 3662 return roundAndPackFloat32(aSign, aExp, zSig, status); 3663 3664 } 3665 3666 3667 /*---------------------------------------------------------------------------- 3668 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3669 | half-precision floating-point value, returning the result. After being 3670 | shifted into the proper positions, the three fields are simply added 3671 | together to form the result. This means that any integer portion of `zSig' 3672 | will be added into the exponent. Since a properly normalized significand 3673 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3674 | than the desired result exponent whenever `zSig' is a complete, normalized 3675 | significand. 3676 *----------------------------------------------------------------------------*/ 3677 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig) 3678 { 3679 return make_float16( 3680 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig); 3681 } 3682 3683 /*---------------------------------------------------------------------------- 3684 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3685 | and significand `zSig', and returns the proper half-precision floating- 3686 | point value corresponding to the abstract input. Ordinarily, the abstract 3687 | value is simply rounded and packed into the half-precision format, with 3688 | the inexact exception raised if the abstract input cannot be represented 3689 | exactly. However, if the abstract value is too large, the overflow and 3690 | inexact exceptions are raised and an infinity or maximal finite value is 3691 | returned. If the abstract value is too small, the input value is rounded to 3692 | a subnormal number, and the underflow and inexact exceptions are raised if 3693 | the abstract input cannot be represented exactly as a subnormal half- 3694 | precision floating-point number. 3695 | The `ieee' flag indicates whether to use IEEE standard half precision, or 3696 | ARM-style "alternative representation", which omits the NaN and Inf 3697 | encodings in order to raise the maximum representable exponent by one. 3698 | The input significand `zSig' has its binary point between bits 22 3699 | and 23, which is 13 bits to the left of the usual location. This shifted 3700 | significand must be normalized or smaller. If `zSig' is not normalized, 3701 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3702 | and it must not require rounding. In the usual case that `zSig' is 3703 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3704 | Note the slightly odd position of the binary point in zSig compared with the 3705 | other roundAndPackFloat functions. This should probably be fixed if we 3706 | need to implement more float16 routines than just conversion. 3707 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3708 | Binary Floating-Point Arithmetic. 3709 *----------------------------------------------------------------------------*/ 3710 3711 static float16 roundAndPackFloat16(flag zSign, int zExp, 3712 uint32_t zSig, flag ieee, 3713 float_status *status) 3714 { 3715 int maxexp = ieee ? 29 : 30; 3716 uint32_t mask; 3717 uint32_t increment; 3718 bool rounding_bumps_exp; 3719 bool is_tiny = false; 3720 3721 /* Calculate the mask of bits of the mantissa which are not 3722 * representable in half-precision and will be lost. 3723 */ 3724 if (zExp < 1) { 3725 /* Will be denormal in halfprec */ 3726 mask = 0x00ffffff; 3727 if (zExp >= -11) { 3728 mask >>= 11 + zExp; 3729 } 3730 } else { 3731 /* Normal number in halfprec */ 3732 mask = 0x00001fff; 3733 } 3734 3735 switch (status->float_rounding_mode) { 3736 case float_round_nearest_even: 3737 increment = (mask + 1) >> 1; 3738 if ((zSig & mask) == increment) { 3739 increment = zSig & (increment << 1); 3740 } 3741 break; 3742 case float_round_ties_away: 3743 increment = (mask + 1) >> 1; 3744 break; 3745 case float_round_up: 3746 increment = zSign ? 0 : mask; 3747 break; 3748 case float_round_down: 3749 increment = zSign ? mask : 0; 3750 break; 3751 default: /* round_to_zero */ 3752 increment = 0; 3753 break; 3754 } 3755 3756 rounding_bumps_exp = (zSig + increment >= 0x01000000); 3757 3758 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) { 3759 if (ieee) { 3760 float_raise(float_flag_overflow | float_flag_inexact, status); 3761 return packFloat16(zSign, 0x1f, 0); 3762 } else { 3763 float_raise(float_flag_invalid, status); 3764 return packFloat16(zSign, 0x1f, 0x3ff); 3765 } 3766 } 3767 3768 if (zExp < 0) { 3769 /* Note that flush-to-zero does not affect half-precision results */ 3770 is_tiny = 3771 (status->float_detect_tininess == float_tininess_before_rounding) 3772 || (zExp < -1) 3773 || (!rounding_bumps_exp); 3774 } 3775 if (zSig & mask) { 3776 float_raise(float_flag_inexact, status); 3777 if (is_tiny) { 3778 float_raise(float_flag_underflow, status); 3779 } 3780 } 3781 3782 zSig += increment; 3783 if (rounding_bumps_exp) { 3784 zSig >>= 1; 3785 zExp++; 3786 } 3787 3788 if (zExp < -10) { 3789 return packFloat16(zSign, 0, 0); 3790 } 3791 if (zExp < 0) { 3792 zSig >>= -zExp; 3793 zExp = 0; 3794 } 3795 return packFloat16(zSign, zExp, zSig >> 13); 3796 } 3797 3798 /*---------------------------------------------------------------------------- 3799 | If `a' is denormal and we are in flush-to-zero mode then set the 3800 | input-denormal exception and return zero. Otherwise just return the value. 3801 *----------------------------------------------------------------------------*/ 3802 float16 float16_squash_input_denormal(float16 a, float_status *status) 3803 { 3804 if (status->flush_inputs_to_zero) { 3805 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) { 3806 float_raise(float_flag_input_denormal, status); 3807 return make_float16(float16_val(a) & 0x8000); 3808 } 3809 } 3810 return a; 3811 } 3812 3813 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr, 3814 uint32_t *zSigPtr) 3815 { 3816 int8_t shiftCount = countLeadingZeros32(aSig) - 21; 3817 *zSigPtr = aSig << shiftCount; 3818 *zExpPtr = 1 - shiftCount; 3819 } 3820 3821 /* Half precision floats come in two formats: standard IEEE and "ARM" format. 3822 The latter gains extra exponent range by omitting the NaN/Inf encodings. */ 3823 3824 float32 float16_to_float32(float16 a, flag ieee, float_status *status) 3825 { 3826 flag aSign; 3827 int aExp; 3828 uint32_t aSig; 3829 3830 aSign = extractFloat16Sign(a); 3831 aExp = extractFloat16Exp(a); 3832 aSig = extractFloat16Frac(a); 3833 3834 if (aExp == 0x1f && ieee) { 3835 if (aSig) { 3836 return commonNaNToFloat32(float16ToCommonNaN(a, status), status); 3837 } 3838 return packFloat32(aSign, 0xff, 0); 3839 } 3840 if (aExp == 0) { 3841 if (aSig == 0) { 3842 return packFloat32(aSign, 0, 0); 3843 } 3844 3845 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3846 aExp--; 3847 } 3848 return packFloat32( aSign, aExp + 0x70, aSig << 13); 3849 } 3850 3851 float16 float32_to_float16(float32 a, flag ieee, float_status *status) 3852 { 3853 flag aSign; 3854 int aExp; 3855 uint32_t aSig; 3856 3857 a = float32_squash_input_denormal(a, status); 3858 3859 aSig = extractFloat32Frac( a ); 3860 aExp = extractFloat32Exp( a ); 3861 aSign = extractFloat32Sign( a ); 3862 if ( aExp == 0xFF ) { 3863 if (aSig) { 3864 /* Input is a NaN */ 3865 if (!ieee) { 3866 float_raise(float_flag_invalid, status); 3867 return packFloat16(aSign, 0, 0); 3868 } 3869 return commonNaNToFloat16( 3870 float32ToCommonNaN(a, status), status); 3871 } 3872 /* Infinity */ 3873 if (!ieee) { 3874 float_raise(float_flag_invalid, status); 3875 return packFloat16(aSign, 0x1f, 0x3ff); 3876 } 3877 return packFloat16(aSign, 0x1f, 0); 3878 } 3879 if (aExp == 0 && aSig == 0) { 3880 return packFloat16(aSign, 0, 0); 3881 } 3882 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 3883 * even if the input is denormal; however this is harmless because 3884 * the largest possible single-precision denormal is still smaller 3885 * than the smallest representable half-precision denormal, and so we 3886 * will end up ignoring aSig and returning via the "always return zero" 3887 * codepath. 3888 */ 3889 aSig |= 0x00800000; 3890 aExp -= 0x71; 3891 3892 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status); 3893 } 3894 3895 float64 float16_to_float64(float16 a, flag ieee, float_status *status) 3896 { 3897 flag aSign; 3898 int aExp; 3899 uint32_t aSig; 3900 3901 aSign = extractFloat16Sign(a); 3902 aExp = extractFloat16Exp(a); 3903 aSig = extractFloat16Frac(a); 3904 3905 if (aExp == 0x1f && ieee) { 3906 if (aSig) { 3907 return commonNaNToFloat64( 3908 float16ToCommonNaN(a, status), status); 3909 } 3910 return packFloat64(aSign, 0x7ff, 0); 3911 } 3912 if (aExp == 0) { 3913 if (aSig == 0) { 3914 return packFloat64(aSign, 0, 0); 3915 } 3916 3917 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3918 aExp--; 3919 } 3920 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42); 3921 } 3922 3923 float16 float64_to_float16(float64 a, flag ieee, float_status *status) 3924 { 3925 flag aSign; 3926 int aExp; 3927 uint64_t aSig; 3928 uint32_t zSig; 3929 3930 a = float64_squash_input_denormal(a, status); 3931 3932 aSig = extractFloat64Frac(a); 3933 aExp = extractFloat64Exp(a); 3934 aSign = extractFloat64Sign(a); 3935 if (aExp == 0x7FF) { 3936 if (aSig) { 3937 /* Input is a NaN */ 3938 if (!ieee) { 3939 float_raise(float_flag_invalid, status); 3940 return packFloat16(aSign, 0, 0); 3941 } 3942 return commonNaNToFloat16( 3943 float64ToCommonNaN(a, status), status); 3944 } 3945 /* Infinity */ 3946 if (!ieee) { 3947 float_raise(float_flag_invalid, status); 3948 return packFloat16(aSign, 0x1f, 0x3ff); 3949 } 3950 return packFloat16(aSign, 0x1f, 0); 3951 } 3952 shift64RightJamming(aSig, 29, &aSig); 3953 zSig = aSig; 3954 if (aExp == 0 && zSig == 0) { 3955 return packFloat16(aSign, 0, 0); 3956 } 3957 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 3958 * even if the input is denormal; however this is harmless because 3959 * the largest possible single-precision denormal is still smaller 3960 * than the smallest representable half-precision denormal, and so we 3961 * will end up ignoring aSig and returning via the "always return zero" 3962 * codepath. 3963 */ 3964 zSig |= 0x00800000; 3965 aExp -= 0x3F1; 3966 3967 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status); 3968 } 3969 3970 /*---------------------------------------------------------------------------- 3971 | Returns the result of converting the double-precision floating-point value 3972 | `a' to the extended double-precision floating-point format. The conversion 3973 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3974 | Arithmetic. 3975 *----------------------------------------------------------------------------*/ 3976 3977 floatx80 float64_to_floatx80(float64 a, float_status *status) 3978 { 3979 flag aSign; 3980 int aExp; 3981 uint64_t aSig; 3982 3983 a = float64_squash_input_denormal(a, status); 3984 aSig = extractFloat64Frac( a ); 3985 aExp = extractFloat64Exp( a ); 3986 aSign = extractFloat64Sign( a ); 3987 if ( aExp == 0x7FF ) { 3988 if (aSig) { 3989 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 3990 } 3991 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3992 } 3993 if ( aExp == 0 ) { 3994 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 3995 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3996 } 3997 return 3998 packFloatx80( 3999 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 4000 4001 } 4002 4003 /*---------------------------------------------------------------------------- 4004 | Returns the result of converting the double-precision floating-point value 4005 | `a' to the quadruple-precision floating-point format. The conversion is 4006 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4007 | Arithmetic. 4008 *----------------------------------------------------------------------------*/ 4009 4010 float128 float64_to_float128(float64 a, float_status *status) 4011 { 4012 flag aSign; 4013 int aExp; 4014 uint64_t aSig, zSig0, zSig1; 4015 4016 a = float64_squash_input_denormal(a, status); 4017 aSig = extractFloat64Frac( a ); 4018 aExp = extractFloat64Exp( a ); 4019 aSign = extractFloat64Sign( a ); 4020 if ( aExp == 0x7FF ) { 4021 if (aSig) { 4022 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 4023 } 4024 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4025 } 4026 if ( aExp == 0 ) { 4027 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4028 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4029 --aExp; 4030 } 4031 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 4032 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 4033 4034 } 4035 4036 4037 /*---------------------------------------------------------------------------- 4038 | Returns the remainder of the double-precision floating-point value `a' 4039 | with respect to the corresponding value `b'. The operation is performed 4040 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4041 *----------------------------------------------------------------------------*/ 4042 4043 float64 float64_rem(float64 a, float64 b, float_status *status) 4044 { 4045 flag aSign, zSign; 4046 int aExp, bExp, expDiff; 4047 uint64_t aSig, bSig; 4048 uint64_t q, alternateASig; 4049 int64_t sigMean; 4050 4051 a = float64_squash_input_denormal(a, status); 4052 b = float64_squash_input_denormal(b, status); 4053 aSig = extractFloat64Frac( a ); 4054 aExp = extractFloat64Exp( a ); 4055 aSign = extractFloat64Sign( a ); 4056 bSig = extractFloat64Frac( b ); 4057 bExp = extractFloat64Exp( b ); 4058 if ( aExp == 0x7FF ) { 4059 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4060 return propagateFloat64NaN(a, b, status); 4061 } 4062 float_raise(float_flag_invalid, status); 4063 return float64_default_nan(status); 4064 } 4065 if ( bExp == 0x7FF ) { 4066 if (bSig) { 4067 return propagateFloat64NaN(a, b, status); 4068 } 4069 return a; 4070 } 4071 if ( bExp == 0 ) { 4072 if ( bSig == 0 ) { 4073 float_raise(float_flag_invalid, status); 4074 return float64_default_nan(status); 4075 } 4076 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4077 } 4078 if ( aExp == 0 ) { 4079 if ( aSig == 0 ) return a; 4080 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4081 } 4082 expDiff = aExp - bExp; 4083 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 4084 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4085 if ( expDiff < 0 ) { 4086 if ( expDiff < -1 ) return a; 4087 aSig >>= 1; 4088 } 4089 q = ( bSig <= aSig ); 4090 if ( q ) aSig -= bSig; 4091 expDiff -= 64; 4092 while ( 0 < expDiff ) { 4093 q = estimateDiv128To64( aSig, 0, bSig ); 4094 q = ( 2 < q ) ? q - 2 : 0; 4095 aSig = - ( ( bSig>>2 ) * q ); 4096 expDiff -= 62; 4097 } 4098 expDiff += 64; 4099 if ( 0 < expDiff ) { 4100 q = estimateDiv128To64( aSig, 0, bSig ); 4101 q = ( 2 < q ) ? q - 2 : 0; 4102 q >>= 64 - expDiff; 4103 bSig >>= 2; 4104 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4105 } 4106 else { 4107 aSig >>= 2; 4108 bSig >>= 2; 4109 } 4110 do { 4111 alternateASig = aSig; 4112 ++q; 4113 aSig -= bSig; 4114 } while ( 0 <= (int64_t) aSig ); 4115 sigMean = aSig + alternateASig; 4116 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4117 aSig = alternateASig; 4118 } 4119 zSign = ( (int64_t) aSig < 0 ); 4120 if ( zSign ) aSig = - aSig; 4121 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 4122 4123 } 4124 4125 4126 /*---------------------------------------------------------------------------- 4127 | Returns the square root of the double-precision floating-point value `a'. 4128 | The operation is performed according to the IEC/IEEE Standard for Binary 4129 | Floating-Point Arithmetic. 4130 *----------------------------------------------------------------------------*/ 4131 4132 float64 float64_sqrt(float64 a, float_status *status) 4133 { 4134 flag aSign; 4135 int aExp, zExp; 4136 uint64_t aSig, zSig, doubleZSig; 4137 uint64_t rem0, rem1, term0, term1; 4138 a = float64_squash_input_denormal(a, status); 4139 4140 aSig = extractFloat64Frac( a ); 4141 aExp = extractFloat64Exp( a ); 4142 aSign = extractFloat64Sign( a ); 4143 if ( aExp == 0x7FF ) { 4144 if (aSig) { 4145 return propagateFloat64NaN(a, a, status); 4146 } 4147 if ( ! aSign ) return a; 4148 float_raise(float_flag_invalid, status); 4149 return float64_default_nan(status); 4150 } 4151 if ( aSign ) { 4152 if ( ( aExp | aSig ) == 0 ) return a; 4153 float_raise(float_flag_invalid, status); 4154 return float64_default_nan(status); 4155 } 4156 if ( aExp == 0 ) { 4157 if ( aSig == 0 ) return float64_zero; 4158 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4159 } 4160 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 4161 aSig |= LIT64( 0x0010000000000000 ); 4162 zSig = estimateSqrt32( aExp, aSig>>21 ); 4163 aSig <<= 9 - ( aExp & 1 ); 4164 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 4165 if ( ( zSig & 0x1FF ) <= 5 ) { 4166 doubleZSig = zSig<<1; 4167 mul64To128( zSig, zSig, &term0, &term1 ); 4168 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 4169 while ( (int64_t) rem0 < 0 ) { 4170 --zSig; 4171 doubleZSig -= 2; 4172 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 4173 } 4174 zSig |= ( ( rem0 | rem1 ) != 0 ); 4175 } 4176 return roundAndPackFloat64(0, zExp, zSig, status); 4177 4178 } 4179 4180 /*---------------------------------------------------------------------------- 4181 | Returns the binary log of the double-precision floating-point value `a'. 4182 | The operation is performed according to the IEC/IEEE Standard for Binary 4183 | Floating-Point Arithmetic. 4184 *----------------------------------------------------------------------------*/ 4185 float64 float64_log2(float64 a, float_status *status) 4186 { 4187 flag aSign, zSign; 4188 int aExp; 4189 uint64_t aSig, aSig0, aSig1, zSig, i; 4190 a = float64_squash_input_denormal(a, status); 4191 4192 aSig = extractFloat64Frac( a ); 4193 aExp = extractFloat64Exp( a ); 4194 aSign = extractFloat64Sign( a ); 4195 4196 if ( aExp == 0 ) { 4197 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4198 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4199 } 4200 if ( aSign ) { 4201 float_raise(float_flag_invalid, status); 4202 return float64_default_nan(status); 4203 } 4204 if ( aExp == 0x7FF ) { 4205 if (aSig) { 4206 return propagateFloat64NaN(a, float64_zero, status); 4207 } 4208 return a; 4209 } 4210 4211 aExp -= 0x3FF; 4212 aSig |= LIT64( 0x0010000000000000 ); 4213 zSign = aExp < 0; 4214 zSig = (uint64_t)aExp << 52; 4215 for (i = 1LL << 51; i > 0; i >>= 1) { 4216 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4217 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4218 if ( aSig & LIT64( 0x0020000000000000 ) ) { 4219 aSig >>= 1; 4220 zSig |= i; 4221 } 4222 } 4223 4224 if ( zSign ) 4225 zSig = -zSig; 4226 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 4227 } 4228 4229 /*---------------------------------------------------------------------------- 4230 | Returns 1 if the double-precision floating-point value `a' is equal to the 4231 | corresponding value `b', and 0 otherwise. The invalid exception is raised 4232 | if either operand is a NaN. Otherwise, the comparison is performed 4233 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4234 *----------------------------------------------------------------------------*/ 4235 4236 int float64_eq(float64 a, float64 b, float_status *status) 4237 { 4238 uint64_t av, bv; 4239 a = float64_squash_input_denormal(a, status); 4240 b = float64_squash_input_denormal(b, status); 4241 4242 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4243 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4244 ) { 4245 float_raise(float_flag_invalid, status); 4246 return 0; 4247 } 4248 av = float64_val(a); 4249 bv = float64_val(b); 4250 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4251 4252 } 4253 4254 /*---------------------------------------------------------------------------- 4255 | Returns 1 if the double-precision floating-point value `a' is less than or 4256 | equal to the corresponding value `b', and 0 otherwise. The invalid 4257 | exception is raised if either operand is a NaN. The comparison is performed 4258 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4259 *----------------------------------------------------------------------------*/ 4260 4261 int float64_le(float64 a, float64 b, float_status *status) 4262 { 4263 flag aSign, bSign; 4264 uint64_t av, bv; 4265 a = float64_squash_input_denormal(a, status); 4266 b = float64_squash_input_denormal(b, status); 4267 4268 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4269 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4270 ) { 4271 float_raise(float_flag_invalid, status); 4272 return 0; 4273 } 4274 aSign = extractFloat64Sign( a ); 4275 bSign = extractFloat64Sign( b ); 4276 av = float64_val(a); 4277 bv = float64_val(b); 4278 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4279 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4280 4281 } 4282 4283 /*---------------------------------------------------------------------------- 4284 | Returns 1 if the double-precision floating-point value `a' is less than 4285 | the corresponding value `b', and 0 otherwise. The invalid exception is 4286 | raised if either operand is a NaN. The comparison is performed according 4287 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4288 *----------------------------------------------------------------------------*/ 4289 4290 int float64_lt(float64 a, float64 b, float_status *status) 4291 { 4292 flag aSign, bSign; 4293 uint64_t av, bv; 4294 4295 a = float64_squash_input_denormal(a, status); 4296 b = float64_squash_input_denormal(b, status); 4297 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4298 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4299 ) { 4300 float_raise(float_flag_invalid, status); 4301 return 0; 4302 } 4303 aSign = extractFloat64Sign( a ); 4304 bSign = extractFloat64Sign( b ); 4305 av = float64_val(a); 4306 bv = float64_val(b); 4307 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4308 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4309 4310 } 4311 4312 /*---------------------------------------------------------------------------- 4313 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4314 | be compared, and 0 otherwise. The invalid exception is raised if either 4315 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4316 | Standard for Binary Floating-Point Arithmetic. 4317 *----------------------------------------------------------------------------*/ 4318 4319 int float64_unordered(float64 a, float64 b, float_status *status) 4320 { 4321 a = float64_squash_input_denormal(a, status); 4322 b = float64_squash_input_denormal(b, status); 4323 4324 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4325 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4326 ) { 4327 float_raise(float_flag_invalid, status); 4328 return 1; 4329 } 4330 return 0; 4331 } 4332 4333 /*---------------------------------------------------------------------------- 4334 | Returns 1 if the double-precision floating-point value `a' is equal to the 4335 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4336 | exception.The comparison is performed according to the IEC/IEEE Standard 4337 | for Binary Floating-Point Arithmetic. 4338 *----------------------------------------------------------------------------*/ 4339 4340 int float64_eq_quiet(float64 a, float64 b, float_status *status) 4341 { 4342 uint64_t av, bv; 4343 a = float64_squash_input_denormal(a, status); 4344 b = float64_squash_input_denormal(b, status); 4345 4346 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4347 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4348 ) { 4349 if (float64_is_signaling_nan(a, status) 4350 || float64_is_signaling_nan(b, status)) { 4351 float_raise(float_flag_invalid, status); 4352 } 4353 return 0; 4354 } 4355 av = float64_val(a); 4356 bv = float64_val(b); 4357 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4358 4359 } 4360 4361 /*---------------------------------------------------------------------------- 4362 | Returns 1 if the double-precision floating-point value `a' is less than or 4363 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4364 | cause an exception. Otherwise, the comparison is performed according to the 4365 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4366 *----------------------------------------------------------------------------*/ 4367 4368 int float64_le_quiet(float64 a, float64 b, float_status *status) 4369 { 4370 flag aSign, bSign; 4371 uint64_t av, bv; 4372 a = float64_squash_input_denormal(a, status); 4373 b = float64_squash_input_denormal(b, status); 4374 4375 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4376 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4377 ) { 4378 if (float64_is_signaling_nan(a, status) 4379 || float64_is_signaling_nan(b, status)) { 4380 float_raise(float_flag_invalid, status); 4381 } 4382 return 0; 4383 } 4384 aSign = extractFloat64Sign( a ); 4385 bSign = extractFloat64Sign( b ); 4386 av = float64_val(a); 4387 bv = float64_val(b); 4388 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4389 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4390 4391 } 4392 4393 /*---------------------------------------------------------------------------- 4394 | Returns 1 if the double-precision floating-point value `a' is less than 4395 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4396 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4397 | Standard for Binary Floating-Point Arithmetic. 4398 *----------------------------------------------------------------------------*/ 4399 4400 int float64_lt_quiet(float64 a, float64 b, float_status *status) 4401 { 4402 flag aSign, bSign; 4403 uint64_t av, bv; 4404 a = float64_squash_input_denormal(a, status); 4405 b = float64_squash_input_denormal(b, status); 4406 4407 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4408 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4409 ) { 4410 if (float64_is_signaling_nan(a, status) 4411 || float64_is_signaling_nan(b, status)) { 4412 float_raise(float_flag_invalid, status); 4413 } 4414 return 0; 4415 } 4416 aSign = extractFloat64Sign( a ); 4417 bSign = extractFloat64Sign( b ); 4418 av = float64_val(a); 4419 bv = float64_val(b); 4420 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4421 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4422 4423 } 4424 4425 /*---------------------------------------------------------------------------- 4426 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4427 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4428 | comparison is performed according to the IEC/IEEE Standard for Binary 4429 | Floating-Point Arithmetic. 4430 *----------------------------------------------------------------------------*/ 4431 4432 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 4433 { 4434 a = float64_squash_input_denormal(a, status); 4435 b = float64_squash_input_denormal(b, status); 4436 4437 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4438 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4439 ) { 4440 if (float64_is_signaling_nan(a, status) 4441 || float64_is_signaling_nan(b, status)) { 4442 float_raise(float_flag_invalid, status); 4443 } 4444 return 1; 4445 } 4446 return 0; 4447 } 4448 4449 /*---------------------------------------------------------------------------- 4450 | Returns the result of converting the extended double-precision floating- 4451 | point value `a' to the 32-bit two's complement integer format. The 4452 | conversion is performed according to the IEC/IEEE Standard for Binary 4453 | Floating-Point Arithmetic---which means in particular that the conversion 4454 | is rounded according to the current rounding mode. If `a' is a NaN, the 4455 | largest positive integer is returned. Otherwise, if the conversion 4456 | overflows, the largest integer with the same sign as `a' is returned. 4457 *----------------------------------------------------------------------------*/ 4458 4459 int32_t floatx80_to_int32(floatx80 a, float_status *status) 4460 { 4461 flag aSign; 4462 int32_t aExp, shiftCount; 4463 uint64_t aSig; 4464 4465 if (floatx80_invalid_encoding(a)) { 4466 float_raise(float_flag_invalid, status); 4467 return 1 << 31; 4468 } 4469 aSig = extractFloatx80Frac( a ); 4470 aExp = extractFloatx80Exp( a ); 4471 aSign = extractFloatx80Sign( a ); 4472 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4473 shiftCount = 0x4037 - aExp; 4474 if ( shiftCount <= 0 ) shiftCount = 1; 4475 shift64RightJamming( aSig, shiftCount, &aSig ); 4476 return roundAndPackInt32(aSign, aSig, status); 4477 4478 } 4479 4480 /*---------------------------------------------------------------------------- 4481 | Returns the result of converting the extended double-precision floating- 4482 | point value `a' to the 32-bit two's complement integer format. The 4483 | conversion is performed according to the IEC/IEEE Standard for Binary 4484 | Floating-Point Arithmetic, except that the conversion is always rounded 4485 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4486 | Otherwise, if the conversion overflows, the largest integer with the same 4487 | sign as `a' is returned. 4488 *----------------------------------------------------------------------------*/ 4489 4490 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 4491 { 4492 flag aSign; 4493 int32_t aExp, shiftCount; 4494 uint64_t aSig, savedASig; 4495 int32_t z; 4496 4497 if (floatx80_invalid_encoding(a)) { 4498 float_raise(float_flag_invalid, status); 4499 return 1 << 31; 4500 } 4501 aSig = extractFloatx80Frac( a ); 4502 aExp = extractFloatx80Exp( a ); 4503 aSign = extractFloatx80Sign( a ); 4504 if ( 0x401E < aExp ) { 4505 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4506 goto invalid; 4507 } 4508 else if ( aExp < 0x3FFF ) { 4509 if (aExp || aSig) { 4510 status->float_exception_flags |= float_flag_inexact; 4511 } 4512 return 0; 4513 } 4514 shiftCount = 0x403E - aExp; 4515 savedASig = aSig; 4516 aSig >>= shiftCount; 4517 z = aSig; 4518 if ( aSign ) z = - z; 4519 if ( ( z < 0 ) ^ aSign ) { 4520 invalid: 4521 float_raise(float_flag_invalid, status); 4522 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 4523 } 4524 if ( ( aSig<<shiftCount ) != savedASig ) { 4525 status->float_exception_flags |= float_flag_inexact; 4526 } 4527 return z; 4528 4529 } 4530 4531 /*---------------------------------------------------------------------------- 4532 | Returns the result of converting the extended double-precision floating- 4533 | point value `a' to the 64-bit two's complement integer format. The 4534 | conversion is performed according to the IEC/IEEE Standard for Binary 4535 | Floating-Point Arithmetic---which means in particular that the conversion 4536 | is rounded according to the current rounding mode. If `a' is a NaN, 4537 | the largest positive integer is returned. Otherwise, if the conversion 4538 | overflows, the largest integer with the same sign as `a' is returned. 4539 *----------------------------------------------------------------------------*/ 4540 4541 int64_t floatx80_to_int64(floatx80 a, float_status *status) 4542 { 4543 flag aSign; 4544 int32_t aExp, shiftCount; 4545 uint64_t aSig, aSigExtra; 4546 4547 if (floatx80_invalid_encoding(a)) { 4548 float_raise(float_flag_invalid, status); 4549 return 1ULL << 63; 4550 } 4551 aSig = extractFloatx80Frac( a ); 4552 aExp = extractFloatx80Exp( a ); 4553 aSign = extractFloatx80Sign( a ); 4554 shiftCount = 0x403E - aExp; 4555 if ( shiftCount <= 0 ) { 4556 if ( shiftCount ) { 4557 float_raise(float_flag_invalid, status); 4558 if ( ! aSign 4559 || ( ( aExp == 0x7FFF ) 4560 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 4561 ) { 4562 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4563 } 4564 return (int64_t) LIT64( 0x8000000000000000 ); 4565 } 4566 aSigExtra = 0; 4567 } 4568 else { 4569 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 4570 } 4571 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 4572 4573 } 4574 4575 /*---------------------------------------------------------------------------- 4576 | Returns the result of converting the extended double-precision floating- 4577 | point value `a' to the 64-bit two's complement integer format. The 4578 | conversion is performed according to the IEC/IEEE Standard for Binary 4579 | Floating-Point Arithmetic, except that the conversion is always rounded 4580 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4581 | Otherwise, if the conversion overflows, the largest integer with the same 4582 | sign as `a' is returned. 4583 *----------------------------------------------------------------------------*/ 4584 4585 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 4586 { 4587 flag aSign; 4588 int32_t aExp, shiftCount; 4589 uint64_t aSig; 4590 int64_t z; 4591 4592 if (floatx80_invalid_encoding(a)) { 4593 float_raise(float_flag_invalid, status); 4594 return 1ULL << 63; 4595 } 4596 aSig = extractFloatx80Frac( a ); 4597 aExp = extractFloatx80Exp( a ); 4598 aSign = extractFloatx80Sign( a ); 4599 shiftCount = aExp - 0x403E; 4600 if ( 0 <= shiftCount ) { 4601 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 4602 if ( ( a.high != 0xC03E ) || aSig ) { 4603 float_raise(float_flag_invalid, status); 4604 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 4605 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4606 } 4607 } 4608 return (int64_t) LIT64( 0x8000000000000000 ); 4609 } 4610 else if ( aExp < 0x3FFF ) { 4611 if (aExp | aSig) { 4612 status->float_exception_flags |= float_flag_inexact; 4613 } 4614 return 0; 4615 } 4616 z = aSig>>( - shiftCount ); 4617 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 4618 status->float_exception_flags |= float_flag_inexact; 4619 } 4620 if ( aSign ) z = - z; 4621 return z; 4622 4623 } 4624 4625 /*---------------------------------------------------------------------------- 4626 | Returns the result of converting the extended double-precision floating- 4627 | point value `a' to the single-precision floating-point format. The 4628 | conversion is performed according to the IEC/IEEE Standard for Binary 4629 | Floating-Point Arithmetic. 4630 *----------------------------------------------------------------------------*/ 4631 4632 float32 floatx80_to_float32(floatx80 a, float_status *status) 4633 { 4634 flag aSign; 4635 int32_t aExp; 4636 uint64_t aSig; 4637 4638 if (floatx80_invalid_encoding(a)) { 4639 float_raise(float_flag_invalid, status); 4640 return float32_default_nan(status); 4641 } 4642 aSig = extractFloatx80Frac( a ); 4643 aExp = extractFloatx80Exp( a ); 4644 aSign = extractFloatx80Sign( a ); 4645 if ( aExp == 0x7FFF ) { 4646 if ( (uint64_t) ( aSig<<1 ) ) { 4647 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 4648 } 4649 return packFloat32( aSign, 0xFF, 0 ); 4650 } 4651 shift64RightJamming( aSig, 33, &aSig ); 4652 if ( aExp || aSig ) aExp -= 0x3F81; 4653 return roundAndPackFloat32(aSign, aExp, aSig, status); 4654 4655 } 4656 4657 /*---------------------------------------------------------------------------- 4658 | Returns the result of converting the extended double-precision floating- 4659 | point value `a' to the double-precision floating-point format. The 4660 | conversion is performed according to the IEC/IEEE Standard for Binary 4661 | Floating-Point Arithmetic. 4662 *----------------------------------------------------------------------------*/ 4663 4664 float64 floatx80_to_float64(floatx80 a, float_status *status) 4665 { 4666 flag aSign; 4667 int32_t aExp; 4668 uint64_t aSig, zSig; 4669 4670 if (floatx80_invalid_encoding(a)) { 4671 float_raise(float_flag_invalid, status); 4672 return float64_default_nan(status); 4673 } 4674 aSig = extractFloatx80Frac( a ); 4675 aExp = extractFloatx80Exp( a ); 4676 aSign = extractFloatx80Sign( a ); 4677 if ( aExp == 0x7FFF ) { 4678 if ( (uint64_t) ( aSig<<1 ) ) { 4679 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 4680 } 4681 return packFloat64( aSign, 0x7FF, 0 ); 4682 } 4683 shift64RightJamming( aSig, 1, &zSig ); 4684 if ( aExp || aSig ) aExp -= 0x3C01; 4685 return roundAndPackFloat64(aSign, aExp, zSig, status); 4686 4687 } 4688 4689 /*---------------------------------------------------------------------------- 4690 | Returns the result of converting the extended double-precision floating- 4691 | point value `a' to the quadruple-precision floating-point format. The 4692 | conversion is performed according to the IEC/IEEE Standard for Binary 4693 | Floating-Point Arithmetic. 4694 *----------------------------------------------------------------------------*/ 4695 4696 float128 floatx80_to_float128(floatx80 a, float_status *status) 4697 { 4698 flag aSign; 4699 int aExp; 4700 uint64_t aSig, zSig0, zSig1; 4701 4702 if (floatx80_invalid_encoding(a)) { 4703 float_raise(float_flag_invalid, status); 4704 return float128_default_nan(status); 4705 } 4706 aSig = extractFloatx80Frac( a ); 4707 aExp = extractFloatx80Exp( a ); 4708 aSign = extractFloatx80Sign( a ); 4709 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 4710 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 4711 } 4712 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 4713 return packFloat128( aSign, aExp, zSig0, zSig1 ); 4714 4715 } 4716 4717 /*---------------------------------------------------------------------------- 4718 | Rounds the extended double-precision floating-point value `a' 4719 | to the precision provided by floatx80_rounding_precision and returns the 4720 | result as an extended double-precision floating-point value. 4721 | The operation is performed according to the IEC/IEEE Standard for Binary 4722 | Floating-Point Arithmetic. 4723 *----------------------------------------------------------------------------*/ 4724 4725 floatx80 floatx80_round(floatx80 a, float_status *status) 4726 { 4727 return roundAndPackFloatx80(status->floatx80_rounding_precision, 4728 extractFloatx80Sign(a), 4729 extractFloatx80Exp(a), 4730 extractFloatx80Frac(a), 0, status); 4731 } 4732 4733 /*---------------------------------------------------------------------------- 4734 | Rounds the extended double-precision floating-point value `a' to an integer, 4735 | and returns the result as an extended quadruple-precision floating-point 4736 | value. The operation is performed according to the IEC/IEEE Standard for 4737 | Binary Floating-Point Arithmetic. 4738 *----------------------------------------------------------------------------*/ 4739 4740 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 4741 { 4742 flag aSign; 4743 int32_t aExp; 4744 uint64_t lastBitMask, roundBitsMask; 4745 floatx80 z; 4746 4747 if (floatx80_invalid_encoding(a)) { 4748 float_raise(float_flag_invalid, status); 4749 return floatx80_default_nan(status); 4750 } 4751 aExp = extractFloatx80Exp( a ); 4752 if ( 0x403E <= aExp ) { 4753 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 4754 return propagateFloatx80NaN(a, a, status); 4755 } 4756 return a; 4757 } 4758 if ( aExp < 0x3FFF ) { 4759 if ( ( aExp == 0 ) 4760 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 4761 return a; 4762 } 4763 status->float_exception_flags |= float_flag_inexact; 4764 aSign = extractFloatx80Sign( a ); 4765 switch (status->float_rounding_mode) { 4766 case float_round_nearest_even: 4767 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 4768 ) { 4769 return 4770 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4771 } 4772 break; 4773 case float_round_ties_away: 4774 if (aExp == 0x3FFE) { 4775 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 4776 } 4777 break; 4778 case float_round_down: 4779 return 4780 aSign ? 4781 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 4782 : packFloatx80( 0, 0, 0 ); 4783 case float_round_up: 4784 return 4785 aSign ? packFloatx80( 1, 0, 0 ) 4786 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4787 } 4788 return packFloatx80( aSign, 0, 0 ); 4789 } 4790 lastBitMask = 1; 4791 lastBitMask <<= 0x403E - aExp; 4792 roundBitsMask = lastBitMask - 1; 4793 z = a; 4794 switch (status->float_rounding_mode) { 4795 case float_round_nearest_even: 4796 z.low += lastBitMask>>1; 4797 if ((z.low & roundBitsMask) == 0) { 4798 z.low &= ~lastBitMask; 4799 } 4800 break; 4801 case float_round_ties_away: 4802 z.low += lastBitMask >> 1; 4803 break; 4804 case float_round_to_zero: 4805 break; 4806 case float_round_up: 4807 if (!extractFloatx80Sign(z)) { 4808 z.low += roundBitsMask; 4809 } 4810 break; 4811 case float_round_down: 4812 if (extractFloatx80Sign(z)) { 4813 z.low += roundBitsMask; 4814 } 4815 break; 4816 default: 4817 abort(); 4818 } 4819 z.low &= ~ roundBitsMask; 4820 if ( z.low == 0 ) { 4821 ++z.high; 4822 z.low = LIT64( 0x8000000000000000 ); 4823 } 4824 if (z.low != a.low) { 4825 status->float_exception_flags |= float_flag_inexact; 4826 } 4827 return z; 4828 4829 } 4830 4831 /*---------------------------------------------------------------------------- 4832 | Returns the result of adding the absolute values of the extended double- 4833 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 4834 | negated before being returned. `zSign' is ignored if the result is a NaN. 4835 | The addition is performed according to the IEC/IEEE Standard for Binary 4836 | Floating-Point Arithmetic. 4837 *----------------------------------------------------------------------------*/ 4838 4839 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 4840 float_status *status) 4841 { 4842 int32_t aExp, bExp, zExp; 4843 uint64_t aSig, bSig, zSig0, zSig1; 4844 int32_t expDiff; 4845 4846 aSig = extractFloatx80Frac( a ); 4847 aExp = extractFloatx80Exp( a ); 4848 bSig = extractFloatx80Frac( b ); 4849 bExp = extractFloatx80Exp( b ); 4850 expDiff = aExp - bExp; 4851 if ( 0 < expDiff ) { 4852 if ( aExp == 0x7FFF ) { 4853 if ((uint64_t)(aSig << 1)) { 4854 return propagateFloatx80NaN(a, b, status); 4855 } 4856 return a; 4857 } 4858 if ( bExp == 0 ) --expDiff; 4859 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 4860 zExp = aExp; 4861 } 4862 else if ( expDiff < 0 ) { 4863 if ( bExp == 0x7FFF ) { 4864 if ((uint64_t)(bSig << 1)) { 4865 return propagateFloatx80NaN(a, b, status); 4866 } 4867 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4868 } 4869 if ( aExp == 0 ) ++expDiff; 4870 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 4871 zExp = bExp; 4872 } 4873 else { 4874 if ( aExp == 0x7FFF ) { 4875 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 4876 return propagateFloatx80NaN(a, b, status); 4877 } 4878 return a; 4879 } 4880 zSig1 = 0; 4881 zSig0 = aSig + bSig; 4882 if ( aExp == 0 ) { 4883 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 4884 goto roundAndPack; 4885 } 4886 zExp = aExp; 4887 goto shiftRight1; 4888 } 4889 zSig0 = aSig + bSig; 4890 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 4891 shiftRight1: 4892 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 4893 zSig0 |= LIT64( 0x8000000000000000 ); 4894 ++zExp; 4895 roundAndPack: 4896 return roundAndPackFloatx80(status->floatx80_rounding_precision, 4897 zSign, zExp, zSig0, zSig1, status); 4898 } 4899 4900 /*---------------------------------------------------------------------------- 4901 | Returns the result of subtracting the absolute values of the extended 4902 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 4903 | difference is negated before being returned. `zSign' is ignored if the 4904 | result is a NaN. The subtraction is performed according to the IEC/IEEE 4905 | Standard for Binary Floating-Point Arithmetic. 4906 *----------------------------------------------------------------------------*/ 4907 4908 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 4909 float_status *status) 4910 { 4911 int32_t aExp, bExp, zExp; 4912 uint64_t aSig, bSig, zSig0, zSig1; 4913 int32_t expDiff; 4914 4915 aSig = extractFloatx80Frac( a ); 4916 aExp = extractFloatx80Exp( a ); 4917 bSig = extractFloatx80Frac( b ); 4918 bExp = extractFloatx80Exp( b ); 4919 expDiff = aExp - bExp; 4920 if ( 0 < expDiff ) goto aExpBigger; 4921 if ( expDiff < 0 ) goto bExpBigger; 4922 if ( aExp == 0x7FFF ) { 4923 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 4924 return propagateFloatx80NaN(a, b, status); 4925 } 4926 float_raise(float_flag_invalid, status); 4927 return floatx80_default_nan(status); 4928 } 4929 if ( aExp == 0 ) { 4930 aExp = 1; 4931 bExp = 1; 4932 } 4933 zSig1 = 0; 4934 if ( bSig < aSig ) goto aBigger; 4935 if ( aSig < bSig ) goto bBigger; 4936 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 4937 bExpBigger: 4938 if ( bExp == 0x7FFF ) { 4939 if ((uint64_t)(bSig << 1)) { 4940 return propagateFloatx80NaN(a, b, status); 4941 } 4942 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4943 } 4944 if ( aExp == 0 ) ++expDiff; 4945 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 4946 bBigger: 4947 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 4948 zExp = bExp; 4949 zSign ^= 1; 4950 goto normalizeRoundAndPack; 4951 aExpBigger: 4952 if ( aExp == 0x7FFF ) { 4953 if ((uint64_t)(aSig << 1)) { 4954 return propagateFloatx80NaN(a, b, status); 4955 } 4956 return a; 4957 } 4958 if ( bExp == 0 ) --expDiff; 4959 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 4960 aBigger: 4961 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 4962 zExp = aExp; 4963 normalizeRoundAndPack: 4964 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 4965 zSign, zExp, zSig0, zSig1, status); 4966 } 4967 4968 /*---------------------------------------------------------------------------- 4969 | Returns the result of adding the extended double-precision floating-point 4970 | values `a' and `b'. The operation is performed according to the IEC/IEEE 4971 | Standard for Binary Floating-Point Arithmetic. 4972 *----------------------------------------------------------------------------*/ 4973 4974 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 4975 { 4976 flag aSign, bSign; 4977 4978 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 4979 float_raise(float_flag_invalid, status); 4980 return floatx80_default_nan(status); 4981 } 4982 aSign = extractFloatx80Sign( a ); 4983 bSign = extractFloatx80Sign( b ); 4984 if ( aSign == bSign ) { 4985 return addFloatx80Sigs(a, b, aSign, status); 4986 } 4987 else { 4988 return subFloatx80Sigs(a, b, aSign, status); 4989 } 4990 4991 } 4992 4993 /*---------------------------------------------------------------------------- 4994 | Returns the result of subtracting the extended double-precision floating- 4995 | point values `a' and `b'. The operation is performed according to the 4996 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4997 *----------------------------------------------------------------------------*/ 4998 4999 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5000 { 5001 flag aSign, bSign; 5002 5003 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5004 float_raise(float_flag_invalid, status); 5005 return floatx80_default_nan(status); 5006 } 5007 aSign = extractFloatx80Sign( a ); 5008 bSign = extractFloatx80Sign( b ); 5009 if ( aSign == bSign ) { 5010 return subFloatx80Sigs(a, b, aSign, status); 5011 } 5012 else { 5013 return addFloatx80Sigs(a, b, aSign, status); 5014 } 5015 5016 } 5017 5018 /*---------------------------------------------------------------------------- 5019 | Returns the result of multiplying the extended double-precision floating- 5020 | point values `a' and `b'. The operation is performed according to the 5021 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5022 *----------------------------------------------------------------------------*/ 5023 5024 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5025 { 5026 flag aSign, bSign, zSign; 5027 int32_t aExp, bExp, zExp; 5028 uint64_t aSig, bSig, zSig0, zSig1; 5029 5030 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5031 float_raise(float_flag_invalid, status); 5032 return floatx80_default_nan(status); 5033 } 5034 aSig = extractFloatx80Frac( a ); 5035 aExp = extractFloatx80Exp( a ); 5036 aSign = extractFloatx80Sign( a ); 5037 bSig = extractFloatx80Frac( b ); 5038 bExp = extractFloatx80Exp( b ); 5039 bSign = extractFloatx80Sign( b ); 5040 zSign = aSign ^ bSign; 5041 if ( aExp == 0x7FFF ) { 5042 if ( (uint64_t) ( aSig<<1 ) 5043 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5044 return propagateFloatx80NaN(a, b, status); 5045 } 5046 if ( ( bExp | bSig ) == 0 ) goto invalid; 5047 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5048 } 5049 if ( bExp == 0x7FFF ) { 5050 if ((uint64_t)(bSig << 1)) { 5051 return propagateFloatx80NaN(a, b, status); 5052 } 5053 if ( ( aExp | aSig ) == 0 ) { 5054 invalid: 5055 float_raise(float_flag_invalid, status); 5056 return floatx80_default_nan(status); 5057 } 5058 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5059 } 5060 if ( aExp == 0 ) { 5061 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5062 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5063 } 5064 if ( bExp == 0 ) { 5065 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5066 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5067 } 5068 zExp = aExp + bExp - 0x3FFE; 5069 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 5070 if ( 0 < (int64_t) zSig0 ) { 5071 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5072 --zExp; 5073 } 5074 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5075 zSign, zExp, zSig0, zSig1, status); 5076 } 5077 5078 /*---------------------------------------------------------------------------- 5079 | Returns the result of dividing the extended double-precision floating-point 5080 | value `a' by the corresponding value `b'. The operation is performed 5081 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5082 *----------------------------------------------------------------------------*/ 5083 5084 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 5085 { 5086 flag aSign, bSign, zSign; 5087 int32_t aExp, bExp, zExp; 5088 uint64_t aSig, bSig, zSig0, zSig1; 5089 uint64_t rem0, rem1, rem2, term0, term1, term2; 5090 5091 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5092 float_raise(float_flag_invalid, status); 5093 return floatx80_default_nan(status); 5094 } 5095 aSig = extractFloatx80Frac( a ); 5096 aExp = extractFloatx80Exp( a ); 5097 aSign = extractFloatx80Sign( a ); 5098 bSig = extractFloatx80Frac( b ); 5099 bExp = extractFloatx80Exp( b ); 5100 bSign = extractFloatx80Sign( b ); 5101 zSign = aSign ^ bSign; 5102 if ( aExp == 0x7FFF ) { 5103 if ((uint64_t)(aSig << 1)) { 5104 return propagateFloatx80NaN(a, b, status); 5105 } 5106 if ( bExp == 0x7FFF ) { 5107 if ((uint64_t)(bSig << 1)) { 5108 return propagateFloatx80NaN(a, b, status); 5109 } 5110 goto invalid; 5111 } 5112 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5113 } 5114 if ( bExp == 0x7FFF ) { 5115 if ((uint64_t)(bSig << 1)) { 5116 return propagateFloatx80NaN(a, b, status); 5117 } 5118 return packFloatx80( zSign, 0, 0 ); 5119 } 5120 if ( bExp == 0 ) { 5121 if ( bSig == 0 ) { 5122 if ( ( aExp | aSig ) == 0 ) { 5123 invalid: 5124 float_raise(float_flag_invalid, status); 5125 return floatx80_default_nan(status); 5126 } 5127 float_raise(float_flag_divbyzero, status); 5128 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5129 } 5130 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5131 } 5132 if ( aExp == 0 ) { 5133 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5134 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5135 } 5136 zExp = aExp - bExp + 0x3FFE; 5137 rem1 = 0; 5138 if ( bSig <= aSig ) { 5139 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 5140 ++zExp; 5141 } 5142 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 5143 mul64To128( bSig, zSig0, &term0, &term1 ); 5144 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 5145 while ( (int64_t) rem0 < 0 ) { 5146 --zSig0; 5147 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 5148 } 5149 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 5150 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 5151 mul64To128( bSig, zSig1, &term1, &term2 ); 5152 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5153 while ( (int64_t) rem1 < 0 ) { 5154 --zSig1; 5155 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5156 } 5157 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5158 } 5159 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5160 zSign, zExp, zSig0, zSig1, status); 5161 } 5162 5163 /*---------------------------------------------------------------------------- 5164 | Returns the remainder of the extended double-precision floating-point value 5165 | `a' with respect to the corresponding value `b'. The operation is performed 5166 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5167 *----------------------------------------------------------------------------*/ 5168 5169 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 5170 { 5171 flag aSign, zSign; 5172 int32_t aExp, bExp, expDiff; 5173 uint64_t aSig0, aSig1, bSig; 5174 uint64_t q, term0, term1, alternateASig0, alternateASig1; 5175 5176 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5177 float_raise(float_flag_invalid, status); 5178 return floatx80_default_nan(status); 5179 } 5180 aSig0 = extractFloatx80Frac( a ); 5181 aExp = extractFloatx80Exp( a ); 5182 aSign = extractFloatx80Sign( a ); 5183 bSig = extractFloatx80Frac( b ); 5184 bExp = extractFloatx80Exp( b ); 5185 if ( aExp == 0x7FFF ) { 5186 if ( (uint64_t) ( aSig0<<1 ) 5187 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5188 return propagateFloatx80NaN(a, b, status); 5189 } 5190 goto invalid; 5191 } 5192 if ( bExp == 0x7FFF ) { 5193 if ((uint64_t)(bSig << 1)) { 5194 return propagateFloatx80NaN(a, b, status); 5195 } 5196 return a; 5197 } 5198 if ( bExp == 0 ) { 5199 if ( bSig == 0 ) { 5200 invalid: 5201 float_raise(float_flag_invalid, status); 5202 return floatx80_default_nan(status); 5203 } 5204 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5205 } 5206 if ( aExp == 0 ) { 5207 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5208 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5209 } 5210 bSig |= LIT64( 0x8000000000000000 ); 5211 zSign = aSign; 5212 expDiff = aExp - bExp; 5213 aSig1 = 0; 5214 if ( expDiff < 0 ) { 5215 if ( expDiff < -1 ) return a; 5216 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 5217 expDiff = 0; 5218 } 5219 q = ( bSig <= aSig0 ); 5220 if ( q ) aSig0 -= bSig; 5221 expDiff -= 64; 5222 while ( 0 < expDiff ) { 5223 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5224 q = ( 2 < q ) ? q - 2 : 0; 5225 mul64To128( bSig, q, &term0, &term1 ); 5226 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5227 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 5228 expDiff -= 62; 5229 } 5230 expDiff += 64; 5231 if ( 0 < expDiff ) { 5232 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5233 q = ( 2 < q ) ? q - 2 : 0; 5234 q >>= 64 - expDiff; 5235 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 5236 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5237 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 5238 while ( le128( term0, term1, aSig0, aSig1 ) ) { 5239 ++q; 5240 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5241 } 5242 } 5243 else { 5244 term1 = 0; 5245 term0 = bSig; 5246 } 5247 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 5248 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5249 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5250 && ( q & 1 ) ) 5251 ) { 5252 aSig0 = alternateASig0; 5253 aSig1 = alternateASig1; 5254 zSign = ! zSign; 5255 } 5256 return 5257 normalizeRoundAndPackFloatx80( 5258 80, zSign, bExp + expDiff, aSig0, aSig1, status); 5259 5260 } 5261 5262 /*---------------------------------------------------------------------------- 5263 | Returns the square root of the extended double-precision floating-point 5264 | value `a'. The operation is performed according to the IEC/IEEE Standard 5265 | for Binary Floating-Point Arithmetic. 5266 *----------------------------------------------------------------------------*/ 5267 5268 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 5269 { 5270 flag aSign; 5271 int32_t aExp, zExp; 5272 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 5273 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5274 5275 if (floatx80_invalid_encoding(a)) { 5276 float_raise(float_flag_invalid, status); 5277 return floatx80_default_nan(status); 5278 } 5279 aSig0 = extractFloatx80Frac( a ); 5280 aExp = extractFloatx80Exp( a ); 5281 aSign = extractFloatx80Sign( a ); 5282 if ( aExp == 0x7FFF ) { 5283 if ((uint64_t)(aSig0 << 1)) { 5284 return propagateFloatx80NaN(a, a, status); 5285 } 5286 if ( ! aSign ) return a; 5287 goto invalid; 5288 } 5289 if ( aSign ) { 5290 if ( ( aExp | aSig0 ) == 0 ) return a; 5291 invalid: 5292 float_raise(float_flag_invalid, status); 5293 return floatx80_default_nan(status); 5294 } 5295 if ( aExp == 0 ) { 5296 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 5297 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5298 } 5299 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 5300 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 5301 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 5302 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5303 doubleZSig0 = zSig0<<1; 5304 mul64To128( zSig0, zSig0, &term0, &term1 ); 5305 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5306 while ( (int64_t) rem0 < 0 ) { 5307 --zSig0; 5308 doubleZSig0 -= 2; 5309 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5310 } 5311 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5312 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 5313 if ( zSig1 == 0 ) zSig1 = 1; 5314 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5315 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5316 mul64To128( zSig1, zSig1, &term2, &term3 ); 5317 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5318 while ( (int64_t) rem1 < 0 ) { 5319 --zSig1; 5320 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5321 term3 |= 1; 5322 term2 |= doubleZSig0; 5323 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5324 } 5325 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5326 } 5327 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 5328 zSig0 |= doubleZSig0; 5329 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5330 0, zExp, zSig0, zSig1, status); 5331 } 5332 5333 /*---------------------------------------------------------------------------- 5334 | Returns 1 if the extended double-precision floating-point value `a' is equal 5335 | to the corresponding value `b', and 0 otherwise. The invalid exception is 5336 | raised if either operand is a NaN. Otherwise, the comparison is performed 5337 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5338 *----------------------------------------------------------------------------*/ 5339 5340 int floatx80_eq(floatx80 a, floatx80 b, float_status *status) 5341 { 5342 5343 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5344 || (extractFloatx80Exp(a) == 0x7FFF 5345 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5346 || (extractFloatx80Exp(b) == 0x7FFF 5347 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5348 ) { 5349 float_raise(float_flag_invalid, status); 5350 return 0; 5351 } 5352 return 5353 ( a.low == b.low ) 5354 && ( ( a.high == b.high ) 5355 || ( ( a.low == 0 ) 5356 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5357 ); 5358 5359 } 5360 5361 /*---------------------------------------------------------------------------- 5362 | Returns 1 if the extended double-precision floating-point value `a' is 5363 | less than or equal to the corresponding value `b', and 0 otherwise. The 5364 | invalid exception is raised if either operand is a NaN. The comparison is 5365 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5366 | Arithmetic. 5367 *----------------------------------------------------------------------------*/ 5368 5369 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 5370 { 5371 flag aSign, bSign; 5372 5373 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5374 || (extractFloatx80Exp(a) == 0x7FFF 5375 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5376 || (extractFloatx80Exp(b) == 0x7FFF 5377 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5378 ) { 5379 float_raise(float_flag_invalid, status); 5380 return 0; 5381 } 5382 aSign = extractFloatx80Sign( a ); 5383 bSign = extractFloatx80Sign( b ); 5384 if ( aSign != bSign ) { 5385 return 5386 aSign 5387 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5388 == 0 ); 5389 } 5390 return 5391 aSign ? le128( b.high, b.low, a.high, a.low ) 5392 : le128( a.high, a.low, b.high, b.low ); 5393 5394 } 5395 5396 /*---------------------------------------------------------------------------- 5397 | Returns 1 if the extended double-precision floating-point value `a' is 5398 | less than the corresponding value `b', and 0 otherwise. The invalid 5399 | exception is raised if either operand is a NaN. The comparison is performed 5400 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5401 *----------------------------------------------------------------------------*/ 5402 5403 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 5404 { 5405 flag aSign, bSign; 5406 5407 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5408 || (extractFloatx80Exp(a) == 0x7FFF 5409 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5410 || (extractFloatx80Exp(b) == 0x7FFF 5411 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5412 ) { 5413 float_raise(float_flag_invalid, status); 5414 return 0; 5415 } 5416 aSign = extractFloatx80Sign( a ); 5417 bSign = extractFloatx80Sign( b ); 5418 if ( aSign != bSign ) { 5419 return 5420 aSign 5421 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5422 != 0 ); 5423 } 5424 return 5425 aSign ? lt128( b.high, b.low, a.high, a.low ) 5426 : lt128( a.high, a.low, b.high, b.low ); 5427 5428 } 5429 5430 /*---------------------------------------------------------------------------- 5431 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5432 | cannot be compared, and 0 otherwise. The invalid exception is raised if 5433 | either operand is a NaN. The comparison is performed according to the 5434 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5435 *----------------------------------------------------------------------------*/ 5436 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 5437 { 5438 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5439 || (extractFloatx80Exp(a) == 0x7FFF 5440 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5441 || (extractFloatx80Exp(b) == 0x7FFF 5442 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5443 ) { 5444 float_raise(float_flag_invalid, status); 5445 return 1; 5446 } 5447 return 0; 5448 } 5449 5450 /*---------------------------------------------------------------------------- 5451 | Returns 1 if the extended double-precision floating-point value `a' is 5452 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5453 | cause an exception. The comparison is performed according to the IEC/IEEE 5454 | Standard for Binary Floating-Point Arithmetic. 5455 *----------------------------------------------------------------------------*/ 5456 5457 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 5458 { 5459 5460 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5461 float_raise(float_flag_invalid, status); 5462 return 0; 5463 } 5464 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5465 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5466 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5467 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5468 ) { 5469 if (floatx80_is_signaling_nan(a, status) 5470 || floatx80_is_signaling_nan(b, status)) { 5471 float_raise(float_flag_invalid, status); 5472 } 5473 return 0; 5474 } 5475 return 5476 ( a.low == b.low ) 5477 && ( ( a.high == b.high ) 5478 || ( ( a.low == 0 ) 5479 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5480 ); 5481 5482 } 5483 5484 /*---------------------------------------------------------------------------- 5485 | Returns 1 if the extended double-precision floating-point value `a' is less 5486 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 5487 | do not cause an exception. Otherwise, the comparison is performed according 5488 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5489 *----------------------------------------------------------------------------*/ 5490 5491 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status) 5492 { 5493 flag aSign, bSign; 5494 5495 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5496 float_raise(float_flag_invalid, status); 5497 return 0; 5498 } 5499 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5500 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5501 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5502 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5503 ) { 5504 if (floatx80_is_signaling_nan(a, status) 5505 || floatx80_is_signaling_nan(b, status)) { 5506 float_raise(float_flag_invalid, status); 5507 } 5508 return 0; 5509 } 5510 aSign = extractFloatx80Sign( a ); 5511 bSign = extractFloatx80Sign( b ); 5512 if ( aSign != bSign ) { 5513 return 5514 aSign 5515 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5516 == 0 ); 5517 } 5518 return 5519 aSign ? le128( b.high, b.low, a.high, a.low ) 5520 : le128( a.high, a.low, b.high, b.low ); 5521 5522 } 5523 5524 /*---------------------------------------------------------------------------- 5525 | Returns 1 if the extended double-precision floating-point value `a' is less 5526 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 5527 | an exception. Otherwise, the comparison is performed according to the 5528 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5529 *----------------------------------------------------------------------------*/ 5530 5531 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 5532 { 5533 flag aSign, bSign; 5534 5535 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5536 float_raise(float_flag_invalid, status); 5537 return 0; 5538 } 5539 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5540 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5541 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5542 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5543 ) { 5544 if (floatx80_is_signaling_nan(a, status) 5545 || floatx80_is_signaling_nan(b, status)) { 5546 float_raise(float_flag_invalid, status); 5547 } 5548 return 0; 5549 } 5550 aSign = extractFloatx80Sign( a ); 5551 bSign = extractFloatx80Sign( b ); 5552 if ( aSign != bSign ) { 5553 return 5554 aSign 5555 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5556 != 0 ); 5557 } 5558 return 5559 aSign ? lt128( b.high, b.low, a.high, a.low ) 5560 : lt128( a.high, a.low, b.high, b.low ); 5561 5562 } 5563 5564 /*---------------------------------------------------------------------------- 5565 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5566 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 5567 | The comparison is performed according to the IEC/IEEE Standard for Binary 5568 | Floating-Point Arithmetic. 5569 *----------------------------------------------------------------------------*/ 5570 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 5571 { 5572 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5573 float_raise(float_flag_invalid, status); 5574 return 1; 5575 } 5576 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5577 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5578 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5579 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5580 ) { 5581 if (floatx80_is_signaling_nan(a, status) 5582 || floatx80_is_signaling_nan(b, status)) { 5583 float_raise(float_flag_invalid, status); 5584 } 5585 return 1; 5586 } 5587 return 0; 5588 } 5589 5590 /*---------------------------------------------------------------------------- 5591 | Returns the result of converting the quadruple-precision floating-point 5592 | value `a' to the 32-bit two's complement integer format. The conversion 5593 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5594 | Arithmetic---which means in particular that the conversion is rounded 5595 | according to the current rounding mode. If `a' is a NaN, the largest 5596 | positive integer is returned. Otherwise, if the conversion overflows, the 5597 | largest integer with the same sign as `a' is returned. 5598 *----------------------------------------------------------------------------*/ 5599 5600 int32_t float128_to_int32(float128 a, float_status *status) 5601 { 5602 flag aSign; 5603 int32_t aExp, shiftCount; 5604 uint64_t aSig0, aSig1; 5605 5606 aSig1 = extractFloat128Frac1( a ); 5607 aSig0 = extractFloat128Frac0( a ); 5608 aExp = extractFloat128Exp( a ); 5609 aSign = extractFloat128Sign( a ); 5610 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 5611 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5612 aSig0 |= ( aSig1 != 0 ); 5613 shiftCount = 0x4028 - aExp; 5614 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 5615 return roundAndPackInt32(aSign, aSig0, status); 5616 5617 } 5618 5619 /*---------------------------------------------------------------------------- 5620 | Returns the result of converting the quadruple-precision floating-point 5621 | value `a' to the 32-bit two's complement integer format. The conversion 5622 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5623 | Arithmetic, except that the conversion is always rounded toward zero. If 5624 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 5625 | conversion overflows, the largest integer with the same sign as `a' is 5626 | returned. 5627 *----------------------------------------------------------------------------*/ 5628 5629 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 5630 { 5631 flag aSign; 5632 int32_t aExp, shiftCount; 5633 uint64_t aSig0, aSig1, savedASig; 5634 int32_t z; 5635 5636 aSig1 = extractFloat128Frac1( a ); 5637 aSig0 = extractFloat128Frac0( a ); 5638 aExp = extractFloat128Exp( a ); 5639 aSign = extractFloat128Sign( a ); 5640 aSig0 |= ( aSig1 != 0 ); 5641 if ( 0x401E < aExp ) { 5642 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 5643 goto invalid; 5644 } 5645 else if ( aExp < 0x3FFF ) { 5646 if (aExp || aSig0) { 5647 status->float_exception_flags |= float_flag_inexact; 5648 } 5649 return 0; 5650 } 5651 aSig0 |= LIT64( 0x0001000000000000 ); 5652 shiftCount = 0x402F - aExp; 5653 savedASig = aSig0; 5654 aSig0 >>= shiftCount; 5655 z = aSig0; 5656 if ( aSign ) z = - z; 5657 if ( ( z < 0 ) ^ aSign ) { 5658 invalid: 5659 float_raise(float_flag_invalid, status); 5660 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5661 } 5662 if ( ( aSig0<<shiftCount ) != savedASig ) { 5663 status->float_exception_flags |= float_flag_inexact; 5664 } 5665 return z; 5666 5667 } 5668 5669 /*---------------------------------------------------------------------------- 5670 | Returns the result of converting the quadruple-precision floating-point 5671 | value `a' to the 64-bit two's complement integer format. The conversion 5672 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5673 | Arithmetic---which means in particular that the conversion is rounded 5674 | according to the current rounding mode. If `a' is a NaN, the largest 5675 | positive integer is returned. Otherwise, if the conversion overflows, the 5676 | largest integer with the same sign as `a' is returned. 5677 *----------------------------------------------------------------------------*/ 5678 5679 int64_t float128_to_int64(float128 a, float_status *status) 5680 { 5681 flag aSign; 5682 int32_t aExp, shiftCount; 5683 uint64_t aSig0, aSig1; 5684 5685 aSig1 = extractFloat128Frac1( a ); 5686 aSig0 = extractFloat128Frac0( a ); 5687 aExp = extractFloat128Exp( a ); 5688 aSign = extractFloat128Sign( a ); 5689 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5690 shiftCount = 0x402F - aExp; 5691 if ( shiftCount <= 0 ) { 5692 if ( 0x403E < aExp ) { 5693 float_raise(float_flag_invalid, status); 5694 if ( ! aSign 5695 || ( ( aExp == 0x7FFF ) 5696 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 5697 ) 5698 ) { 5699 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5700 } 5701 return (int64_t) LIT64( 0x8000000000000000 ); 5702 } 5703 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 5704 } 5705 else { 5706 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 5707 } 5708 return roundAndPackInt64(aSign, aSig0, aSig1, status); 5709 5710 } 5711 5712 /*---------------------------------------------------------------------------- 5713 | Returns the result of converting the quadruple-precision floating-point 5714 | value `a' to the 64-bit two's complement integer format. The conversion 5715 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5716 | Arithmetic, except that the conversion is always rounded toward zero. 5717 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 5718 | the conversion overflows, the largest integer with the same sign as `a' is 5719 | returned. 5720 *----------------------------------------------------------------------------*/ 5721 5722 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 5723 { 5724 flag aSign; 5725 int32_t aExp, shiftCount; 5726 uint64_t aSig0, aSig1; 5727 int64_t z; 5728 5729 aSig1 = extractFloat128Frac1( a ); 5730 aSig0 = extractFloat128Frac0( a ); 5731 aExp = extractFloat128Exp( a ); 5732 aSign = extractFloat128Sign( a ); 5733 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5734 shiftCount = aExp - 0x402F; 5735 if ( 0 < shiftCount ) { 5736 if ( 0x403E <= aExp ) { 5737 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 5738 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 5739 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 5740 if (aSig1) { 5741 status->float_exception_flags |= float_flag_inexact; 5742 } 5743 } 5744 else { 5745 float_raise(float_flag_invalid, status); 5746 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 5747 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5748 } 5749 } 5750 return (int64_t) LIT64( 0x8000000000000000 ); 5751 } 5752 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 5753 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 5754 status->float_exception_flags |= float_flag_inexact; 5755 } 5756 } 5757 else { 5758 if ( aExp < 0x3FFF ) { 5759 if ( aExp | aSig0 | aSig1 ) { 5760 status->float_exception_flags |= float_flag_inexact; 5761 } 5762 return 0; 5763 } 5764 z = aSig0>>( - shiftCount ); 5765 if ( aSig1 5766 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 5767 status->float_exception_flags |= float_flag_inexact; 5768 } 5769 } 5770 if ( aSign ) z = - z; 5771 return z; 5772 5773 } 5774 5775 /*---------------------------------------------------------------------------- 5776 | Returns the result of converting the quadruple-precision floating-point value 5777 | `a' to the 64-bit unsigned integer format. The conversion is 5778 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5779 | Arithmetic---which means in particular that the conversion is rounded 5780 | according to the current rounding mode. If `a' is a NaN, the largest 5781 | positive integer is returned. If the conversion overflows, the 5782 | largest unsigned integer is returned. If 'a' is negative, the value is 5783 | rounded and zero is returned; negative values that do not round to zero 5784 | will raise the inexact exception. 5785 *----------------------------------------------------------------------------*/ 5786 5787 uint64_t float128_to_uint64(float128 a, float_status *status) 5788 { 5789 flag aSign; 5790 int aExp; 5791 int shiftCount; 5792 uint64_t aSig0, aSig1; 5793 5794 aSig0 = extractFloat128Frac0(a); 5795 aSig1 = extractFloat128Frac1(a); 5796 aExp = extractFloat128Exp(a); 5797 aSign = extractFloat128Sign(a); 5798 if (aSign && (aExp > 0x3FFE)) { 5799 float_raise(float_flag_invalid, status); 5800 if (float128_is_any_nan(a)) { 5801 return LIT64(0xFFFFFFFFFFFFFFFF); 5802 } else { 5803 return 0; 5804 } 5805 } 5806 if (aExp) { 5807 aSig0 |= LIT64(0x0001000000000000); 5808 } 5809 shiftCount = 0x402F - aExp; 5810 if (shiftCount <= 0) { 5811 if (0x403E < aExp) { 5812 float_raise(float_flag_invalid, status); 5813 return LIT64(0xFFFFFFFFFFFFFFFF); 5814 } 5815 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 5816 } else { 5817 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 5818 } 5819 return roundAndPackUint64(aSign, aSig0, aSig1, status); 5820 } 5821 5822 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 5823 { 5824 uint64_t v; 5825 signed char current_rounding_mode = status->float_rounding_mode; 5826 5827 set_float_rounding_mode(float_round_to_zero, status); 5828 v = float128_to_uint64(a, status); 5829 set_float_rounding_mode(current_rounding_mode, status); 5830 5831 return v; 5832 } 5833 5834 /*---------------------------------------------------------------------------- 5835 | Returns the result of converting the quadruple-precision floating-point 5836 | value `a' to the 32-bit unsigned integer format. The conversion 5837 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5838 | Arithmetic except that the conversion is always rounded toward zero. 5839 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 5840 | if the conversion overflows, the largest unsigned integer is returned. 5841 | If 'a' is negative, the value is rounded and zero is returned; negative 5842 | values that do not round to zero will raise the inexact exception. 5843 *----------------------------------------------------------------------------*/ 5844 5845 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 5846 { 5847 uint64_t v; 5848 uint32_t res; 5849 int old_exc_flags = get_float_exception_flags(status); 5850 5851 v = float128_to_uint64_round_to_zero(a, status); 5852 if (v > 0xffffffff) { 5853 res = 0xffffffff; 5854 } else { 5855 return v; 5856 } 5857 set_float_exception_flags(old_exc_flags, status); 5858 float_raise(float_flag_invalid, status); 5859 return res; 5860 } 5861 5862 /*---------------------------------------------------------------------------- 5863 | Returns the result of converting the quadruple-precision floating-point 5864 | value `a' to the single-precision floating-point format. The conversion 5865 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5866 | Arithmetic. 5867 *----------------------------------------------------------------------------*/ 5868 5869 float32 float128_to_float32(float128 a, float_status *status) 5870 { 5871 flag aSign; 5872 int32_t aExp; 5873 uint64_t aSig0, aSig1; 5874 uint32_t zSig; 5875 5876 aSig1 = extractFloat128Frac1( a ); 5877 aSig0 = extractFloat128Frac0( a ); 5878 aExp = extractFloat128Exp( a ); 5879 aSign = extractFloat128Sign( a ); 5880 if ( aExp == 0x7FFF ) { 5881 if ( aSig0 | aSig1 ) { 5882 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 5883 } 5884 return packFloat32( aSign, 0xFF, 0 ); 5885 } 5886 aSig0 |= ( aSig1 != 0 ); 5887 shift64RightJamming( aSig0, 18, &aSig0 ); 5888 zSig = aSig0; 5889 if ( aExp || zSig ) { 5890 zSig |= 0x40000000; 5891 aExp -= 0x3F81; 5892 } 5893 return roundAndPackFloat32(aSign, aExp, zSig, status); 5894 5895 } 5896 5897 /*---------------------------------------------------------------------------- 5898 | Returns the result of converting the quadruple-precision floating-point 5899 | value `a' to the double-precision floating-point format. The conversion 5900 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5901 | Arithmetic. 5902 *----------------------------------------------------------------------------*/ 5903 5904 float64 float128_to_float64(float128 a, float_status *status) 5905 { 5906 flag aSign; 5907 int32_t aExp; 5908 uint64_t aSig0, aSig1; 5909 5910 aSig1 = extractFloat128Frac1( a ); 5911 aSig0 = extractFloat128Frac0( a ); 5912 aExp = extractFloat128Exp( a ); 5913 aSign = extractFloat128Sign( a ); 5914 if ( aExp == 0x7FFF ) { 5915 if ( aSig0 | aSig1 ) { 5916 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 5917 } 5918 return packFloat64( aSign, 0x7FF, 0 ); 5919 } 5920 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 5921 aSig0 |= ( aSig1 != 0 ); 5922 if ( aExp || aSig0 ) { 5923 aSig0 |= LIT64( 0x4000000000000000 ); 5924 aExp -= 0x3C01; 5925 } 5926 return roundAndPackFloat64(aSign, aExp, aSig0, status); 5927 5928 } 5929 5930 /*---------------------------------------------------------------------------- 5931 | Returns the result of converting the quadruple-precision floating-point 5932 | value `a' to the extended double-precision floating-point format. The 5933 | conversion is performed according to the IEC/IEEE Standard for Binary 5934 | Floating-Point Arithmetic. 5935 *----------------------------------------------------------------------------*/ 5936 5937 floatx80 float128_to_floatx80(float128 a, float_status *status) 5938 { 5939 flag aSign; 5940 int32_t aExp; 5941 uint64_t aSig0, aSig1; 5942 5943 aSig1 = extractFloat128Frac1( a ); 5944 aSig0 = extractFloat128Frac0( a ); 5945 aExp = extractFloat128Exp( a ); 5946 aSign = extractFloat128Sign( a ); 5947 if ( aExp == 0x7FFF ) { 5948 if ( aSig0 | aSig1 ) { 5949 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 5950 } 5951 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5952 } 5953 if ( aExp == 0 ) { 5954 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 5955 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5956 } 5957 else { 5958 aSig0 |= LIT64( 0x0001000000000000 ); 5959 } 5960 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 5961 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 5962 5963 } 5964 5965 /*---------------------------------------------------------------------------- 5966 | Rounds the quadruple-precision floating-point value `a' to an integer, and 5967 | returns the result as a quadruple-precision floating-point value. The 5968 | operation is performed according to the IEC/IEEE Standard for Binary 5969 | Floating-Point Arithmetic. 5970 *----------------------------------------------------------------------------*/ 5971 5972 float128 float128_round_to_int(float128 a, float_status *status) 5973 { 5974 flag aSign; 5975 int32_t aExp; 5976 uint64_t lastBitMask, roundBitsMask; 5977 float128 z; 5978 5979 aExp = extractFloat128Exp( a ); 5980 if ( 0x402F <= aExp ) { 5981 if ( 0x406F <= aExp ) { 5982 if ( ( aExp == 0x7FFF ) 5983 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 5984 ) { 5985 return propagateFloat128NaN(a, a, status); 5986 } 5987 return a; 5988 } 5989 lastBitMask = 1; 5990 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 5991 roundBitsMask = lastBitMask - 1; 5992 z = a; 5993 switch (status->float_rounding_mode) { 5994 case float_round_nearest_even: 5995 if ( lastBitMask ) { 5996 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 5997 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 5998 } 5999 else { 6000 if ( (int64_t) z.low < 0 ) { 6001 ++z.high; 6002 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 6003 } 6004 } 6005 break; 6006 case float_round_ties_away: 6007 if (lastBitMask) { 6008 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 6009 } else { 6010 if ((int64_t) z.low < 0) { 6011 ++z.high; 6012 } 6013 } 6014 break; 6015 case float_round_to_zero: 6016 break; 6017 case float_round_up: 6018 if (!extractFloat128Sign(z)) { 6019 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6020 } 6021 break; 6022 case float_round_down: 6023 if (extractFloat128Sign(z)) { 6024 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6025 } 6026 break; 6027 default: 6028 abort(); 6029 } 6030 z.low &= ~ roundBitsMask; 6031 } 6032 else { 6033 if ( aExp < 0x3FFF ) { 6034 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 6035 status->float_exception_flags |= float_flag_inexact; 6036 aSign = extractFloat128Sign( a ); 6037 switch (status->float_rounding_mode) { 6038 case float_round_nearest_even: 6039 if ( ( aExp == 0x3FFE ) 6040 && ( extractFloat128Frac0( a ) 6041 | extractFloat128Frac1( a ) ) 6042 ) { 6043 return packFloat128( aSign, 0x3FFF, 0, 0 ); 6044 } 6045 break; 6046 case float_round_ties_away: 6047 if (aExp == 0x3FFE) { 6048 return packFloat128(aSign, 0x3FFF, 0, 0); 6049 } 6050 break; 6051 case float_round_down: 6052 return 6053 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 6054 : packFloat128( 0, 0, 0, 0 ); 6055 case float_round_up: 6056 return 6057 aSign ? packFloat128( 1, 0, 0, 0 ) 6058 : packFloat128( 0, 0x3FFF, 0, 0 ); 6059 } 6060 return packFloat128( aSign, 0, 0, 0 ); 6061 } 6062 lastBitMask = 1; 6063 lastBitMask <<= 0x402F - aExp; 6064 roundBitsMask = lastBitMask - 1; 6065 z.low = 0; 6066 z.high = a.high; 6067 switch (status->float_rounding_mode) { 6068 case float_round_nearest_even: 6069 z.high += lastBitMask>>1; 6070 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 6071 z.high &= ~ lastBitMask; 6072 } 6073 break; 6074 case float_round_ties_away: 6075 z.high += lastBitMask>>1; 6076 break; 6077 case float_round_to_zero: 6078 break; 6079 case float_round_up: 6080 if (!extractFloat128Sign(z)) { 6081 z.high |= ( a.low != 0 ); 6082 z.high += roundBitsMask; 6083 } 6084 break; 6085 case float_round_down: 6086 if (extractFloat128Sign(z)) { 6087 z.high |= (a.low != 0); 6088 z.high += roundBitsMask; 6089 } 6090 break; 6091 default: 6092 abort(); 6093 } 6094 z.high &= ~ roundBitsMask; 6095 } 6096 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 6097 status->float_exception_flags |= float_flag_inexact; 6098 } 6099 return z; 6100 6101 } 6102 6103 /*---------------------------------------------------------------------------- 6104 | Returns the result of adding the absolute values of the quadruple-precision 6105 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 6106 | before being returned. `zSign' is ignored if the result is a NaN. 6107 | The addition is performed according to the IEC/IEEE Standard for Binary 6108 | Floating-Point Arithmetic. 6109 *----------------------------------------------------------------------------*/ 6110 6111 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign, 6112 float_status *status) 6113 { 6114 int32_t aExp, bExp, zExp; 6115 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6116 int32_t expDiff; 6117 6118 aSig1 = extractFloat128Frac1( a ); 6119 aSig0 = extractFloat128Frac0( a ); 6120 aExp = extractFloat128Exp( a ); 6121 bSig1 = extractFloat128Frac1( b ); 6122 bSig0 = extractFloat128Frac0( b ); 6123 bExp = extractFloat128Exp( b ); 6124 expDiff = aExp - bExp; 6125 if ( 0 < expDiff ) { 6126 if ( aExp == 0x7FFF ) { 6127 if (aSig0 | aSig1) { 6128 return propagateFloat128NaN(a, b, status); 6129 } 6130 return a; 6131 } 6132 if ( bExp == 0 ) { 6133 --expDiff; 6134 } 6135 else { 6136 bSig0 |= LIT64( 0x0001000000000000 ); 6137 } 6138 shift128ExtraRightJamming( 6139 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 6140 zExp = aExp; 6141 } 6142 else if ( expDiff < 0 ) { 6143 if ( bExp == 0x7FFF ) { 6144 if (bSig0 | bSig1) { 6145 return propagateFloat128NaN(a, b, status); 6146 } 6147 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6148 } 6149 if ( aExp == 0 ) { 6150 ++expDiff; 6151 } 6152 else { 6153 aSig0 |= LIT64( 0x0001000000000000 ); 6154 } 6155 shift128ExtraRightJamming( 6156 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 6157 zExp = bExp; 6158 } 6159 else { 6160 if ( aExp == 0x7FFF ) { 6161 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6162 return propagateFloat128NaN(a, b, status); 6163 } 6164 return a; 6165 } 6166 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6167 if ( aExp == 0 ) { 6168 if (status->flush_to_zero) { 6169 if (zSig0 | zSig1) { 6170 float_raise(float_flag_output_denormal, status); 6171 } 6172 return packFloat128(zSign, 0, 0, 0); 6173 } 6174 return packFloat128( zSign, 0, zSig0, zSig1 ); 6175 } 6176 zSig2 = 0; 6177 zSig0 |= LIT64( 0x0002000000000000 ); 6178 zExp = aExp; 6179 goto shiftRight1; 6180 } 6181 aSig0 |= LIT64( 0x0001000000000000 ); 6182 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6183 --zExp; 6184 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 6185 ++zExp; 6186 shiftRight1: 6187 shift128ExtraRightJamming( 6188 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6189 roundAndPack: 6190 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6191 6192 } 6193 6194 /*---------------------------------------------------------------------------- 6195 | Returns the result of subtracting the absolute values of the quadruple- 6196 | precision floating-point values `a' and `b'. If `zSign' is 1, the 6197 | difference is negated before being returned. `zSign' is ignored if the 6198 | result is a NaN. The subtraction is performed according to the IEC/IEEE 6199 | Standard for Binary Floating-Point Arithmetic. 6200 *----------------------------------------------------------------------------*/ 6201 6202 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign, 6203 float_status *status) 6204 { 6205 int32_t aExp, bExp, zExp; 6206 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 6207 int32_t expDiff; 6208 6209 aSig1 = extractFloat128Frac1( a ); 6210 aSig0 = extractFloat128Frac0( a ); 6211 aExp = extractFloat128Exp( a ); 6212 bSig1 = extractFloat128Frac1( b ); 6213 bSig0 = extractFloat128Frac0( b ); 6214 bExp = extractFloat128Exp( b ); 6215 expDiff = aExp - bExp; 6216 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6217 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 6218 if ( 0 < expDiff ) goto aExpBigger; 6219 if ( expDiff < 0 ) goto bExpBigger; 6220 if ( aExp == 0x7FFF ) { 6221 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6222 return propagateFloat128NaN(a, b, status); 6223 } 6224 float_raise(float_flag_invalid, status); 6225 return float128_default_nan(status); 6226 } 6227 if ( aExp == 0 ) { 6228 aExp = 1; 6229 bExp = 1; 6230 } 6231 if ( bSig0 < aSig0 ) goto aBigger; 6232 if ( aSig0 < bSig0 ) goto bBigger; 6233 if ( bSig1 < aSig1 ) goto aBigger; 6234 if ( aSig1 < bSig1 ) goto bBigger; 6235 return packFloat128(status->float_rounding_mode == float_round_down, 6236 0, 0, 0); 6237 bExpBigger: 6238 if ( bExp == 0x7FFF ) { 6239 if (bSig0 | bSig1) { 6240 return propagateFloat128NaN(a, b, status); 6241 } 6242 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 6243 } 6244 if ( aExp == 0 ) { 6245 ++expDiff; 6246 } 6247 else { 6248 aSig0 |= LIT64( 0x4000000000000000 ); 6249 } 6250 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6251 bSig0 |= LIT64( 0x4000000000000000 ); 6252 bBigger: 6253 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6254 zExp = bExp; 6255 zSign ^= 1; 6256 goto normalizeRoundAndPack; 6257 aExpBigger: 6258 if ( aExp == 0x7FFF ) { 6259 if (aSig0 | aSig1) { 6260 return propagateFloat128NaN(a, b, status); 6261 } 6262 return a; 6263 } 6264 if ( bExp == 0 ) { 6265 --expDiff; 6266 } 6267 else { 6268 bSig0 |= LIT64( 0x4000000000000000 ); 6269 } 6270 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 6271 aSig0 |= LIT64( 0x4000000000000000 ); 6272 aBigger: 6273 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6274 zExp = aExp; 6275 normalizeRoundAndPack: 6276 --zExp; 6277 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, 6278 status); 6279 6280 } 6281 6282 /*---------------------------------------------------------------------------- 6283 | Returns the result of adding the quadruple-precision floating-point values 6284 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 6285 | for Binary Floating-Point Arithmetic. 6286 *----------------------------------------------------------------------------*/ 6287 6288 float128 float128_add(float128 a, float128 b, float_status *status) 6289 { 6290 flag aSign, bSign; 6291 6292 aSign = extractFloat128Sign( a ); 6293 bSign = extractFloat128Sign( b ); 6294 if ( aSign == bSign ) { 6295 return addFloat128Sigs(a, b, aSign, status); 6296 } 6297 else { 6298 return subFloat128Sigs(a, b, aSign, status); 6299 } 6300 6301 } 6302 6303 /*---------------------------------------------------------------------------- 6304 | Returns the result of subtracting the quadruple-precision floating-point 6305 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6306 | Standard for Binary Floating-Point Arithmetic. 6307 *----------------------------------------------------------------------------*/ 6308 6309 float128 float128_sub(float128 a, float128 b, float_status *status) 6310 { 6311 flag aSign, bSign; 6312 6313 aSign = extractFloat128Sign( a ); 6314 bSign = extractFloat128Sign( b ); 6315 if ( aSign == bSign ) { 6316 return subFloat128Sigs(a, b, aSign, status); 6317 } 6318 else { 6319 return addFloat128Sigs(a, b, aSign, status); 6320 } 6321 6322 } 6323 6324 /*---------------------------------------------------------------------------- 6325 | Returns the result of multiplying the quadruple-precision floating-point 6326 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6327 | Standard for Binary Floating-Point Arithmetic. 6328 *----------------------------------------------------------------------------*/ 6329 6330 float128 float128_mul(float128 a, float128 b, float_status *status) 6331 { 6332 flag aSign, bSign, zSign; 6333 int32_t aExp, bExp, zExp; 6334 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 6335 6336 aSig1 = extractFloat128Frac1( a ); 6337 aSig0 = extractFloat128Frac0( a ); 6338 aExp = extractFloat128Exp( a ); 6339 aSign = extractFloat128Sign( a ); 6340 bSig1 = extractFloat128Frac1( b ); 6341 bSig0 = extractFloat128Frac0( b ); 6342 bExp = extractFloat128Exp( b ); 6343 bSign = extractFloat128Sign( b ); 6344 zSign = aSign ^ bSign; 6345 if ( aExp == 0x7FFF ) { 6346 if ( ( aSig0 | aSig1 ) 6347 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6348 return propagateFloat128NaN(a, b, status); 6349 } 6350 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 6351 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6352 } 6353 if ( bExp == 0x7FFF ) { 6354 if (bSig0 | bSig1) { 6355 return propagateFloat128NaN(a, b, status); 6356 } 6357 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6358 invalid: 6359 float_raise(float_flag_invalid, status); 6360 return float128_default_nan(status); 6361 } 6362 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6363 } 6364 if ( aExp == 0 ) { 6365 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6366 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6367 } 6368 if ( bExp == 0 ) { 6369 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6370 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6371 } 6372 zExp = aExp + bExp - 0x4000; 6373 aSig0 |= LIT64( 0x0001000000000000 ); 6374 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 6375 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 6376 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6377 zSig2 |= ( zSig3 != 0 ); 6378 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 6379 shift128ExtraRightJamming( 6380 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6381 ++zExp; 6382 } 6383 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6384 6385 } 6386 6387 /*---------------------------------------------------------------------------- 6388 | Returns the result of dividing the quadruple-precision floating-point value 6389 | `a' by the corresponding value `b'. The operation is performed according to 6390 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6391 *----------------------------------------------------------------------------*/ 6392 6393 float128 float128_div(float128 a, float128 b, float_status *status) 6394 { 6395 flag aSign, bSign, zSign; 6396 int32_t aExp, bExp, zExp; 6397 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6398 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6399 6400 aSig1 = extractFloat128Frac1( a ); 6401 aSig0 = extractFloat128Frac0( a ); 6402 aExp = extractFloat128Exp( a ); 6403 aSign = extractFloat128Sign( a ); 6404 bSig1 = extractFloat128Frac1( b ); 6405 bSig0 = extractFloat128Frac0( b ); 6406 bExp = extractFloat128Exp( b ); 6407 bSign = extractFloat128Sign( b ); 6408 zSign = aSign ^ bSign; 6409 if ( aExp == 0x7FFF ) { 6410 if (aSig0 | aSig1) { 6411 return propagateFloat128NaN(a, b, status); 6412 } 6413 if ( bExp == 0x7FFF ) { 6414 if (bSig0 | bSig1) { 6415 return propagateFloat128NaN(a, b, status); 6416 } 6417 goto invalid; 6418 } 6419 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6420 } 6421 if ( bExp == 0x7FFF ) { 6422 if (bSig0 | bSig1) { 6423 return propagateFloat128NaN(a, b, status); 6424 } 6425 return packFloat128( zSign, 0, 0, 0 ); 6426 } 6427 if ( bExp == 0 ) { 6428 if ( ( bSig0 | bSig1 ) == 0 ) { 6429 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6430 invalid: 6431 float_raise(float_flag_invalid, status); 6432 return float128_default_nan(status); 6433 } 6434 float_raise(float_flag_divbyzero, status); 6435 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6436 } 6437 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6438 } 6439 if ( aExp == 0 ) { 6440 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6441 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6442 } 6443 zExp = aExp - bExp + 0x3FFD; 6444 shortShift128Left( 6445 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 6446 shortShift128Left( 6447 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6448 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 6449 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 6450 ++zExp; 6451 } 6452 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6453 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 6454 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 6455 while ( (int64_t) rem0 < 0 ) { 6456 --zSig0; 6457 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 6458 } 6459 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 6460 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 6461 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 6462 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 6463 while ( (int64_t) rem1 < 0 ) { 6464 --zSig1; 6465 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 6466 } 6467 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6468 } 6469 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 6470 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6471 6472 } 6473 6474 /*---------------------------------------------------------------------------- 6475 | Returns the remainder of the quadruple-precision floating-point value `a' 6476 | with respect to the corresponding value `b'. The operation is performed 6477 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6478 *----------------------------------------------------------------------------*/ 6479 6480 float128 float128_rem(float128 a, float128 b, float_status *status) 6481 { 6482 flag aSign, zSign; 6483 int32_t aExp, bExp, expDiff; 6484 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 6485 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 6486 int64_t sigMean0; 6487 6488 aSig1 = extractFloat128Frac1( a ); 6489 aSig0 = extractFloat128Frac0( a ); 6490 aExp = extractFloat128Exp( a ); 6491 aSign = extractFloat128Sign( a ); 6492 bSig1 = extractFloat128Frac1( b ); 6493 bSig0 = extractFloat128Frac0( b ); 6494 bExp = extractFloat128Exp( b ); 6495 if ( aExp == 0x7FFF ) { 6496 if ( ( aSig0 | aSig1 ) 6497 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6498 return propagateFloat128NaN(a, b, status); 6499 } 6500 goto invalid; 6501 } 6502 if ( bExp == 0x7FFF ) { 6503 if (bSig0 | bSig1) { 6504 return propagateFloat128NaN(a, b, status); 6505 } 6506 return a; 6507 } 6508 if ( bExp == 0 ) { 6509 if ( ( bSig0 | bSig1 ) == 0 ) { 6510 invalid: 6511 float_raise(float_flag_invalid, status); 6512 return float128_default_nan(status); 6513 } 6514 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6515 } 6516 if ( aExp == 0 ) { 6517 if ( ( aSig0 | aSig1 ) == 0 ) return a; 6518 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6519 } 6520 expDiff = aExp - bExp; 6521 if ( expDiff < -1 ) return a; 6522 shortShift128Left( 6523 aSig0 | LIT64( 0x0001000000000000 ), 6524 aSig1, 6525 15 - ( expDiff < 0 ), 6526 &aSig0, 6527 &aSig1 6528 ); 6529 shortShift128Left( 6530 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6531 q = le128( bSig0, bSig1, aSig0, aSig1 ); 6532 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6533 expDiff -= 64; 6534 while ( 0 < expDiff ) { 6535 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6536 q = ( 4 < q ) ? q - 4 : 0; 6537 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6538 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 6539 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 6540 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 6541 expDiff -= 61; 6542 } 6543 if ( -64 < expDiff ) { 6544 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6545 q = ( 4 < q ) ? q - 4 : 0; 6546 q >>= - expDiff; 6547 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6548 expDiff += 52; 6549 if ( expDiff < 0 ) { 6550 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6551 } 6552 else { 6553 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 6554 } 6555 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6556 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 6557 } 6558 else { 6559 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 6560 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6561 } 6562 do { 6563 alternateASig0 = aSig0; 6564 alternateASig1 = aSig1; 6565 ++q; 6566 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6567 } while ( 0 <= (int64_t) aSig0 ); 6568 add128( 6569 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 6570 if ( ( sigMean0 < 0 ) 6571 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 6572 aSig0 = alternateASig0; 6573 aSig1 = alternateASig1; 6574 } 6575 zSign = ( (int64_t) aSig0 < 0 ); 6576 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 6577 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 6578 status); 6579 } 6580 6581 /*---------------------------------------------------------------------------- 6582 | Returns the square root of the quadruple-precision floating-point value `a'. 6583 | The operation is performed according to the IEC/IEEE Standard for Binary 6584 | Floating-Point Arithmetic. 6585 *----------------------------------------------------------------------------*/ 6586 6587 float128 float128_sqrt(float128 a, float_status *status) 6588 { 6589 flag aSign; 6590 int32_t aExp, zExp; 6591 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 6592 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6593 6594 aSig1 = extractFloat128Frac1( a ); 6595 aSig0 = extractFloat128Frac0( a ); 6596 aExp = extractFloat128Exp( a ); 6597 aSign = extractFloat128Sign( a ); 6598 if ( aExp == 0x7FFF ) { 6599 if (aSig0 | aSig1) { 6600 return propagateFloat128NaN(a, a, status); 6601 } 6602 if ( ! aSign ) return a; 6603 goto invalid; 6604 } 6605 if ( aSign ) { 6606 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 6607 invalid: 6608 float_raise(float_flag_invalid, status); 6609 return float128_default_nan(status); 6610 } 6611 if ( aExp == 0 ) { 6612 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 6613 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6614 } 6615 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 6616 aSig0 |= LIT64( 0x0001000000000000 ); 6617 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 6618 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 6619 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6620 doubleZSig0 = zSig0<<1; 6621 mul64To128( zSig0, zSig0, &term0, &term1 ); 6622 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6623 while ( (int64_t) rem0 < 0 ) { 6624 --zSig0; 6625 doubleZSig0 -= 2; 6626 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6627 } 6628 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6629 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 6630 if ( zSig1 == 0 ) zSig1 = 1; 6631 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6632 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6633 mul64To128( zSig1, zSig1, &term2, &term3 ); 6634 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6635 while ( (int64_t) rem1 < 0 ) { 6636 --zSig1; 6637 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6638 term3 |= 1; 6639 term2 |= doubleZSig0; 6640 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6641 } 6642 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6643 } 6644 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 6645 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 6646 6647 } 6648 6649 /*---------------------------------------------------------------------------- 6650 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6651 | the corresponding value `b', and 0 otherwise. The invalid exception is 6652 | raised if either operand is a NaN. Otherwise, the comparison is performed 6653 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6654 *----------------------------------------------------------------------------*/ 6655 6656 int float128_eq(float128 a, float128 b, float_status *status) 6657 { 6658 6659 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6660 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6661 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6662 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6663 ) { 6664 float_raise(float_flag_invalid, status); 6665 return 0; 6666 } 6667 return 6668 ( a.low == b.low ) 6669 && ( ( a.high == b.high ) 6670 || ( ( a.low == 0 ) 6671 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6672 ); 6673 6674 } 6675 6676 /*---------------------------------------------------------------------------- 6677 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6678 | or equal to the corresponding value `b', and 0 otherwise. The invalid 6679 | exception is raised if either operand is a NaN. The comparison is performed 6680 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6681 *----------------------------------------------------------------------------*/ 6682 6683 int float128_le(float128 a, float128 b, float_status *status) 6684 { 6685 flag aSign, bSign; 6686 6687 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6688 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6689 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6690 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6691 ) { 6692 float_raise(float_flag_invalid, status); 6693 return 0; 6694 } 6695 aSign = extractFloat128Sign( a ); 6696 bSign = extractFloat128Sign( b ); 6697 if ( aSign != bSign ) { 6698 return 6699 aSign 6700 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6701 == 0 ); 6702 } 6703 return 6704 aSign ? le128( b.high, b.low, a.high, a.low ) 6705 : le128( a.high, a.low, b.high, b.low ); 6706 6707 } 6708 6709 /*---------------------------------------------------------------------------- 6710 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6711 | the corresponding value `b', and 0 otherwise. The invalid exception is 6712 | raised if either operand is a NaN. The comparison is performed according 6713 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6714 *----------------------------------------------------------------------------*/ 6715 6716 int float128_lt(float128 a, float128 b, float_status *status) 6717 { 6718 flag aSign, bSign; 6719 6720 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6721 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6722 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6723 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6724 ) { 6725 float_raise(float_flag_invalid, status); 6726 return 0; 6727 } 6728 aSign = extractFloat128Sign( a ); 6729 bSign = extractFloat128Sign( b ); 6730 if ( aSign != bSign ) { 6731 return 6732 aSign 6733 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6734 != 0 ); 6735 } 6736 return 6737 aSign ? lt128( b.high, b.low, a.high, a.low ) 6738 : lt128( a.high, a.low, b.high, b.low ); 6739 6740 } 6741 6742 /*---------------------------------------------------------------------------- 6743 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6744 | be compared, and 0 otherwise. The invalid exception is raised if either 6745 | operand is a NaN. The comparison is performed according to the IEC/IEEE 6746 | Standard for Binary Floating-Point Arithmetic. 6747 *----------------------------------------------------------------------------*/ 6748 6749 int float128_unordered(float128 a, float128 b, float_status *status) 6750 { 6751 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6752 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6753 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6754 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6755 ) { 6756 float_raise(float_flag_invalid, status); 6757 return 1; 6758 } 6759 return 0; 6760 } 6761 6762 /*---------------------------------------------------------------------------- 6763 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6764 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6765 | exception. The comparison is performed according to the IEC/IEEE Standard 6766 | for Binary Floating-Point Arithmetic. 6767 *----------------------------------------------------------------------------*/ 6768 6769 int float128_eq_quiet(float128 a, float128 b, float_status *status) 6770 { 6771 6772 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6773 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6774 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6775 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6776 ) { 6777 if (float128_is_signaling_nan(a, status) 6778 || float128_is_signaling_nan(b, status)) { 6779 float_raise(float_flag_invalid, status); 6780 } 6781 return 0; 6782 } 6783 return 6784 ( a.low == b.low ) 6785 && ( ( a.high == b.high ) 6786 || ( ( a.low == 0 ) 6787 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6788 ); 6789 6790 } 6791 6792 /*---------------------------------------------------------------------------- 6793 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6794 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 6795 | cause an exception. Otherwise, the comparison is performed according to the 6796 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6797 *----------------------------------------------------------------------------*/ 6798 6799 int float128_le_quiet(float128 a, float128 b, float_status *status) 6800 { 6801 flag aSign, bSign; 6802 6803 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6804 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6805 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6806 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6807 ) { 6808 if (float128_is_signaling_nan(a, status) 6809 || float128_is_signaling_nan(b, status)) { 6810 float_raise(float_flag_invalid, status); 6811 } 6812 return 0; 6813 } 6814 aSign = extractFloat128Sign( a ); 6815 bSign = extractFloat128Sign( b ); 6816 if ( aSign != bSign ) { 6817 return 6818 aSign 6819 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6820 == 0 ); 6821 } 6822 return 6823 aSign ? le128( b.high, b.low, a.high, a.low ) 6824 : le128( a.high, a.low, b.high, b.low ); 6825 6826 } 6827 6828 /*---------------------------------------------------------------------------- 6829 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6830 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6831 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 6832 | Standard for Binary Floating-Point Arithmetic. 6833 *----------------------------------------------------------------------------*/ 6834 6835 int float128_lt_quiet(float128 a, float128 b, float_status *status) 6836 { 6837 flag aSign, bSign; 6838 6839 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6840 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6841 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6842 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6843 ) { 6844 if (float128_is_signaling_nan(a, status) 6845 || float128_is_signaling_nan(b, status)) { 6846 float_raise(float_flag_invalid, status); 6847 } 6848 return 0; 6849 } 6850 aSign = extractFloat128Sign( a ); 6851 bSign = extractFloat128Sign( b ); 6852 if ( aSign != bSign ) { 6853 return 6854 aSign 6855 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6856 != 0 ); 6857 } 6858 return 6859 aSign ? lt128( b.high, b.low, a.high, a.low ) 6860 : lt128( a.high, a.low, b.high, b.low ); 6861 6862 } 6863 6864 /*---------------------------------------------------------------------------- 6865 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6866 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 6867 | comparison is performed according to the IEC/IEEE Standard for Binary 6868 | Floating-Point Arithmetic. 6869 *----------------------------------------------------------------------------*/ 6870 6871 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 6872 { 6873 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6874 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6875 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6876 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6877 ) { 6878 if (float128_is_signaling_nan(a, status) 6879 || float128_is_signaling_nan(b, status)) { 6880 float_raise(float_flag_invalid, status); 6881 } 6882 return 1; 6883 } 6884 return 0; 6885 } 6886 6887 #define COMPARE(s, nan_exp) \ 6888 static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\ 6889 int is_quiet, float_status *status) \ 6890 { \ 6891 flag aSign, bSign; \ 6892 uint ## s ## _t av, bv; \ 6893 a = float ## s ## _squash_input_denormal(a, status); \ 6894 b = float ## s ## _squash_input_denormal(b, status); \ 6895 \ 6896 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \ 6897 extractFloat ## s ## Frac( a ) ) || \ 6898 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \ 6899 extractFloat ## s ## Frac( b ) )) { \ 6900 if (!is_quiet || \ 6901 float ## s ## _is_signaling_nan(a, status) || \ 6902 float ## s ## _is_signaling_nan(b, status)) { \ 6903 float_raise(float_flag_invalid, status); \ 6904 } \ 6905 return float_relation_unordered; \ 6906 } \ 6907 aSign = extractFloat ## s ## Sign( a ); \ 6908 bSign = extractFloat ## s ## Sign( b ); \ 6909 av = float ## s ## _val(a); \ 6910 bv = float ## s ## _val(b); \ 6911 if ( aSign != bSign ) { \ 6912 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \ 6913 /* zero case */ \ 6914 return float_relation_equal; \ 6915 } else { \ 6916 return 1 - (2 * aSign); \ 6917 } \ 6918 } else { \ 6919 if (av == bv) { \ 6920 return float_relation_equal; \ 6921 } else { \ 6922 return 1 - 2 * (aSign ^ ( av < bv )); \ 6923 } \ 6924 } \ 6925 } \ 6926 \ 6927 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \ 6928 { \ 6929 return float ## s ## _compare_internal(a, b, 0, status); \ 6930 } \ 6931 \ 6932 int float ## s ## _compare_quiet(float ## s a, float ## s b, \ 6933 float_status *status) \ 6934 { \ 6935 return float ## s ## _compare_internal(a, b, 1, status); \ 6936 } 6937 6938 COMPARE(32, 0xff) 6939 COMPARE(64, 0x7ff) 6940 6941 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 6942 int is_quiet, float_status *status) 6943 { 6944 flag aSign, bSign; 6945 6946 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6947 float_raise(float_flag_invalid, status); 6948 return float_relation_unordered; 6949 } 6950 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 6951 ( extractFloatx80Frac( a )<<1 ) ) || 6952 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 6953 ( extractFloatx80Frac( b )<<1 ) )) { 6954 if (!is_quiet || 6955 floatx80_is_signaling_nan(a, status) || 6956 floatx80_is_signaling_nan(b, status)) { 6957 float_raise(float_flag_invalid, status); 6958 } 6959 return float_relation_unordered; 6960 } 6961 aSign = extractFloatx80Sign( a ); 6962 bSign = extractFloatx80Sign( b ); 6963 if ( aSign != bSign ) { 6964 6965 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 6966 ( ( a.low | b.low ) == 0 ) ) { 6967 /* zero case */ 6968 return float_relation_equal; 6969 } else { 6970 return 1 - (2 * aSign); 6971 } 6972 } else { 6973 if (a.low == b.low && a.high == b.high) { 6974 return float_relation_equal; 6975 } else { 6976 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 6977 } 6978 } 6979 } 6980 6981 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 6982 { 6983 return floatx80_compare_internal(a, b, 0, status); 6984 } 6985 6986 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 6987 { 6988 return floatx80_compare_internal(a, b, 1, status); 6989 } 6990 6991 static inline int float128_compare_internal(float128 a, float128 b, 6992 int is_quiet, float_status *status) 6993 { 6994 flag aSign, bSign; 6995 6996 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 6997 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 6998 ( ( extractFloat128Exp( b ) == 0x7fff ) && 6999 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7000 if (!is_quiet || 7001 float128_is_signaling_nan(a, status) || 7002 float128_is_signaling_nan(b, status)) { 7003 float_raise(float_flag_invalid, status); 7004 } 7005 return float_relation_unordered; 7006 } 7007 aSign = extractFloat128Sign( a ); 7008 bSign = extractFloat128Sign( b ); 7009 if ( aSign != bSign ) { 7010 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7011 /* zero case */ 7012 return float_relation_equal; 7013 } else { 7014 return 1 - (2 * aSign); 7015 } 7016 } else { 7017 if (a.low == b.low && a.high == b.high) { 7018 return float_relation_equal; 7019 } else { 7020 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7021 } 7022 } 7023 } 7024 7025 int float128_compare(float128 a, float128 b, float_status *status) 7026 { 7027 return float128_compare_internal(a, b, 0, status); 7028 } 7029 7030 int float128_compare_quiet(float128 a, float128 b, float_status *status) 7031 { 7032 return float128_compare_internal(a, b, 1, status); 7033 } 7034 7035 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7036 { 7037 flag aSign; 7038 int32_t aExp; 7039 uint64_t aSig; 7040 7041 if (floatx80_invalid_encoding(a)) { 7042 float_raise(float_flag_invalid, status); 7043 return floatx80_default_nan(status); 7044 } 7045 aSig = extractFloatx80Frac( a ); 7046 aExp = extractFloatx80Exp( a ); 7047 aSign = extractFloatx80Sign( a ); 7048 7049 if ( aExp == 0x7FFF ) { 7050 if ( aSig<<1 ) { 7051 return propagateFloatx80NaN(a, a, status); 7052 } 7053 return a; 7054 } 7055 7056 if (aExp == 0) { 7057 if (aSig == 0) { 7058 return a; 7059 } 7060 aExp++; 7061 } 7062 7063 if (n > 0x10000) { 7064 n = 0x10000; 7065 } else if (n < -0x10000) { 7066 n = -0x10000; 7067 } 7068 7069 aExp += n; 7070 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7071 aSign, aExp, aSig, 0, status); 7072 } 7073 7074 float128 float128_scalbn(float128 a, int n, float_status *status) 7075 { 7076 flag aSign; 7077 int32_t aExp; 7078 uint64_t aSig0, aSig1; 7079 7080 aSig1 = extractFloat128Frac1( a ); 7081 aSig0 = extractFloat128Frac0( a ); 7082 aExp = extractFloat128Exp( a ); 7083 aSign = extractFloat128Sign( a ); 7084 if ( aExp == 0x7FFF ) { 7085 if ( aSig0 | aSig1 ) { 7086 return propagateFloat128NaN(a, a, status); 7087 } 7088 return a; 7089 } 7090 if (aExp != 0) { 7091 aSig0 |= LIT64( 0x0001000000000000 ); 7092 } else if (aSig0 == 0 && aSig1 == 0) { 7093 return a; 7094 } else { 7095 aExp++; 7096 } 7097 7098 if (n > 0x10000) { 7099 n = 0x10000; 7100 } else if (n < -0x10000) { 7101 n = -0x10000; 7102 } 7103 7104 aExp += n - 1; 7105 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7106 , status); 7107 7108 } 7109