1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include "qemu/bitops.h" 87 #include "fpu/softfloat.h" 88 89 /* We only need stdlib for abort() */ 90 91 /*---------------------------------------------------------------------------- 92 | Primitive arithmetic functions, including multi-word arithmetic, and 93 | division and square root approximations. (Can be specialized to target if 94 | desired.) 95 *----------------------------------------------------------------------------*/ 96 #include "softfloat-macros.h" 97 98 /*---------------------------------------------------------------------------- 99 | Functions and definitions to determine: (1) whether tininess for underflow 100 | is detected before or after rounding by default, (2) what (if anything) 101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 103 | are propagated from function inputs to output. These details are target- 104 | specific. 105 *----------------------------------------------------------------------------*/ 106 #include "softfloat-specialize.h" 107 108 /*---------------------------------------------------------------------------- 109 | Returns the fraction bits of the half-precision floating-point value `a'. 110 *----------------------------------------------------------------------------*/ 111 112 static inline uint32_t extractFloat16Frac(float16 a) 113 { 114 return float16_val(a) & 0x3ff; 115 } 116 117 /*---------------------------------------------------------------------------- 118 | Returns the exponent bits of the half-precision floating-point value `a'. 119 *----------------------------------------------------------------------------*/ 120 121 static inline int extractFloat16Exp(float16 a) 122 { 123 return (float16_val(a) >> 10) & 0x1f; 124 } 125 126 /*---------------------------------------------------------------------------- 127 | Returns the sign bit of the single-precision floating-point value `a'. 128 *----------------------------------------------------------------------------*/ 129 130 static inline flag extractFloat16Sign(float16 a) 131 { 132 return float16_val(a)>>15; 133 } 134 135 /*---------------------------------------------------------------------------- 136 | Returns the fraction bits of the single-precision floating-point value `a'. 137 *----------------------------------------------------------------------------*/ 138 139 static inline uint32_t extractFloat32Frac(float32 a) 140 { 141 return float32_val(a) & 0x007FFFFF; 142 } 143 144 /*---------------------------------------------------------------------------- 145 | Returns the exponent bits of the single-precision floating-point value `a'. 146 *----------------------------------------------------------------------------*/ 147 148 static inline int extractFloat32Exp(float32 a) 149 { 150 return (float32_val(a) >> 23) & 0xFF; 151 } 152 153 /*---------------------------------------------------------------------------- 154 | Returns the sign bit of the single-precision floating-point value `a'. 155 *----------------------------------------------------------------------------*/ 156 157 static inline flag extractFloat32Sign(float32 a) 158 { 159 return float32_val(a) >> 31; 160 } 161 162 /*---------------------------------------------------------------------------- 163 | Returns the fraction bits of the double-precision floating-point value `a'. 164 *----------------------------------------------------------------------------*/ 165 166 static inline uint64_t extractFloat64Frac(float64 a) 167 { 168 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF); 169 } 170 171 /*---------------------------------------------------------------------------- 172 | Returns the exponent bits of the double-precision floating-point value `a'. 173 *----------------------------------------------------------------------------*/ 174 175 static inline int extractFloat64Exp(float64 a) 176 { 177 return (float64_val(a) >> 52) & 0x7FF; 178 } 179 180 /*---------------------------------------------------------------------------- 181 | Returns the sign bit of the double-precision floating-point value `a'. 182 *----------------------------------------------------------------------------*/ 183 184 static inline flag extractFloat64Sign(float64 a) 185 { 186 return float64_val(a) >> 63; 187 } 188 189 /* 190 * Classify a floating point number. Everything above float_class_qnan 191 * is a NaN so cls >= float_class_qnan is any NaN. 192 */ 193 194 typedef enum __attribute__ ((__packed__)) { 195 float_class_unclassified, 196 float_class_zero, 197 float_class_normal, 198 float_class_inf, 199 float_class_qnan, /* all NaNs from here */ 200 float_class_snan, 201 float_class_dnan, 202 float_class_msnan, /* maybe silenced */ 203 } FloatClass; 204 205 /* 206 * Structure holding all of the decomposed parts of a float. The 207 * exponent is unbiased and the fraction is normalized. All 208 * calculations are done with a 64 bit fraction and then rounded as 209 * appropriate for the final format. 210 * 211 * Thanks to the packed FloatClass a decent compiler should be able to 212 * fit the whole structure into registers and avoid using the stack 213 * for parameter passing. 214 */ 215 216 typedef struct { 217 uint64_t frac; 218 int32_t exp; 219 FloatClass cls; 220 bool sign; 221 } FloatParts; 222 223 #define DECOMPOSED_BINARY_POINT (64 - 2) 224 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) 225 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1) 226 227 /* Structure holding all of the relevant parameters for a format. 228 * exp_size: the size of the exponent field 229 * exp_bias: the offset applied to the exponent field 230 * exp_max: the maximum normalised exponent 231 * frac_size: the size of the fraction field 232 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 233 * The following are computed based the size of fraction 234 * frac_lsb: least significant bit of fraction 235 * fram_lsbm1: the bit bellow the least significant bit (for rounding) 236 * round_mask/roundeven_mask: masks used for rounding 237 */ 238 typedef struct { 239 int exp_size; 240 int exp_bias; 241 int exp_max; 242 int frac_size; 243 int frac_shift; 244 uint64_t frac_lsb; 245 uint64_t frac_lsbm1; 246 uint64_t round_mask; 247 uint64_t roundeven_mask; 248 } FloatFmt; 249 250 /* Expand fields based on the size of exponent and fraction */ 251 #define FLOAT_PARAMS(E, F) \ 252 .exp_size = E, \ 253 .exp_bias = ((1 << E) - 1) >> 1, \ 254 .exp_max = (1 << E) - 1, \ 255 .frac_size = F, \ 256 .frac_shift = DECOMPOSED_BINARY_POINT - F, \ 257 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \ 258 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \ 259 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \ 260 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1 261 262 static const FloatFmt float16_params = { 263 FLOAT_PARAMS(5, 10) 264 }; 265 266 static const FloatFmt float32_params = { 267 FLOAT_PARAMS(8, 23) 268 }; 269 270 static const FloatFmt float64_params = { 271 FLOAT_PARAMS(11, 52) 272 }; 273 274 /* Unpack a float to parts, but do not canonicalize. */ 275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) 276 { 277 const int sign_pos = fmt.frac_size + fmt.exp_size; 278 279 return (FloatParts) { 280 .cls = float_class_unclassified, 281 .sign = extract64(raw, sign_pos, 1), 282 .exp = extract64(raw, fmt.frac_size, fmt.exp_size), 283 .frac = extract64(raw, 0, fmt.frac_size), 284 }; 285 } 286 287 static inline FloatParts float16_unpack_raw(float16 f) 288 { 289 return unpack_raw(float16_params, f); 290 } 291 292 static inline FloatParts float32_unpack_raw(float32 f) 293 { 294 return unpack_raw(float32_params, f); 295 } 296 297 static inline FloatParts float64_unpack_raw(float64 f) 298 { 299 return unpack_raw(float64_params, f); 300 } 301 302 /* Pack a float from parts, but do not canonicalize. */ 303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p) 304 { 305 const int sign_pos = fmt.frac_size + fmt.exp_size; 306 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp); 307 return deposit64(ret, sign_pos, 1, p.sign); 308 } 309 310 static inline float16 float16_pack_raw(FloatParts p) 311 { 312 return make_float16(pack_raw(float16_params, p)); 313 } 314 315 static inline float32 float32_pack_raw(FloatParts p) 316 { 317 return make_float32(pack_raw(float32_params, p)); 318 } 319 320 static inline float64 float64_pack_raw(FloatParts p) 321 { 322 return make_float64(pack_raw(float64_params, p)); 323 } 324 325 /* Canonicalize EXP and FRAC, setting CLS. */ 326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm, 327 float_status *status) 328 { 329 if (part.exp == parm->exp_max) { 330 if (part.frac == 0) { 331 part.cls = float_class_inf; 332 } else { 333 #ifdef NO_SIGNALING_NANS 334 part.cls = float_class_qnan; 335 #else 336 int64_t msb = part.frac << (parm->frac_shift + 2); 337 if ((msb < 0) == status->snan_bit_is_one) { 338 part.cls = float_class_snan; 339 } else { 340 part.cls = float_class_qnan; 341 } 342 #endif 343 } 344 } else if (part.exp == 0) { 345 if (likely(part.frac == 0)) { 346 part.cls = float_class_zero; 347 } else if (status->flush_inputs_to_zero) { 348 float_raise(float_flag_input_denormal, status); 349 part.cls = float_class_zero; 350 part.frac = 0; 351 } else { 352 int shift = clz64(part.frac) - 1; 353 part.cls = float_class_normal; 354 part.exp = parm->frac_shift - parm->exp_bias - shift + 1; 355 part.frac <<= shift; 356 } 357 } else { 358 part.cls = float_class_normal; 359 part.exp -= parm->exp_bias; 360 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift); 361 } 362 return part; 363 } 364 365 /* Round and uncanonicalize a floating-point number by parts. There 366 * are FRAC_SHIFT bits that may require rounding at the bottom of the 367 * fraction; these bits will be removed. The exponent will be biased 368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0]. 369 */ 370 371 static FloatParts round_canonical(FloatParts p, float_status *s, 372 const FloatFmt *parm) 373 { 374 const uint64_t frac_lsbm1 = parm->frac_lsbm1; 375 const uint64_t round_mask = parm->round_mask; 376 const uint64_t roundeven_mask = parm->roundeven_mask; 377 const int exp_max = parm->exp_max; 378 const int frac_shift = parm->frac_shift; 379 uint64_t frac, inc; 380 int exp, flags = 0; 381 bool overflow_norm; 382 383 frac = p.frac; 384 exp = p.exp; 385 386 switch (p.cls) { 387 case float_class_normal: 388 switch (s->float_rounding_mode) { 389 case float_round_nearest_even: 390 overflow_norm = false; 391 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 392 break; 393 case float_round_ties_away: 394 overflow_norm = false; 395 inc = frac_lsbm1; 396 break; 397 case float_round_to_zero: 398 overflow_norm = true; 399 inc = 0; 400 break; 401 case float_round_up: 402 inc = p.sign ? 0 : round_mask; 403 overflow_norm = p.sign; 404 break; 405 case float_round_down: 406 inc = p.sign ? round_mask : 0; 407 overflow_norm = !p.sign; 408 break; 409 default: 410 g_assert_not_reached(); 411 } 412 413 exp += parm->exp_bias; 414 if (likely(exp > 0)) { 415 if (frac & round_mask) { 416 flags |= float_flag_inexact; 417 frac += inc; 418 if (frac & DECOMPOSED_OVERFLOW_BIT) { 419 frac >>= 1; 420 exp++; 421 } 422 } 423 frac >>= frac_shift; 424 425 if (unlikely(exp >= exp_max)) { 426 flags |= float_flag_overflow | float_flag_inexact; 427 if (overflow_norm) { 428 exp = exp_max - 1; 429 frac = -1; 430 } else { 431 p.cls = float_class_inf; 432 goto do_inf; 433 } 434 } 435 } else if (s->flush_to_zero) { 436 flags |= float_flag_output_denormal; 437 p.cls = float_class_zero; 438 goto do_zero; 439 } else { 440 bool is_tiny = (s->float_detect_tininess 441 == float_tininess_before_rounding) 442 || (exp < 0) 443 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT); 444 445 shift64RightJamming(frac, 1 - exp, &frac); 446 if (frac & round_mask) { 447 /* Need to recompute round-to-even. */ 448 if (s->float_rounding_mode == float_round_nearest_even) { 449 inc = ((frac & roundeven_mask) != frac_lsbm1 450 ? frac_lsbm1 : 0); 451 } 452 flags |= float_flag_inexact; 453 frac += inc; 454 } 455 456 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0); 457 frac >>= frac_shift; 458 459 if (is_tiny && (flags & float_flag_inexact)) { 460 flags |= float_flag_underflow; 461 } 462 if (exp == 0 && frac == 0) { 463 p.cls = float_class_zero; 464 } 465 } 466 break; 467 468 case float_class_zero: 469 do_zero: 470 exp = 0; 471 frac = 0; 472 break; 473 474 case float_class_inf: 475 do_inf: 476 exp = exp_max; 477 frac = 0; 478 break; 479 480 case float_class_qnan: 481 case float_class_snan: 482 exp = exp_max; 483 break; 484 485 default: 486 g_assert_not_reached(); 487 } 488 489 float_raise(flags, s); 490 p.exp = exp; 491 p.frac = frac; 492 return p; 493 } 494 495 static FloatParts float16_unpack_canonical(float16 f, float_status *s) 496 { 497 return canonicalize(float16_unpack_raw(f), &float16_params, s); 498 } 499 500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s) 501 { 502 switch (p.cls) { 503 case float_class_dnan: 504 return float16_default_nan(s); 505 case float_class_msnan: 506 return float16_maybe_silence_nan(float16_pack_raw(p), s); 507 default: 508 p = round_canonical(p, s, &float16_params); 509 return float16_pack_raw(p); 510 } 511 } 512 513 static FloatParts float32_unpack_canonical(float32 f, float_status *s) 514 { 515 return canonicalize(float32_unpack_raw(f), &float32_params, s); 516 } 517 518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s) 519 { 520 switch (p.cls) { 521 case float_class_dnan: 522 return float32_default_nan(s); 523 case float_class_msnan: 524 return float32_maybe_silence_nan(float32_pack_raw(p), s); 525 default: 526 p = round_canonical(p, s, &float32_params); 527 return float32_pack_raw(p); 528 } 529 } 530 531 static FloatParts float64_unpack_canonical(float64 f, float_status *s) 532 { 533 return canonicalize(float64_unpack_raw(f), &float64_params, s); 534 } 535 536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s) 537 { 538 switch (p.cls) { 539 case float_class_dnan: 540 return float64_default_nan(s); 541 case float_class_msnan: 542 return float64_maybe_silence_nan(float64_pack_raw(p), s); 543 default: 544 p = round_canonical(p, s, &float64_params); 545 return float64_pack_raw(p); 546 } 547 } 548 549 /* Simple helpers for checking if what NaN we have */ 550 static bool is_nan(FloatClass c) 551 { 552 return unlikely(c >= float_class_qnan); 553 } 554 static bool is_snan(FloatClass c) 555 { 556 return c == float_class_snan; 557 } 558 static bool is_qnan(FloatClass c) 559 { 560 return c == float_class_qnan; 561 } 562 563 static FloatParts return_nan(FloatParts a, float_status *s) 564 { 565 switch (a.cls) { 566 case float_class_snan: 567 s->float_exception_flags |= float_flag_invalid; 568 a.cls = float_class_msnan; 569 /* fall through */ 570 case float_class_qnan: 571 if (s->default_nan_mode) { 572 a.cls = float_class_dnan; 573 } 574 break; 575 576 default: 577 g_assert_not_reached(); 578 } 579 return a; 580 } 581 582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s) 583 { 584 if (is_snan(a.cls) || is_snan(b.cls)) { 585 s->float_exception_flags |= float_flag_invalid; 586 } 587 588 if (s->default_nan_mode) { 589 a.cls = float_class_dnan; 590 } else { 591 if (pickNaN(is_qnan(a.cls), is_snan(a.cls), 592 is_qnan(b.cls), is_snan(b.cls), 593 a.frac > b.frac || 594 (a.frac == b.frac && a.sign < b.sign))) { 595 a = b; 596 } 597 a.cls = float_class_msnan; 598 } 599 return a; 600 } 601 602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c, 603 bool inf_zero, float_status *s) 604 { 605 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) { 606 s->float_exception_flags |= float_flag_invalid; 607 } 608 609 if (s->default_nan_mode) { 610 a.cls = float_class_dnan; 611 } else { 612 switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls), 613 is_qnan(b.cls), is_snan(b.cls), 614 is_qnan(c.cls), is_snan(c.cls), 615 inf_zero, s)) { 616 case 0: 617 break; 618 case 1: 619 a = b; 620 break; 621 case 2: 622 a = c; 623 break; 624 case 3: 625 a.cls = float_class_dnan; 626 return a; 627 default: 628 g_assert_not_reached(); 629 } 630 631 a.cls = float_class_msnan; 632 } 633 return a; 634 } 635 636 /* 637 * Returns the result of adding or subtracting the values of the 638 * floating-point values `a' and `b'. The operation is performed 639 * according to the IEC/IEEE Standard for Binary Floating-Point 640 * Arithmetic. 641 */ 642 643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract, 644 float_status *s) 645 { 646 bool a_sign = a.sign; 647 bool b_sign = b.sign ^ subtract; 648 649 if (a_sign != b_sign) { 650 /* Subtraction */ 651 652 if (a.cls == float_class_normal && b.cls == float_class_normal) { 653 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) { 654 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 655 a.frac = a.frac - b.frac; 656 } else { 657 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 658 a.frac = b.frac - a.frac; 659 a.exp = b.exp; 660 a_sign ^= 1; 661 } 662 663 if (a.frac == 0) { 664 a.cls = float_class_zero; 665 a.sign = s->float_rounding_mode == float_round_down; 666 } else { 667 int shift = clz64(a.frac) - 1; 668 a.frac = a.frac << shift; 669 a.exp = a.exp - shift; 670 a.sign = a_sign; 671 } 672 return a; 673 } 674 if (is_nan(a.cls) || is_nan(b.cls)) { 675 return pick_nan(a, b, s); 676 } 677 if (a.cls == float_class_inf) { 678 if (b.cls == float_class_inf) { 679 float_raise(float_flag_invalid, s); 680 a.cls = float_class_dnan; 681 } 682 return a; 683 } 684 if (a.cls == float_class_zero && b.cls == float_class_zero) { 685 a.sign = s->float_rounding_mode == float_round_down; 686 return a; 687 } 688 if (a.cls == float_class_zero || b.cls == float_class_inf) { 689 b.sign = a_sign ^ 1; 690 return b; 691 } 692 if (b.cls == float_class_zero) { 693 return a; 694 } 695 } else { 696 /* Addition */ 697 if (a.cls == float_class_normal && b.cls == float_class_normal) { 698 if (a.exp > b.exp) { 699 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 700 } else if (a.exp < b.exp) { 701 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 702 a.exp = b.exp; 703 } 704 a.frac += b.frac; 705 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 706 a.frac >>= 1; 707 a.exp += 1; 708 } 709 return a; 710 } 711 if (is_nan(a.cls) || is_nan(b.cls)) { 712 return pick_nan(a, b, s); 713 } 714 if (a.cls == float_class_inf || b.cls == float_class_zero) { 715 return a; 716 } 717 if (b.cls == float_class_inf || a.cls == float_class_zero) { 718 b.sign = b_sign; 719 return b; 720 } 721 } 722 g_assert_not_reached(); 723 } 724 725 /* 726 * Returns the result of adding or subtracting the floating-point 727 * values `a' and `b'. The operation is performed according to the 728 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 729 */ 730 731 float16 __attribute__((flatten)) float16_add(float16 a, float16 b, 732 float_status *status) 733 { 734 FloatParts pa = float16_unpack_canonical(a, status); 735 FloatParts pb = float16_unpack_canonical(b, status); 736 FloatParts pr = addsub_floats(pa, pb, false, status); 737 738 return float16_round_pack_canonical(pr, status); 739 } 740 741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b, 742 float_status *status) 743 { 744 FloatParts pa = float32_unpack_canonical(a, status); 745 FloatParts pb = float32_unpack_canonical(b, status); 746 FloatParts pr = addsub_floats(pa, pb, false, status); 747 748 return float32_round_pack_canonical(pr, status); 749 } 750 751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b, 752 float_status *status) 753 { 754 FloatParts pa = float64_unpack_canonical(a, status); 755 FloatParts pb = float64_unpack_canonical(b, status); 756 FloatParts pr = addsub_floats(pa, pb, false, status); 757 758 return float64_round_pack_canonical(pr, status); 759 } 760 761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b, 762 float_status *status) 763 { 764 FloatParts pa = float16_unpack_canonical(a, status); 765 FloatParts pb = float16_unpack_canonical(b, status); 766 FloatParts pr = addsub_floats(pa, pb, true, status); 767 768 return float16_round_pack_canonical(pr, status); 769 } 770 771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b, 772 float_status *status) 773 { 774 FloatParts pa = float32_unpack_canonical(a, status); 775 FloatParts pb = float32_unpack_canonical(b, status); 776 FloatParts pr = addsub_floats(pa, pb, true, status); 777 778 return float32_round_pack_canonical(pr, status); 779 } 780 781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b, 782 float_status *status) 783 { 784 FloatParts pa = float64_unpack_canonical(a, status); 785 FloatParts pb = float64_unpack_canonical(b, status); 786 FloatParts pr = addsub_floats(pa, pb, true, status); 787 788 return float64_round_pack_canonical(pr, status); 789 } 790 791 /* 792 * Returns the result of multiplying the floating-point values `a' and 793 * `b'. The operation is performed according to the IEC/IEEE Standard 794 * for Binary Floating-Point Arithmetic. 795 */ 796 797 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s) 798 { 799 bool sign = a.sign ^ b.sign; 800 801 if (a.cls == float_class_normal && b.cls == float_class_normal) { 802 uint64_t hi, lo; 803 int exp = a.exp + b.exp; 804 805 mul64To128(a.frac, b.frac, &hi, &lo); 806 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 807 if (lo & DECOMPOSED_OVERFLOW_BIT) { 808 shift64RightJamming(lo, 1, &lo); 809 exp += 1; 810 } 811 812 /* Re-use a */ 813 a.exp = exp; 814 a.sign = sign; 815 a.frac = lo; 816 return a; 817 } 818 /* handle all the NaN cases */ 819 if (is_nan(a.cls) || is_nan(b.cls)) { 820 return pick_nan(a, b, s); 821 } 822 /* Inf * Zero == NaN */ 823 if ((a.cls == float_class_inf && b.cls == float_class_zero) || 824 (a.cls == float_class_zero && b.cls == float_class_inf)) { 825 s->float_exception_flags |= float_flag_invalid; 826 a.cls = float_class_dnan; 827 a.sign = sign; 828 return a; 829 } 830 /* Multiply by 0 or Inf */ 831 if (a.cls == float_class_inf || a.cls == float_class_zero) { 832 a.sign = sign; 833 return a; 834 } 835 if (b.cls == float_class_inf || b.cls == float_class_zero) { 836 b.sign = sign; 837 return b; 838 } 839 g_assert_not_reached(); 840 } 841 842 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b, 843 float_status *status) 844 { 845 FloatParts pa = float16_unpack_canonical(a, status); 846 FloatParts pb = float16_unpack_canonical(b, status); 847 FloatParts pr = mul_floats(pa, pb, status); 848 849 return float16_round_pack_canonical(pr, status); 850 } 851 852 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b, 853 float_status *status) 854 { 855 FloatParts pa = float32_unpack_canonical(a, status); 856 FloatParts pb = float32_unpack_canonical(b, status); 857 FloatParts pr = mul_floats(pa, pb, status); 858 859 return float32_round_pack_canonical(pr, status); 860 } 861 862 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b, 863 float_status *status) 864 { 865 FloatParts pa = float64_unpack_canonical(a, status); 866 FloatParts pb = float64_unpack_canonical(b, status); 867 FloatParts pr = mul_floats(pa, pb, status); 868 869 return float64_round_pack_canonical(pr, status); 870 } 871 872 /* 873 * Returns the result of multiplying the floating-point values `a' and 874 * `b' then adding 'c', with no intermediate rounding step after the 875 * multiplication. The operation is performed according to the 876 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008. 877 * The flags argument allows the caller to select negation of the 878 * addend, the intermediate product, or the final result. (The 879 * difference between this and having the caller do a separate 880 * negation is that negating externally will flip the sign bit on 881 * NaNs.) 882 */ 883 884 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c, 885 int flags, float_status *s) 886 { 887 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) == 888 ((1 << float_class_inf) | (1 << float_class_zero)); 889 bool p_sign; 890 bool sign_flip = flags & float_muladd_negate_result; 891 FloatClass p_class; 892 uint64_t hi, lo; 893 int p_exp; 894 895 /* It is implementation-defined whether the cases of (0,inf,qnan) 896 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 897 * they return if they do), so we have to hand this information 898 * off to the target-specific pick-a-NaN routine. 899 */ 900 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) { 901 return pick_nan_muladd(a, b, c, inf_zero, s); 902 } 903 904 if (inf_zero) { 905 s->float_exception_flags |= float_flag_invalid; 906 a.cls = float_class_dnan; 907 return a; 908 } 909 910 if (flags & float_muladd_negate_c) { 911 c.sign ^= 1; 912 } 913 914 p_sign = a.sign ^ b.sign; 915 916 if (flags & float_muladd_negate_product) { 917 p_sign ^= 1; 918 } 919 920 if (a.cls == float_class_inf || b.cls == float_class_inf) { 921 p_class = float_class_inf; 922 } else if (a.cls == float_class_zero || b.cls == float_class_zero) { 923 p_class = float_class_zero; 924 } else { 925 p_class = float_class_normal; 926 } 927 928 if (c.cls == float_class_inf) { 929 if (p_class == float_class_inf && p_sign != c.sign) { 930 s->float_exception_flags |= float_flag_invalid; 931 a.cls = float_class_dnan; 932 } else { 933 a.cls = float_class_inf; 934 a.sign = c.sign ^ sign_flip; 935 } 936 return a; 937 } 938 939 if (p_class == float_class_inf) { 940 a.cls = float_class_inf; 941 a.sign = p_sign ^ sign_flip; 942 return a; 943 } 944 945 if (p_class == float_class_zero) { 946 if (c.cls == float_class_zero) { 947 if (p_sign != c.sign) { 948 p_sign = s->float_rounding_mode == float_round_down; 949 } 950 c.sign = p_sign; 951 } else if (flags & float_muladd_halve_result) { 952 c.exp -= 1; 953 } 954 c.sign ^= sign_flip; 955 return c; 956 } 957 958 /* a & b should be normals now... */ 959 assert(a.cls == float_class_normal && 960 b.cls == float_class_normal); 961 962 p_exp = a.exp + b.exp; 963 964 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit 965 * result. 966 */ 967 mul64To128(a.frac, b.frac, &hi, &lo); 968 /* binary point now at bit 124 */ 969 970 /* check for overflow */ 971 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) { 972 shift128RightJamming(hi, lo, 1, &hi, &lo); 973 p_exp += 1; 974 } 975 976 /* + add/sub */ 977 if (c.cls == float_class_zero) { 978 /* move binary point back to 62 */ 979 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 980 } else { 981 int exp_diff = p_exp - c.exp; 982 if (p_sign == c.sign) { 983 /* Addition */ 984 if (exp_diff <= 0) { 985 shift128RightJamming(hi, lo, 986 DECOMPOSED_BINARY_POINT - exp_diff, 987 &hi, &lo); 988 lo += c.frac; 989 p_exp = c.exp; 990 } else { 991 uint64_t c_hi, c_lo; 992 /* shift c to the same binary point as the product (124) */ 993 c_hi = c.frac >> 2; 994 c_lo = 0; 995 shift128RightJamming(c_hi, c_lo, 996 exp_diff, 997 &c_hi, &c_lo); 998 add128(hi, lo, c_hi, c_lo, &hi, &lo); 999 /* move binary point back to 62 */ 1000 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1001 } 1002 1003 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1004 shift64RightJamming(lo, 1, &lo); 1005 p_exp += 1; 1006 } 1007 1008 } else { 1009 /* Subtraction */ 1010 uint64_t c_hi, c_lo; 1011 /* make C binary point match product at bit 124 */ 1012 c_hi = c.frac >> 2; 1013 c_lo = 0; 1014 1015 if (exp_diff <= 0) { 1016 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo); 1017 if (exp_diff == 0 1018 && 1019 (hi > c_hi || (hi == c_hi && lo >= c_lo))) { 1020 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1021 } else { 1022 sub128(c_hi, c_lo, hi, lo, &hi, &lo); 1023 p_sign ^= 1; 1024 p_exp = c.exp; 1025 } 1026 } else { 1027 shift128RightJamming(c_hi, c_lo, 1028 exp_diff, 1029 &c_hi, &c_lo); 1030 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1031 } 1032 1033 if (hi == 0 && lo == 0) { 1034 a.cls = float_class_zero; 1035 a.sign = s->float_rounding_mode == float_round_down; 1036 a.sign ^= sign_flip; 1037 return a; 1038 } else { 1039 int shift; 1040 if (hi != 0) { 1041 shift = clz64(hi); 1042 } else { 1043 shift = clz64(lo) + 64; 1044 } 1045 /* Normalizing to a binary point of 124 is the 1046 correct adjust for the exponent. However since we're 1047 shifting, we might as well put the binary point back 1048 at 62 where we really want it. Therefore shift as 1049 if we're leaving 1 bit at the top of the word, but 1050 adjust the exponent as if we're leaving 3 bits. */ 1051 shift -= 1; 1052 if (shift >= 64) { 1053 lo = lo << (shift - 64); 1054 } else { 1055 hi = (hi << shift) | (lo >> (64 - shift)); 1056 lo = hi | ((lo << shift) != 0); 1057 } 1058 p_exp -= shift - 2; 1059 } 1060 } 1061 } 1062 1063 if (flags & float_muladd_halve_result) { 1064 p_exp -= 1; 1065 } 1066 1067 /* finally prepare our result */ 1068 a.cls = float_class_normal; 1069 a.sign = p_sign ^ sign_flip; 1070 a.exp = p_exp; 1071 a.frac = lo; 1072 1073 return a; 1074 } 1075 1076 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c, 1077 int flags, float_status *status) 1078 { 1079 FloatParts pa = float16_unpack_canonical(a, status); 1080 FloatParts pb = float16_unpack_canonical(b, status); 1081 FloatParts pc = float16_unpack_canonical(c, status); 1082 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1083 1084 return float16_round_pack_canonical(pr, status); 1085 } 1086 1087 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c, 1088 int flags, float_status *status) 1089 { 1090 FloatParts pa = float32_unpack_canonical(a, status); 1091 FloatParts pb = float32_unpack_canonical(b, status); 1092 FloatParts pc = float32_unpack_canonical(c, status); 1093 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1094 1095 return float32_round_pack_canonical(pr, status); 1096 } 1097 1098 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c, 1099 int flags, float_status *status) 1100 { 1101 FloatParts pa = float64_unpack_canonical(a, status); 1102 FloatParts pb = float64_unpack_canonical(b, status); 1103 FloatParts pc = float64_unpack_canonical(c, status); 1104 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1105 1106 return float64_round_pack_canonical(pr, status); 1107 } 1108 1109 /* 1110 * Returns the result of dividing the floating-point value `a' by the 1111 * corresponding value `b'. The operation is performed according to 1112 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1113 */ 1114 1115 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s) 1116 { 1117 bool sign = a.sign ^ b.sign; 1118 1119 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1120 uint64_t temp_lo, temp_hi; 1121 int exp = a.exp - b.exp; 1122 if (a.frac < b.frac) { 1123 exp -= 1; 1124 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, 1125 &temp_hi, &temp_lo); 1126 } else { 1127 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, 1128 &temp_hi, &temp_lo); 1129 } 1130 /* LSB of quot is set if inexact which roundandpack will use 1131 * to set flags. Yet again we re-use a for the result */ 1132 a.frac = div128To64(temp_lo, temp_hi, b.frac); 1133 a.sign = sign; 1134 a.exp = exp; 1135 return a; 1136 } 1137 /* handle all the NaN cases */ 1138 if (is_nan(a.cls) || is_nan(b.cls)) { 1139 return pick_nan(a, b, s); 1140 } 1141 /* 0/0 or Inf/Inf */ 1142 if (a.cls == b.cls 1143 && 1144 (a.cls == float_class_inf || a.cls == float_class_zero)) { 1145 s->float_exception_flags |= float_flag_invalid; 1146 a.cls = float_class_dnan; 1147 return a; 1148 } 1149 /* Div 0 => Inf */ 1150 if (b.cls == float_class_zero) { 1151 s->float_exception_flags |= float_flag_divbyzero; 1152 a.cls = float_class_inf; 1153 a.sign = sign; 1154 return a; 1155 } 1156 /* Inf / x or 0 / x */ 1157 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1158 a.sign = sign; 1159 return a; 1160 } 1161 /* Div by Inf */ 1162 if (b.cls == float_class_inf) { 1163 a.cls = float_class_zero; 1164 a.sign = sign; 1165 return a; 1166 } 1167 g_assert_not_reached(); 1168 } 1169 1170 float16 float16_div(float16 a, float16 b, float_status *status) 1171 { 1172 FloatParts pa = float16_unpack_canonical(a, status); 1173 FloatParts pb = float16_unpack_canonical(b, status); 1174 FloatParts pr = div_floats(pa, pb, status); 1175 1176 return float16_round_pack_canonical(pr, status); 1177 } 1178 1179 float32 float32_div(float32 a, float32 b, float_status *status) 1180 { 1181 FloatParts pa = float32_unpack_canonical(a, status); 1182 FloatParts pb = float32_unpack_canonical(b, status); 1183 FloatParts pr = div_floats(pa, pb, status); 1184 1185 return float32_round_pack_canonical(pr, status); 1186 } 1187 1188 float64 float64_div(float64 a, float64 b, float_status *status) 1189 { 1190 FloatParts pa = float64_unpack_canonical(a, status); 1191 FloatParts pb = float64_unpack_canonical(b, status); 1192 FloatParts pr = div_floats(pa, pb, status); 1193 1194 return float64_round_pack_canonical(pr, status); 1195 } 1196 1197 /* 1198 * Rounds the floating-point value `a' to an integer, and returns the 1199 * result as a floating-point value. The operation is performed 1200 * according to the IEC/IEEE Standard for Binary Floating-Point 1201 * Arithmetic. 1202 */ 1203 1204 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s) 1205 { 1206 if (is_nan(a.cls)) { 1207 return return_nan(a, s); 1208 } 1209 1210 switch (a.cls) { 1211 case float_class_zero: 1212 case float_class_inf: 1213 case float_class_qnan: 1214 /* already "integral" */ 1215 break; 1216 case float_class_normal: 1217 if (a.exp >= DECOMPOSED_BINARY_POINT) { 1218 /* already integral */ 1219 break; 1220 } 1221 if (a.exp < 0) { 1222 bool one; 1223 /* all fractional */ 1224 s->float_exception_flags |= float_flag_inexact; 1225 switch (rounding_mode) { 1226 case float_round_nearest_even: 1227 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT; 1228 break; 1229 case float_round_ties_away: 1230 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT; 1231 break; 1232 case float_round_to_zero: 1233 one = false; 1234 break; 1235 case float_round_up: 1236 one = !a.sign; 1237 break; 1238 case float_round_down: 1239 one = a.sign; 1240 break; 1241 default: 1242 g_assert_not_reached(); 1243 } 1244 1245 if (one) { 1246 a.frac = DECOMPOSED_IMPLICIT_BIT; 1247 a.exp = 0; 1248 } else { 1249 a.cls = float_class_zero; 1250 } 1251 } else { 1252 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp; 1253 uint64_t frac_lsbm1 = frac_lsb >> 1; 1254 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb; 1255 uint64_t rnd_mask = rnd_even_mask >> 1; 1256 uint64_t inc; 1257 1258 switch (rounding_mode) { 1259 case float_round_nearest_even: 1260 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 1261 break; 1262 case float_round_ties_away: 1263 inc = frac_lsbm1; 1264 break; 1265 case float_round_to_zero: 1266 inc = 0; 1267 break; 1268 case float_round_up: 1269 inc = a.sign ? 0 : rnd_mask; 1270 break; 1271 case float_round_down: 1272 inc = a.sign ? rnd_mask : 0; 1273 break; 1274 default: 1275 g_assert_not_reached(); 1276 } 1277 1278 if (a.frac & rnd_mask) { 1279 s->float_exception_flags |= float_flag_inexact; 1280 a.frac += inc; 1281 a.frac &= ~rnd_mask; 1282 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 1283 a.frac >>= 1; 1284 a.exp++; 1285 } 1286 } 1287 } 1288 break; 1289 default: 1290 g_assert_not_reached(); 1291 } 1292 return a; 1293 } 1294 1295 float16 float16_round_to_int(float16 a, float_status *s) 1296 { 1297 FloatParts pa = float16_unpack_canonical(a, s); 1298 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1299 return float16_round_pack_canonical(pr, s); 1300 } 1301 1302 float32 float32_round_to_int(float32 a, float_status *s) 1303 { 1304 FloatParts pa = float32_unpack_canonical(a, s); 1305 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1306 return float32_round_pack_canonical(pr, s); 1307 } 1308 1309 float64 float64_round_to_int(float64 a, float_status *s) 1310 { 1311 FloatParts pa = float64_unpack_canonical(a, s); 1312 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1313 return float64_round_pack_canonical(pr, s); 1314 } 1315 1316 float64 float64_trunc_to_int(float64 a, float_status *s) 1317 { 1318 FloatParts pa = float64_unpack_canonical(a, s); 1319 FloatParts pr = round_to_int(pa, float_round_to_zero, s); 1320 return float64_round_pack_canonical(pr, s); 1321 } 1322 1323 /* 1324 * Returns the result of converting the floating-point value `a' to 1325 * the two's complement integer format. The conversion is performed 1326 * according to the IEC/IEEE Standard for Binary Floating-Point 1327 * Arithmetic---which means in particular that the conversion is 1328 * rounded according to the current rounding mode. If `a' is a NaN, 1329 * the largest positive integer is returned. Otherwise, if the 1330 * conversion overflows, the largest integer with the same sign as `a' 1331 * is returned. 1332 */ 1333 1334 static int64_t round_to_int_and_pack(FloatParts in, int rmode, 1335 int64_t min, int64_t max, 1336 float_status *s) 1337 { 1338 uint64_t r; 1339 int orig_flags = get_float_exception_flags(s); 1340 FloatParts p = round_to_int(in, rmode, s); 1341 1342 switch (p.cls) { 1343 case float_class_snan: 1344 case float_class_qnan: 1345 return max; 1346 case float_class_inf: 1347 return p.sign ? min : max; 1348 case float_class_zero: 1349 return 0; 1350 case float_class_normal: 1351 if (p.exp < DECOMPOSED_BINARY_POINT) { 1352 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 1353 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 1354 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 1355 } else { 1356 r = UINT64_MAX; 1357 } 1358 if (p.sign) { 1359 if (r < -(uint64_t) min) { 1360 return -r; 1361 } else { 1362 s->float_exception_flags = orig_flags | float_flag_invalid; 1363 return min; 1364 } 1365 } else { 1366 if (r < max) { 1367 return r; 1368 } else { 1369 s->float_exception_flags = orig_flags | float_flag_invalid; 1370 return max; 1371 } 1372 } 1373 default: 1374 g_assert_not_reached(); 1375 } 1376 } 1377 1378 #define FLOAT_TO_INT(fsz, isz) \ 1379 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \ 1380 float_status *s) \ 1381 { \ 1382 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1383 return round_to_int_and_pack(p, s->float_rounding_mode, \ 1384 INT ## isz ## _MIN, INT ## isz ## _MAX,\ 1385 s); \ 1386 } \ 1387 \ 1388 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \ 1389 (float ## fsz a, float_status *s) \ 1390 { \ 1391 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1392 return round_to_int_and_pack(p, float_round_to_zero, \ 1393 INT ## isz ## _MIN, INT ## isz ## _MAX,\ 1394 s); \ 1395 } 1396 1397 FLOAT_TO_INT(16, 16) 1398 FLOAT_TO_INT(16, 32) 1399 FLOAT_TO_INT(16, 64) 1400 1401 FLOAT_TO_INT(32, 16) 1402 FLOAT_TO_INT(32, 32) 1403 FLOAT_TO_INT(32, 64) 1404 1405 FLOAT_TO_INT(64, 16) 1406 FLOAT_TO_INT(64, 32) 1407 FLOAT_TO_INT(64, 64) 1408 1409 #undef FLOAT_TO_INT 1410 1411 /* 1412 * Returns the result of converting the floating-point value `a' to 1413 * the unsigned integer format. The conversion is performed according 1414 * to the IEC/IEEE Standard for Binary Floating-Point 1415 * Arithmetic---which means in particular that the conversion is 1416 * rounded according to the current rounding mode. If `a' is a NaN, 1417 * the largest unsigned integer is returned. Otherwise, if the 1418 * conversion overflows, the largest unsigned integer is returned. If 1419 * the 'a' is negative, the result is rounded and zero is returned; 1420 * values that do not round to zero will raise the inexact exception 1421 * flag. 1422 */ 1423 1424 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max, 1425 float_status *s) 1426 { 1427 int orig_flags = get_float_exception_flags(s); 1428 FloatParts p = round_to_int(in, rmode, s); 1429 1430 switch (p.cls) { 1431 case float_class_snan: 1432 case float_class_qnan: 1433 s->float_exception_flags = orig_flags | float_flag_invalid; 1434 return max; 1435 case float_class_inf: 1436 return p.sign ? 0 : max; 1437 case float_class_zero: 1438 return 0; 1439 case float_class_normal: 1440 { 1441 uint64_t r; 1442 if (p.sign) { 1443 s->float_exception_flags = orig_flags | float_flag_invalid; 1444 return 0; 1445 } 1446 1447 if (p.exp < DECOMPOSED_BINARY_POINT) { 1448 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 1449 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 1450 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 1451 } else { 1452 s->float_exception_flags = orig_flags | float_flag_invalid; 1453 return max; 1454 } 1455 1456 /* For uint64 this will never trip, but if p.exp is too large 1457 * to shift a decomposed fraction we shall have exited via the 1458 * 3rd leg above. 1459 */ 1460 if (r > max) { 1461 s->float_exception_flags = orig_flags | float_flag_invalid; 1462 return max; 1463 } else { 1464 return r; 1465 } 1466 } 1467 default: 1468 g_assert_not_reached(); 1469 } 1470 } 1471 1472 #define FLOAT_TO_UINT(fsz, isz) \ 1473 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \ 1474 float_status *s) \ 1475 { \ 1476 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1477 return round_to_uint_and_pack(p, s->float_rounding_mode, \ 1478 UINT ## isz ## _MAX, s); \ 1479 } \ 1480 \ 1481 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \ 1482 (float ## fsz a, float_status *s) \ 1483 { \ 1484 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1485 return round_to_uint_and_pack(p, s->float_rounding_mode, \ 1486 UINT ## isz ## _MAX, s); \ 1487 } 1488 1489 FLOAT_TO_UINT(16, 16) 1490 FLOAT_TO_UINT(16, 32) 1491 FLOAT_TO_UINT(16, 64) 1492 1493 FLOAT_TO_UINT(32, 16) 1494 FLOAT_TO_UINT(32, 32) 1495 FLOAT_TO_UINT(32, 64) 1496 1497 FLOAT_TO_UINT(64, 16) 1498 FLOAT_TO_UINT(64, 32) 1499 FLOAT_TO_UINT(64, 64) 1500 1501 #undef FLOAT_TO_UINT 1502 1503 /* 1504 * Integer to float conversions 1505 * 1506 * Returns the result of converting the two's complement integer `a' 1507 * to the floating-point format. The conversion is performed according 1508 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1509 */ 1510 1511 static FloatParts int_to_float(int64_t a, float_status *status) 1512 { 1513 FloatParts r; 1514 if (a == 0) { 1515 r.cls = float_class_zero; 1516 r.sign = false; 1517 } else if (a == (1ULL << 63)) { 1518 r.cls = float_class_normal; 1519 r.sign = true; 1520 r.frac = DECOMPOSED_IMPLICIT_BIT; 1521 r.exp = 63; 1522 } else { 1523 uint64_t f; 1524 if (a < 0) { 1525 f = -a; 1526 r.sign = true; 1527 } else { 1528 f = a; 1529 r.sign = false; 1530 } 1531 int shift = clz64(f) - 1; 1532 r.cls = float_class_normal; 1533 r.exp = (DECOMPOSED_BINARY_POINT - shift); 1534 r.frac = f << shift; 1535 } 1536 1537 return r; 1538 } 1539 1540 float16 int64_to_float16(int64_t a, float_status *status) 1541 { 1542 FloatParts pa = int_to_float(a, status); 1543 return float16_round_pack_canonical(pa, status); 1544 } 1545 1546 float16 int32_to_float16(int32_t a, float_status *status) 1547 { 1548 return int64_to_float16(a, status); 1549 } 1550 1551 float16 int16_to_float16(int16_t a, float_status *status) 1552 { 1553 return int64_to_float16(a, status); 1554 } 1555 1556 float32 int64_to_float32(int64_t a, float_status *status) 1557 { 1558 FloatParts pa = int_to_float(a, status); 1559 return float32_round_pack_canonical(pa, status); 1560 } 1561 1562 float32 int32_to_float32(int32_t a, float_status *status) 1563 { 1564 return int64_to_float32(a, status); 1565 } 1566 1567 float32 int16_to_float32(int16_t a, float_status *status) 1568 { 1569 return int64_to_float32(a, status); 1570 } 1571 1572 float64 int64_to_float64(int64_t a, float_status *status) 1573 { 1574 FloatParts pa = int_to_float(a, status); 1575 return float64_round_pack_canonical(pa, status); 1576 } 1577 1578 float64 int32_to_float64(int32_t a, float_status *status) 1579 { 1580 return int64_to_float64(a, status); 1581 } 1582 1583 float64 int16_to_float64(int16_t a, float_status *status) 1584 { 1585 return int64_to_float64(a, status); 1586 } 1587 1588 1589 /* 1590 * Unsigned Integer to float conversions 1591 * 1592 * Returns the result of converting the unsigned integer `a' to the 1593 * floating-point format. The conversion is performed according to the 1594 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1595 */ 1596 1597 static FloatParts uint_to_float(uint64_t a, float_status *status) 1598 { 1599 FloatParts r = { .sign = false}; 1600 1601 if (a == 0) { 1602 r.cls = float_class_zero; 1603 } else { 1604 int spare_bits = clz64(a) - 1; 1605 r.cls = float_class_normal; 1606 r.exp = DECOMPOSED_BINARY_POINT - spare_bits; 1607 if (spare_bits < 0) { 1608 shift64RightJamming(a, -spare_bits, &a); 1609 r.frac = a; 1610 } else { 1611 r.frac = a << spare_bits; 1612 } 1613 } 1614 1615 return r; 1616 } 1617 1618 float16 uint64_to_float16(uint64_t a, float_status *status) 1619 { 1620 FloatParts pa = uint_to_float(a, status); 1621 return float16_round_pack_canonical(pa, status); 1622 } 1623 1624 float16 uint32_to_float16(uint32_t a, float_status *status) 1625 { 1626 return uint64_to_float16(a, status); 1627 } 1628 1629 float16 uint16_to_float16(uint16_t a, float_status *status) 1630 { 1631 return uint64_to_float16(a, status); 1632 } 1633 1634 float32 uint64_to_float32(uint64_t a, float_status *status) 1635 { 1636 FloatParts pa = uint_to_float(a, status); 1637 return float32_round_pack_canonical(pa, status); 1638 } 1639 1640 float32 uint32_to_float32(uint32_t a, float_status *status) 1641 { 1642 return uint64_to_float32(a, status); 1643 } 1644 1645 float32 uint16_to_float32(uint16_t a, float_status *status) 1646 { 1647 return uint64_to_float32(a, status); 1648 } 1649 1650 float64 uint64_to_float64(uint64_t a, float_status *status) 1651 { 1652 FloatParts pa = uint_to_float(a, status); 1653 return float64_round_pack_canonical(pa, status); 1654 } 1655 1656 float64 uint32_to_float64(uint32_t a, float_status *status) 1657 { 1658 return uint64_to_float64(a, status); 1659 } 1660 1661 float64 uint16_to_float64(uint16_t a, float_status *status) 1662 { 1663 return uint64_to_float64(a, status); 1664 } 1665 1666 /* Multiply A by 2 raised to the power N. */ 1667 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s) 1668 { 1669 if (unlikely(is_nan(a.cls))) { 1670 return return_nan(a, s); 1671 } 1672 if (a.cls == float_class_normal) { 1673 a.exp += n; 1674 } 1675 return a; 1676 } 1677 1678 float16 float16_scalbn(float16 a, int n, float_status *status) 1679 { 1680 FloatParts pa = float16_unpack_canonical(a, status); 1681 FloatParts pr = scalbn_decomposed(pa, n, status); 1682 return float16_round_pack_canonical(pr, status); 1683 } 1684 1685 float32 float32_scalbn(float32 a, int n, float_status *status) 1686 { 1687 FloatParts pa = float32_unpack_canonical(a, status); 1688 FloatParts pr = scalbn_decomposed(pa, n, status); 1689 return float32_round_pack_canonical(pr, status); 1690 } 1691 1692 float64 float64_scalbn(float64 a, int n, float_status *status) 1693 { 1694 FloatParts pa = float64_unpack_canonical(a, status); 1695 FloatParts pr = scalbn_decomposed(pa, n, status); 1696 return float64_round_pack_canonical(pr, status); 1697 } 1698 1699 /*---------------------------------------------------------------------------- 1700 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 1701 | and 7, and returns the properly rounded 32-bit integer corresponding to the 1702 | input. If `zSign' is 1, the input is negated before being converted to an 1703 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 1704 | is simply rounded to an integer, with the inexact exception raised if the 1705 | input cannot be represented exactly as an integer. However, if the fixed- 1706 | point input is too large, the invalid exception is raised and the largest 1707 | positive or negative integer is returned. 1708 *----------------------------------------------------------------------------*/ 1709 1710 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status) 1711 { 1712 int8_t roundingMode; 1713 flag roundNearestEven; 1714 int8_t roundIncrement, roundBits; 1715 int32_t z; 1716 1717 roundingMode = status->float_rounding_mode; 1718 roundNearestEven = ( roundingMode == float_round_nearest_even ); 1719 switch (roundingMode) { 1720 case float_round_nearest_even: 1721 case float_round_ties_away: 1722 roundIncrement = 0x40; 1723 break; 1724 case float_round_to_zero: 1725 roundIncrement = 0; 1726 break; 1727 case float_round_up: 1728 roundIncrement = zSign ? 0 : 0x7f; 1729 break; 1730 case float_round_down: 1731 roundIncrement = zSign ? 0x7f : 0; 1732 break; 1733 default: 1734 abort(); 1735 } 1736 roundBits = absZ & 0x7F; 1737 absZ = ( absZ + roundIncrement )>>7; 1738 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 1739 z = absZ; 1740 if ( zSign ) z = - z; 1741 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 1742 float_raise(float_flag_invalid, status); 1743 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 1744 } 1745 if (roundBits) { 1746 status->float_exception_flags |= float_flag_inexact; 1747 } 1748 return z; 1749 1750 } 1751 1752 /*---------------------------------------------------------------------------- 1753 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 1754 | `absZ1', with binary point between bits 63 and 64 (between the input words), 1755 | and returns the properly rounded 64-bit integer corresponding to the input. 1756 | If `zSign' is 1, the input is negated before being converted to an integer. 1757 | Ordinarily, the fixed-point input is simply rounded to an integer, with 1758 | the inexact exception raised if the input cannot be represented exactly as 1759 | an integer. However, if the fixed-point input is too large, the invalid 1760 | exception is raised and the largest positive or negative integer is 1761 | returned. 1762 *----------------------------------------------------------------------------*/ 1763 1764 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 1765 float_status *status) 1766 { 1767 int8_t roundingMode; 1768 flag roundNearestEven, increment; 1769 int64_t z; 1770 1771 roundingMode = status->float_rounding_mode; 1772 roundNearestEven = ( roundingMode == float_round_nearest_even ); 1773 switch (roundingMode) { 1774 case float_round_nearest_even: 1775 case float_round_ties_away: 1776 increment = ((int64_t) absZ1 < 0); 1777 break; 1778 case float_round_to_zero: 1779 increment = 0; 1780 break; 1781 case float_round_up: 1782 increment = !zSign && absZ1; 1783 break; 1784 case float_round_down: 1785 increment = zSign && absZ1; 1786 break; 1787 default: 1788 abort(); 1789 } 1790 if ( increment ) { 1791 ++absZ0; 1792 if ( absZ0 == 0 ) goto overflow; 1793 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 1794 } 1795 z = absZ0; 1796 if ( zSign ) z = - z; 1797 if ( z && ( ( z < 0 ) ^ zSign ) ) { 1798 overflow: 1799 float_raise(float_flag_invalid, status); 1800 return 1801 zSign ? (int64_t) LIT64( 0x8000000000000000 ) 1802 : LIT64( 0x7FFFFFFFFFFFFFFF ); 1803 } 1804 if (absZ1) { 1805 status->float_exception_flags |= float_flag_inexact; 1806 } 1807 return z; 1808 1809 } 1810 1811 /*---------------------------------------------------------------------------- 1812 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 1813 | `absZ1', with binary point between bits 63 and 64 (between the input words), 1814 | and returns the properly rounded 64-bit unsigned integer corresponding to the 1815 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 1816 | with the inexact exception raised if the input cannot be represented exactly 1817 | as an integer. However, if the fixed-point input is too large, the invalid 1818 | exception is raised and the largest unsigned integer is returned. 1819 *----------------------------------------------------------------------------*/ 1820 1821 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0, 1822 uint64_t absZ1, float_status *status) 1823 { 1824 int8_t roundingMode; 1825 flag roundNearestEven, increment; 1826 1827 roundingMode = status->float_rounding_mode; 1828 roundNearestEven = (roundingMode == float_round_nearest_even); 1829 switch (roundingMode) { 1830 case float_round_nearest_even: 1831 case float_round_ties_away: 1832 increment = ((int64_t)absZ1 < 0); 1833 break; 1834 case float_round_to_zero: 1835 increment = 0; 1836 break; 1837 case float_round_up: 1838 increment = !zSign && absZ1; 1839 break; 1840 case float_round_down: 1841 increment = zSign && absZ1; 1842 break; 1843 default: 1844 abort(); 1845 } 1846 if (increment) { 1847 ++absZ0; 1848 if (absZ0 == 0) { 1849 float_raise(float_flag_invalid, status); 1850 return LIT64(0xFFFFFFFFFFFFFFFF); 1851 } 1852 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 1853 } 1854 1855 if (zSign && absZ0) { 1856 float_raise(float_flag_invalid, status); 1857 return 0; 1858 } 1859 1860 if (absZ1) { 1861 status->float_exception_flags |= float_flag_inexact; 1862 } 1863 return absZ0; 1864 } 1865 1866 /*---------------------------------------------------------------------------- 1867 | If `a' is denormal and we are in flush-to-zero mode then set the 1868 | input-denormal exception and return zero. Otherwise just return the value. 1869 *----------------------------------------------------------------------------*/ 1870 float32 float32_squash_input_denormal(float32 a, float_status *status) 1871 { 1872 if (status->flush_inputs_to_zero) { 1873 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 1874 float_raise(float_flag_input_denormal, status); 1875 return make_float32(float32_val(a) & 0x80000000); 1876 } 1877 } 1878 return a; 1879 } 1880 1881 /*---------------------------------------------------------------------------- 1882 | Normalizes the subnormal single-precision floating-point value represented 1883 | by the denormalized significand `aSig'. The normalized exponent and 1884 | significand are stored at the locations pointed to by `zExpPtr' and 1885 | `zSigPtr', respectively. 1886 *----------------------------------------------------------------------------*/ 1887 1888 static void 1889 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 1890 { 1891 int8_t shiftCount; 1892 1893 shiftCount = countLeadingZeros32( aSig ) - 8; 1894 *zSigPtr = aSig<<shiftCount; 1895 *zExpPtr = 1 - shiftCount; 1896 1897 } 1898 1899 /*---------------------------------------------------------------------------- 1900 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 1901 | single-precision floating-point value, returning the result. After being 1902 | shifted into the proper positions, the three fields are simply added 1903 | together to form the result. This means that any integer portion of `zSig' 1904 | will be added into the exponent. Since a properly normalized significand 1905 | will have an integer portion equal to 1, the `zExp' input should be 1 less 1906 | than the desired result exponent whenever `zSig' is a complete, normalized 1907 | significand. 1908 *----------------------------------------------------------------------------*/ 1909 1910 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig) 1911 { 1912 1913 return make_float32( 1914 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig); 1915 1916 } 1917 1918 /*---------------------------------------------------------------------------- 1919 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1920 | and significand `zSig', and returns the proper single-precision floating- 1921 | point value corresponding to the abstract input. Ordinarily, the abstract 1922 | value is simply rounded and packed into the single-precision format, with 1923 | the inexact exception raised if the abstract input cannot be represented 1924 | exactly. However, if the abstract value is too large, the overflow and 1925 | inexact exceptions are raised and an infinity or maximal finite value is 1926 | returned. If the abstract value is too small, the input value is rounded to 1927 | a subnormal number, and the underflow and inexact exceptions are raised if 1928 | the abstract input cannot be represented exactly as a subnormal single- 1929 | precision floating-point number. 1930 | The input significand `zSig' has its binary point between bits 30 1931 | and 29, which is 7 bits to the left of the usual location. This shifted 1932 | significand must be normalized or smaller. If `zSig' is not normalized, 1933 | `zExp' must be 0; in that case, the result returned is a subnormal number, 1934 | and it must not require rounding. In the usual case that `zSig' is 1935 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 1936 | The handling of underflow and overflow follows the IEC/IEEE Standard for 1937 | Binary Floating-Point Arithmetic. 1938 *----------------------------------------------------------------------------*/ 1939 1940 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 1941 float_status *status) 1942 { 1943 int8_t roundingMode; 1944 flag roundNearestEven; 1945 int8_t roundIncrement, roundBits; 1946 flag isTiny; 1947 1948 roundingMode = status->float_rounding_mode; 1949 roundNearestEven = ( roundingMode == float_round_nearest_even ); 1950 switch (roundingMode) { 1951 case float_round_nearest_even: 1952 case float_round_ties_away: 1953 roundIncrement = 0x40; 1954 break; 1955 case float_round_to_zero: 1956 roundIncrement = 0; 1957 break; 1958 case float_round_up: 1959 roundIncrement = zSign ? 0 : 0x7f; 1960 break; 1961 case float_round_down: 1962 roundIncrement = zSign ? 0x7f : 0; 1963 break; 1964 default: 1965 abort(); 1966 break; 1967 } 1968 roundBits = zSig & 0x7F; 1969 if ( 0xFD <= (uint16_t) zExp ) { 1970 if ( ( 0xFD < zExp ) 1971 || ( ( zExp == 0xFD ) 1972 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 1973 ) { 1974 float_raise(float_flag_overflow | float_flag_inexact, status); 1975 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 1976 } 1977 if ( zExp < 0 ) { 1978 if (status->flush_to_zero) { 1979 float_raise(float_flag_output_denormal, status); 1980 return packFloat32(zSign, 0, 0); 1981 } 1982 isTiny = 1983 (status->float_detect_tininess 1984 == float_tininess_before_rounding) 1985 || ( zExp < -1 ) 1986 || ( zSig + roundIncrement < 0x80000000 ); 1987 shift32RightJamming( zSig, - zExp, &zSig ); 1988 zExp = 0; 1989 roundBits = zSig & 0x7F; 1990 if (isTiny && roundBits) { 1991 float_raise(float_flag_underflow, status); 1992 } 1993 } 1994 } 1995 if (roundBits) { 1996 status->float_exception_flags |= float_flag_inexact; 1997 } 1998 zSig = ( zSig + roundIncrement )>>7; 1999 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 2000 if ( zSig == 0 ) zExp = 0; 2001 return packFloat32( zSign, zExp, zSig ); 2002 2003 } 2004 2005 /*---------------------------------------------------------------------------- 2006 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2007 | and significand `zSig', and returns the proper single-precision floating- 2008 | point value corresponding to the abstract input. This routine is just like 2009 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 2010 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 2011 | floating-point exponent. 2012 *----------------------------------------------------------------------------*/ 2013 2014 static float32 2015 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 2016 float_status *status) 2017 { 2018 int8_t shiftCount; 2019 2020 shiftCount = countLeadingZeros32( zSig ) - 1; 2021 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 2022 status); 2023 2024 } 2025 2026 /*---------------------------------------------------------------------------- 2027 | If `a' is denormal and we are in flush-to-zero mode then set the 2028 | input-denormal exception and return zero. Otherwise just return the value. 2029 *----------------------------------------------------------------------------*/ 2030 float64 float64_squash_input_denormal(float64 a, float_status *status) 2031 { 2032 if (status->flush_inputs_to_zero) { 2033 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 2034 float_raise(float_flag_input_denormal, status); 2035 return make_float64(float64_val(a) & (1ULL << 63)); 2036 } 2037 } 2038 return a; 2039 } 2040 2041 /*---------------------------------------------------------------------------- 2042 | Normalizes the subnormal double-precision floating-point value represented 2043 | by the denormalized significand `aSig'. The normalized exponent and 2044 | significand are stored at the locations pointed to by `zExpPtr' and 2045 | `zSigPtr', respectively. 2046 *----------------------------------------------------------------------------*/ 2047 2048 static void 2049 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 2050 { 2051 int8_t shiftCount; 2052 2053 shiftCount = countLeadingZeros64( aSig ) - 11; 2054 *zSigPtr = aSig<<shiftCount; 2055 *zExpPtr = 1 - shiftCount; 2056 2057 } 2058 2059 /*---------------------------------------------------------------------------- 2060 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 2061 | double-precision floating-point value, returning the result. After being 2062 | shifted into the proper positions, the three fields are simply added 2063 | together to form the result. This means that any integer portion of `zSig' 2064 | will be added into the exponent. Since a properly normalized significand 2065 | will have an integer portion equal to 1, the `zExp' input should be 1 less 2066 | than the desired result exponent whenever `zSig' is a complete, normalized 2067 | significand. 2068 *----------------------------------------------------------------------------*/ 2069 2070 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 2071 { 2072 2073 return make_float64( 2074 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 2075 2076 } 2077 2078 /*---------------------------------------------------------------------------- 2079 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2080 | and significand `zSig', and returns the proper double-precision floating- 2081 | point value corresponding to the abstract input. Ordinarily, the abstract 2082 | value is simply rounded and packed into the double-precision format, with 2083 | the inexact exception raised if the abstract input cannot be represented 2084 | exactly. However, if the abstract value is too large, the overflow and 2085 | inexact exceptions are raised and an infinity or maximal finite value is 2086 | returned. If the abstract value is too small, the input value is rounded to 2087 | a subnormal number, and the underflow and inexact exceptions are raised if 2088 | the abstract input cannot be represented exactly as a subnormal double- 2089 | precision floating-point number. 2090 | The input significand `zSig' has its binary point between bits 62 2091 | and 61, which is 10 bits to the left of the usual location. This shifted 2092 | significand must be normalized or smaller. If `zSig' is not normalized, 2093 | `zExp' must be 0; in that case, the result returned is a subnormal number, 2094 | and it must not require rounding. In the usual case that `zSig' is 2095 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 2096 | The handling of underflow and overflow follows the IEC/IEEE Standard for 2097 | Binary Floating-Point Arithmetic. 2098 *----------------------------------------------------------------------------*/ 2099 2100 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 2101 float_status *status) 2102 { 2103 int8_t roundingMode; 2104 flag roundNearestEven; 2105 int roundIncrement, roundBits; 2106 flag isTiny; 2107 2108 roundingMode = status->float_rounding_mode; 2109 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2110 switch (roundingMode) { 2111 case float_round_nearest_even: 2112 case float_round_ties_away: 2113 roundIncrement = 0x200; 2114 break; 2115 case float_round_to_zero: 2116 roundIncrement = 0; 2117 break; 2118 case float_round_up: 2119 roundIncrement = zSign ? 0 : 0x3ff; 2120 break; 2121 case float_round_down: 2122 roundIncrement = zSign ? 0x3ff : 0; 2123 break; 2124 case float_round_to_odd: 2125 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 2126 break; 2127 default: 2128 abort(); 2129 } 2130 roundBits = zSig & 0x3FF; 2131 if ( 0x7FD <= (uint16_t) zExp ) { 2132 if ( ( 0x7FD < zExp ) 2133 || ( ( zExp == 0x7FD ) 2134 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 2135 ) { 2136 bool overflow_to_inf = roundingMode != float_round_to_odd && 2137 roundIncrement != 0; 2138 float_raise(float_flag_overflow | float_flag_inexact, status); 2139 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 2140 } 2141 if ( zExp < 0 ) { 2142 if (status->flush_to_zero) { 2143 float_raise(float_flag_output_denormal, status); 2144 return packFloat64(zSign, 0, 0); 2145 } 2146 isTiny = 2147 (status->float_detect_tininess 2148 == float_tininess_before_rounding) 2149 || ( zExp < -1 ) 2150 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 2151 shift64RightJamming( zSig, - zExp, &zSig ); 2152 zExp = 0; 2153 roundBits = zSig & 0x3FF; 2154 if (isTiny && roundBits) { 2155 float_raise(float_flag_underflow, status); 2156 } 2157 if (roundingMode == float_round_to_odd) { 2158 /* 2159 * For round-to-odd case, the roundIncrement depends on 2160 * zSig which just changed. 2161 */ 2162 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 2163 } 2164 } 2165 } 2166 if (roundBits) { 2167 status->float_exception_flags |= float_flag_inexact; 2168 } 2169 zSig = ( zSig + roundIncrement )>>10; 2170 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 2171 if ( zSig == 0 ) zExp = 0; 2172 return packFloat64( zSign, zExp, zSig ); 2173 2174 } 2175 2176 /*---------------------------------------------------------------------------- 2177 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2178 | and significand `zSig', and returns the proper double-precision floating- 2179 | point value corresponding to the abstract input. This routine is just like 2180 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 2181 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 2182 | floating-point exponent. 2183 *----------------------------------------------------------------------------*/ 2184 2185 static float64 2186 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 2187 float_status *status) 2188 { 2189 int8_t shiftCount; 2190 2191 shiftCount = countLeadingZeros64( zSig ) - 1; 2192 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 2193 status); 2194 2195 } 2196 2197 /*---------------------------------------------------------------------------- 2198 | Returns the fraction bits of the extended double-precision floating-point 2199 | value `a'. 2200 *----------------------------------------------------------------------------*/ 2201 2202 static inline uint64_t extractFloatx80Frac( floatx80 a ) 2203 { 2204 2205 return a.low; 2206 2207 } 2208 2209 /*---------------------------------------------------------------------------- 2210 | Returns the exponent bits of the extended double-precision floating-point 2211 | value `a'. 2212 *----------------------------------------------------------------------------*/ 2213 2214 static inline int32_t extractFloatx80Exp( floatx80 a ) 2215 { 2216 2217 return a.high & 0x7FFF; 2218 2219 } 2220 2221 /*---------------------------------------------------------------------------- 2222 | Returns the sign bit of the extended double-precision floating-point value 2223 | `a'. 2224 *----------------------------------------------------------------------------*/ 2225 2226 static inline flag extractFloatx80Sign( floatx80 a ) 2227 { 2228 2229 return a.high>>15; 2230 2231 } 2232 2233 /*---------------------------------------------------------------------------- 2234 | Normalizes the subnormal extended double-precision floating-point value 2235 | represented by the denormalized significand `aSig'. The normalized exponent 2236 | and significand are stored at the locations pointed to by `zExpPtr' and 2237 | `zSigPtr', respectively. 2238 *----------------------------------------------------------------------------*/ 2239 2240 static void 2241 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr ) 2242 { 2243 int8_t shiftCount; 2244 2245 shiftCount = countLeadingZeros64( aSig ); 2246 *zSigPtr = aSig<<shiftCount; 2247 *zExpPtr = 1 - shiftCount; 2248 2249 } 2250 2251 /*---------------------------------------------------------------------------- 2252 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 2253 | extended double-precision floating-point value, returning the result. 2254 *----------------------------------------------------------------------------*/ 2255 2256 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig ) 2257 { 2258 floatx80 z; 2259 2260 z.low = zSig; 2261 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp; 2262 return z; 2263 2264 } 2265 2266 /*---------------------------------------------------------------------------- 2267 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2268 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 2269 | and returns the proper extended double-precision floating-point value 2270 | corresponding to the abstract input. Ordinarily, the abstract value is 2271 | rounded and packed into the extended double-precision format, with the 2272 | inexact exception raised if the abstract input cannot be represented 2273 | exactly. However, if the abstract value is too large, the overflow and 2274 | inexact exceptions are raised and an infinity or maximal finite value is 2275 | returned. If the abstract value is too small, the input value is rounded to 2276 | a subnormal number, and the underflow and inexact exceptions are raised if 2277 | the abstract input cannot be represented exactly as a subnormal extended 2278 | double-precision floating-point number. 2279 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 2280 | number of bits as single or double precision, respectively. Otherwise, the 2281 | result is rounded to the full precision of the extended double-precision 2282 | format. 2283 | The input significand must be normalized or smaller. If the input 2284 | significand is not normalized, `zExp' must be 0; in that case, the result 2285 | returned is a subnormal number, and it must not require rounding. The 2286 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 2287 | Floating-Point Arithmetic. 2288 *----------------------------------------------------------------------------*/ 2289 2290 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign, 2291 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 2292 float_status *status) 2293 { 2294 int8_t roundingMode; 2295 flag roundNearestEven, increment, isTiny; 2296 int64_t roundIncrement, roundMask, roundBits; 2297 2298 roundingMode = status->float_rounding_mode; 2299 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2300 if ( roundingPrecision == 80 ) goto precision80; 2301 if ( roundingPrecision == 64 ) { 2302 roundIncrement = LIT64( 0x0000000000000400 ); 2303 roundMask = LIT64( 0x00000000000007FF ); 2304 } 2305 else if ( roundingPrecision == 32 ) { 2306 roundIncrement = LIT64( 0x0000008000000000 ); 2307 roundMask = LIT64( 0x000000FFFFFFFFFF ); 2308 } 2309 else { 2310 goto precision80; 2311 } 2312 zSig0 |= ( zSig1 != 0 ); 2313 switch (roundingMode) { 2314 case float_round_nearest_even: 2315 case float_round_ties_away: 2316 break; 2317 case float_round_to_zero: 2318 roundIncrement = 0; 2319 break; 2320 case float_round_up: 2321 roundIncrement = zSign ? 0 : roundMask; 2322 break; 2323 case float_round_down: 2324 roundIncrement = zSign ? roundMask : 0; 2325 break; 2326 default: 2327 abort(); 2328 } 2329 roundBits = zSig0 & roundMask; 2330 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 2331 if ( ( 0x7FFE < zExp ) 2332 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 2333 ) { 2334 goto overflow; 2335 } 2336 if ( zExp <= 0 ) { 2337 if (status->flush_to_zero) { 2338 float_raise(float_flag_output_denormal, status); 2339 return packFloatx80(zSign, 0, 0); 2340 } 2341 isTiny = 2342 (status->float_detect_tininess 2343 == float_tininess_before_rounding) 2344 || ( zExp < 0 ) 2345 || ( zSig0 <= zSig0 + roundIncrement ); 2346 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 2347 zExp = 0; 2348 roundBits = zSig0 & roundMask; 2349 if (isTiny && roundBits) { 2350 float_raise(float_flag_underflow, status); 2351 } 2352 if (roundBits) { 2353 status->float_exception_flags |= float_flag_inexact; 2354 } 2355 zSig0 += roundIncrement; 2356 if ( (int64_t) zSig0 < 0 ) zExp = 1; 2357 roundIncrement = roundMask + 1; 2358 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 2359 roundMask |= roundIncrement; 2360 } 2361 zSig0 &= ~ roundMask; 2362 return packFloatx80( zSign, zExp, zSig0 ); 2363 } 2364 } 2365 if (roundBits) { 2366 status->float_exception_flags |= float_flag_inexact; 2367 } 2368 zSig0 += roundIncrement; 2369 if ( zSig0 < roundIncrement ) { 2370 ++zExp; 2371 zSig0 = LIT64( 0x8000000000000000 ); 2372 } 2373 roundIncrement = roundMask + 1; 2374 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 2375 roundMask |= roundIncrement; 2376 } 2377 zSig0 &= ~ roundMask; 2378 if ( zSig0 == 0 ) zExp = 0; 2379 return packFloatx80( zSign, zExp, zSig0 ); 2380 precision80: 2381 switch (roundingMode) { 2382 case float_round_nearest_even: 2383 case float_round_ties_away: 2384 increment = ((int64_t)zSig1 < 0); 2385 break; 2386 case float_round_to_zero: 2387 increment = 0; 2388 break; 2389 case float_round_up: 2390 increment = !zSign && zSig1; 2391 break; 2392 case float_round_down: 2393 increment = zSign && zSig1; 2394 break; 2395 default: 2396 abort(); 2397 } 2398 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 2399 if ( ( 0x7FFE < zExp ) 2400 || ( ( zExp == 0x7FFE ) 2401 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 2402 && increment 2403 ) 2404 ) { 2405 roundMask = 0; 2406 overflow: 2407 float_raise(float_flag_overflow | float_flag_inexact, status); 2408 if ( ( roundingMode == float_round_to_zero ) 2409 || ( zSign && ( roundingMode == float_round_up ) ) 2410 || ( ! zSign && ( roundingMode == float_round_down ) ) 2411 ) { 2412 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 2413 } 2414 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2415 } 2416 if ( zExp <= 0 ) { 2417 isTiny = 2418 (status->float_detect_tininess 2419 == float_tininess_before_rounding) 2420 || ( zExp < 0 ) 2421 || ! increment 2422 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 2423 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 2424 zExp = 0; 2425 if (isTiny && zSig1) { 2426 float_raise(float_flag_underflow, status); 2427 } 2428 if (zSig1) { 2429 status->float_exception_flags |= float_flag_inexact; 2430 } 2431 switch (roundingMode) { 2432 case float_round_nearest_even: 2433 case float_round_ties_away: 2434 increment = ((int64_t)zSig1 < 0); 2435 break; 2436 case float_round_to_zero: 2437 increment = 0; 2438 break; 2439 case float_round_up: 2440 increment = !zSign && zSig1; 2441 break; 2442 case float_round_down: 2443 increment = zSign && zSig1; 2444 break; 2445 default: 2446 abort(); 2447 } 2448 if ( increment ) { 2449 ++zSig0; 2450 zSig0 &= 2451 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 2452 if ( (int64_t) zSig0 < 0 ) zExp = 1; 2453 } 2454 return packFloatx80( zSign, zExp, zSig0 ); 2455 } 2456 } 2457 if (zSig1) { 2458 status->float_exception_flags |= float_flag_inexact; 2459 } 2460 if ( increment ) { 2461 ++zSig0; 2462 if ( zSig0 == 0 ) { 2463 ++zExp; 2464 zSig0 = LIT64( 0x8000000000000000 ); 2465 } 2466 else { 2467 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 2468 } 2469 } 2470 else { 2471 if ( zSig0 == 0 ) zExp = 0; 2472 } 2473 return packFloatx80( zSign, zExp, zSig0 ); 2474 2475 } 2476 2477 /*---------------------------------------------------------------------------- 2478 | Takes an abstract floating-point value having sign `zSign', exponent 2479 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 2480 | and returns the proper extended double-precision floating-point value 2481 | corresponding to the abstract input. This routine is just like 2482 | `roundAndPackFloatx80' except that the input significand does not have to be 2483 | normalized. 2484 *----------------------------------------------------------------------------*/ 2485 2486 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 2487 flag zSign, int32_t zExp, 2488 uint64_t zSig0, uint64_t zSig1, 2489 float_status *status) 2490 { 2491 int8_t shiftCount; 2492 2493 if ( zSig0 == 0 ) { 2494 zSig0 = zSig1; 2495 zSig1 = 0; 2496 zExp -= 64; 2497 } 2498 shiftCount = countLeadingZeros64( zSig0 ); 2499 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 2500 zExp -= shiftCount; 2501 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 2502 zSig0, zSig1, status); 2503 2504 } 2505 2506 /*---------------------------------------------------------------------------- 2507 | Returns the least-significant 64 fraction bits of the quadruple-precision 2508 | floating-point value `a'. 2509 *----------------------------------------------------------------------------*/ 2510 2511 static inline uint64_t extractFloat128Frac1( float128 a ) 2512 { 2513 2514 return a.low; 2515 2516 } 2517 2518 /*---------------------------------------------------------------------------- 2519 | Returns the most-significant 48 fraction bits of the quadruple-precision 2520 | floating-point value `a'. 2521 *----------------------------------------------------------------------------*/ 2522 2523 static inline uint64_t extractFloat128Frac0( float128 a ) 2524 { 2525 2526 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 2527 2528 } 2529 2530 /*---------------------------------------------------------------------------- 2531 | Returns the exponent bits of the quadruple-precision floating-point value 2532 | `a'. 2533 *----------------------------------------------------------------------------*/ 2534 2535 static inline int32_t extractFloat128Exp( float128 a ) 2536 { 2537 2538 return ( a.high>>48 ) & 0x7FFF; 2539 2540 } 2541 2542 /*---------------------------------------------------------------------------- 2543 | Returns the sign bit of the quadruple-precision floating-point value `a'. 2544 *----------------------------------------------------------------------------*/ 2545 2546 static inline flag extractFloat128Sign( float128 a ) 2547 { 2548 2549 return a.high>>63; 2550 2551 } 2552 2553 /*---------------------------------------------------------------------------- 2554 | Normalizes the subnormal quadruple-precision floating-point value 2555 | represented by the denormalized significand formed by the concatenation of 2556 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 2557 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 2558 | significand are stored at the location pointed to by `zSig0Ptr', and the 2559 | least significant 64 bits of the normalized significand are stored at the 2560 | location pointed to by `zSig1Ptr'. 2561 *----------------------------------------------------------------------------*/ 2562 2563 static void 2564 normalizeFloat128Subnormal( 2565 uint64_t aSig0, 2566 uint64_t aSig1, 2567 int32_t *zExpPtr, 2568 uint64_t *zSig0Ptr, 2569 uint64_t *zSig1Ptr 2570 ) 2571 { 2572 int8_t shiftCount; 2573 2574 if ( aSig0 == 0 ) { 2575 shiftCount = countLeadingZeros64( aSig1 ) - 15; 2576 if ( shiftCount < 0 ) { 2577 *zSig0Ptr = aSig1>>( - shiftCount ); 2578 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 2579 } 2580 else { 2581 *zSig0Ptr = aSig1<<shiftCount; 2582 *zSig1Ptr = 0; 2583 } 2584 *zExpPtr = - shiftCount - 63; 2585 } 2586 else { 2587 shiftCount = countLeadingZeros64( aSig0 ) - 15; 2588 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 2589 *zExpPtr = 1 - shiftCount; 2590 } 2591 2592 } 2593 2594 /*---------------------------------------------------------------------------- 2595 | Packs the sign `zSign', the exponent `zExp', and the significand formed 2596 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 2597 | floating-point value, returning the result. After being shifted into the 2598 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 2599 | added together to form the most significant 32 bits of the result. This 2600 | means that any integer portion of `zSig0' will be added into the exponent. 2601 | Since a properly normalized significand will have an integer portion equal 2602 | to 1, the `zExp' input should be 1 less than the desired result exponent 2603 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 2604 | significand. 2605 *----------------------------------------------------------------------------*/ 2606 2607 static inline float128 2608 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 2609 { 2610 float128 z; 2611 2612 z.low = zSig1; 2613 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 2614 return z; 2615 2616 } 2617 2618 /*---------------------------------------------------------------------------- 2619 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2620 | and extended significand formed by the concatenation of `zSig0', `zSig1', 2621 | and `zSig2', and returns the proper quadruple-precision floating-point value 2622 | corresponding to the abstract input. Ordinarily, the abstract value is 2623 | simply rounded and packed into the quadruple-precision format, with the 2624 | inexact exception raised if the abstract input cannot be represented 2625 | exactly. However, if the abstract value is too large, the overflow and 2626 | inexact exceptions are raised and an infinity or maximal finite value is 2627 | returned. If the abstract value is too small, the input value is rounded to 2628 | a subnormal number, and the underflow and inexact exceptions are raised if 2629 | the abstract input cannot be represented exactly as a subnormal quadruple- 2630 | precision floating-point number. 2631 | The input significand must be normalized or smaller. If the input 2632 | significand is not normalized, `zExp' must be 0; in that case, the result 2633 | returned is a subnormal number, and it must not require rounding. In the 2634 | usual case that the input significand is normalized, `zExp' must be 1 less 2635 | than the ``true'' floating-point exponent. The handling of underflow and 2636 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2637 *----------------------------------------------------------------------------*/ 2638 2639 static float128 roundAndPackFloat128(flag zSign, int32_t zExp, 2640 uint64_t zSig0, uint64_t zSig1, 2641 uint64_t zSig2, float_status *status) 2642 { 2643 int8_t roundingMode; 2644 flag roundNearestEven, increment, isTiny; 2645 2646 roundingMode = status->float_rounding_mode; 2647 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2648 switch (roundingMode) { 2649 case float_round_nearest_even: 2650 case float_round_ties_away: 2651 increment = ((int64_t)zSig2 < 0); 2652 break; 2653 case float_round_to_zero: 2654 increment = 0; 2655 break; 2656 case float_round_up: 2657 increment = !zSign && zSig2; 2658 break; 2659 case float_round_down: 2660 increment = zSign && zSig2; 2661 break; 2662 case float_round_to_odd: 2663 increment = !(zSig1 & 0x1) && zSig2; 2664 break; 2665 default: 2666 abort(); 2667 } 2668 if ( 0x7FFD <= (uint32_t) zExp ) { 2669 if ( ( 0x7FFD < zExp ) 2670 || ( ( zExp == 0x7FFD ) 2671 && eq128( 2672 LIT64( 0x0001FFFFFFFFFFFF ), 2673 LIT64( 0xFFFFFFFFFFFFFFFF ), 2674 zSig0, 2675 zSig1 2676 ) 2677 && increment 2678 ) 2679 ) { 2680 float_raise(float_flag_overflow | float_flag_inexact, status); 2681 if ( ( roundingMode == float_round_to_zero ) 2682 || ( zSign && ( roundingMode == float_round_up ) ) 2683 || ( ! zSign && ( roundingMode == float_round_down ) ) 2684 || (roundingMode == float_round_to_odd) 2685 ) { 2686 return 2687 packFloat128( 2688 zSign, 2689 0x7FFE, 2690 LIT64( 0x0000FFFFFFFFFFFF ), 2691 LIT64( 0xFFFFFFFFFFFFFFFF ) 2692 ); 2693 } 2694 return packFloat128( zSign, 0x7FFF, 0, 0 ); 2695 } 2696 if ( zExp < 0 ) { 2697 if (status->flush_to_zero) { 2698 float_raise(float_flag_output_denormal, status); 2699 return packFloat128(zSign, 0, 0, 0); 2700 } 2701 isTiny = 2702 (status->float_detect_tininess 2703 == float_tininess_before_rounding) 2704 || ( zExp < -1 ) 2705 || ! increment 2706 || lt128( 2707 zSig0, 2708 zSig1, 2709 LIT64( 0x0001FFFFFFFFFFFF ), 2710 LIT64( 0xFFFFFFFFFFFFFFFF ) 2711 ); 2712 shift128ExtraRightJamming( 2713 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 2714 zExp = 0; 2715 if (isTiny && zSig2) { 2716 float_raise(float_flag_underflow, status); 2717 } 2718 switch (roundingMode) { 2719 case float_round_nearest_even: 2720 case float_round_ties_away: 2721 increment = ((int64_t)zSig2 < 0); 2722 break; 2723 case float_round_to_zero: 2724 increment = 0; 2725 break; 2726 case float_round_up: 2727 increment = !zSign && zSig2; 2728 break; 2729 case float_round_down: 2730 increment = zSign && zSig2; 2731 break; 2732 case float_round_to_odd: 2733 increment = !(zSig1 & 0x1) && zSig2; 2734 break; 2735 default: 2736 abort(); 2737 } 2738 } 2739 } 2740 if (zSig2) { 2741 status->float_exception_flags |= float_flag_inexact; 2742 } 2743 if ( increment ) { 2744 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 2745 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 2746 } 2747 else { 2748 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 2749 } 2750 return packFloat128( zSign, zExp, zSig0, zSig1 ); 2751 2752 } 2753 2754 /*---------------------------------------------------------------------------- 2755 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2756 | and significand formed by the concatenation of `zSig0' and `zSig1', and 2757 | returns the proper quadruple-precision floating-point value corresponding 2758 | to the abstract input. This routine is just like `roundAndPackFloat128' 2759 | except that the input significand has fewer bits and does not have to be 2760 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 2761 | point exponent. 2762 *----------------------------------------------------------------------------*/ 2763 2764 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 2765 uint64_t zSig0, uint64_t zSig1, 2766 float_status *status) 2767 { 2768 int8_t shiftCount; 2769 uint64_t zSig2; 2770 2771 if ( zSig0 == 0 ) { 2772 zSig0 = zSig1; 2773 zSig1 = 0; 2774 zExp -= 64; 2775 } 2776 shiftCount = countLeadingZeros64( zSig0 ) - 15; 2777 if ( 0 <= shiftCount ) { 2778 zSig2 = 0; 2779 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 2780 } 2781 else { 2782 shift128ExtraRightJamming( 2783 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 2784 } 2785 zExp -= shiftCount; 2786 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 2787 2788 } 2789 2790 2791 /*---------------------------------------------------------------------------- 2792 | Returns the result of converting the 32-bit two's complement integer `a' 2793 | to the extended double-precision floating-point format. The conversion 2794 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 2795 | Arithmetic. 2796 *----------------------------------------------------------------------------*/ 2797 2798 floatx80 int32_to_floatx80(int32_t a, float_status *status) 2799 { 2800 flag zSign; 2801 uint32_t absA; 2802 int8_t shiftCount; 2803 uint64_t zSig; 2804 2805 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 2806 zSign = ( a < 0 ); 2807 absA = zSign ? - a : a; 2808 shiftCount = countLeadingZeros32( absA ) + 32; 2809 zSig = absA; 2810 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 2811 2812 } 2813 2814 /*---------------------------------------------------------------------------- 2815 | Returns the result of converting the 32-bit two's complement integer `a' to 2816 | the quadruple-precision floating-point format. The conversion is performed 2817 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2818 *----------------------------------------------------------------------------*/ 2819 2820 float128 int32_to_float128(int32_t a, float_status *status) 2821 { 2822 flag zSign; 2823 uint32_t absA; 2824 int8_t shiftCount; 2825 uint64_t zSig0; 2826 2827 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 2828 zSign = ( a < 0 ); 2829 absA = zSign ? - a : a; 2830 shiftCount = countLeadingZeros32( absA ) + 17; 2831 zSig0 = absA; 2832 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 2833 2834 } 2835 2836 /*---------------------------------------------------------------------------- 2837 | Returns the result of converting the 64-bit two's complement integer `a' 2838 | to the extended double-precision floating-point format. The conversion 2839 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 2840 | Arithmetic. 2841 *----------------------------------------------------------------------------*/ 2842 2843 floatx80 int64_to_floatx80(int64_t a, float_status *status) 2844 { 2845 flag zSign; 2846 uint64_t absA; 2847 int8_t shiftCount; 2848 2849 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 2850 zSign = ( a < 0 ); 2851 absA = zSign ? - a : a; 2852 shiftCount = countLeadingZeros64( absA ); 2853 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 2854 2855 } 2856 2857 /*---------------------------------------------------------------------------- 2858 | Returns the result of converting the 64-bit two's complement integer `a' to 2859 | the quadruple-precision floating-point format. The conversion is performed 2860 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2861 *----------------------------------------------------------------------------*/ 2862 2863 float128 int64_to_float128(int64_t a, float_status *status) 2864 { 2865 flag zSign; 2866 uint64_t absA; 2867 int8_t shiftCount; 2868 int32_t zExp; 2869 uint64_t zSig0, zSig1; 2870 2871 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 2872 zSign = ( a < 0 ); 2873 absA = zSign ? - a : a; 2874 shiftCount = countLeadingZeros64( absA ) + 49; 2875 zExp = 0x406E - shiftCount; 2876 if ( 64 <= shiftCount ) { 2877 zSig1 = 0; 2878 zSig0 = absA; 2879 shiftCount -= 64; 2880 } 2881 else { 2882 zSig1 = absA; 2883 zSig0 = 0; 2884 } 2885 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 2886 return packFloat128( zSign, zExp, zSig0, zSig1 ); 2887 2888 } 2889 2890 /*---------------------------------------------------------------------------- 2891 | Returns the result of converting the 64-bit unsigned integer `a' 2892 | to the quadruple-precision floating-point format. The conversion is performed 2893 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2894 *----------------------------------------------------------------------------*/ 2895 2896 float128 uint64_to_float128(uint64_t a, float_status *status) 2897 { 2898 if (a == 0) { 2899 return float128_zero; 2900 } 2901 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status); 2902 } 2903 2904 2905 2906 2907 /*---------------------------------------------------------------------------- 2908 | Returns the result of converting the single-precision floating-point value 2909 | `a' to the double-precision floating-point format. The conversion is 2910 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2911 | Arithmetic. 2912 *----------------------------------------------------------------------------*/ 2913 2914 float64 float32_to_float64(float32 a, float_status *status) 2915 { 2916 flag aSign; 2917 int aExp; 2918 uint32_t aSig; 2919 a = float32_squash_input_denormal(a, status); 2920 2921 aSig = extractFloat32Frac( a ); 2922 aExp = extractFloat32Exp( a ); 2923 aSign = extractFloat32Sign( a ); 2924 if ( aExp == 0xFF ) { 2925 if (aSig) { 2926 return commonNaNToFloat64(float32ToCommonNaN(a, status), status); 2927 } 2928 return packFloat64( aSign, 0x7FF, 0 ); 2929 } 2930 if ( aExp == 0 ) { 2931 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 2932 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2933 --aExp; 2934 } 2935 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 ); 2936 2937 } 2938 2939 /*---------------------------------------------------------------------------- 2940 | Returns the result of converting the single-precision floating-point value 2941 | `a' to the extended double-precision floating-point format. The conversion 2942 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 2943 | Arithmetic. 2944 *----------------------------------------------------------------------------*/ 2945 2946 floatx80 float32_to_floatx80(float32 a, float_status *status) 2947 { 2948 flag aSign; 2949 int aExp; 2950 uint32_t aSig; 2951 2952 a = float32_squash_input_denormal(a, status); 2953 aSig = extractFloat32Frac( a ); 2954 aExp = extractFloat32Exp( a ); 2955 aSign = extractFloat32Sign( a ); 2956 if ( aExp == 0xFF ) { 2957 if (aSig) { 2958 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 2959 } 2960 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2961 } 2962 if ( aExp == 0 ) { 2963 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 2964 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2965 } 2966 aSig |= 0x00800000; 2967 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 2968 2969 } 2970 2971 /*---------------------------------------------------------------------------- 2972 | Returns the result of converting the single-precision floating-point value 2973 | `a' to the double-precision floating-point format. The conversion is 2974 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2975 | Arithmetic. 2976 *----------------------------------------------------------------------------*/ 2977 2978 float128 float32_to_float128(float32 a, float_status *status) 2979 { 2980 flag aSign; 2981 int aExp; 2982 uint32_t aSig; 2983 2984 a = float32_squash_input_denormal(a, status); 2985 aSig = extractFloat32Frac( a ); 2986 aExp = extractFloat32Exp( a ); 2987 aSign = extractFloat32Sign( a ); 2988 if ( aExp == 0xFF ) { 2989 if (aSig) { 2990 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 2991 } 2992 return packFloat128( aSign, 0x7FFF, 0, 0 ); 2993 } 2994 if ( aExp == 0 ) { 2995 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 2996 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2997 --aExp; 2998 } 2999 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 3000 3001 } 3002 3003 /*---------------------------------------------------------------------------- 3004 | Returns the remainder of the single-precision floating-point value `a' 3005 | with respect to the corresponding value `b'. The operation is performed 3006 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3007 *----------------------------------------------------------------------------*/ 3008 3009 float32 float32_rem(float32 a, float32 b, float_status *status) 3010 { 3011 flag aSign, zSign; 3012 int aExp, bExp, expDiff; 3013 uint32_t aSig, bSig; 3014 uint32_t q; 3015 uint64_t aSig64, bSig64, q64; 3016 uint32_t alternateASig; 3017 int32_t sigMean; 3018 a = float32_squash_input_denormal(a, status); 3019 b = float32_squash_input_denormal(b, status); 3020 3021 aSig = extractFloat32Frac( a ); 3022 aExp = extractFloat32Exp( a ); 3023 aSign = extractFloat32Sign( a ); 3024 bSig = extractFloat32Frac( b ); 3025 bExp = extractFloat32Exp( b ); 3026 if ( aExp == 0xFF ) { 3027 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 3028 return propagateFloat32NaN(a, b, status); 3029 } 3030 float_raise(float_flag_invalid, status); 3031 return float32_default_nan(status); 3032 } 3033 if ( bExp == 0xFF ) { 3034 if (bSig) { 3035 return propagateFloat32NaN(a, b, status); 3036 } 3037 return a; 3038 } 3039 if ( bExp == 0 ) { 3040 if ( bSig == 0 ) { 3041 float_raise(float_flag_invalid, status); 3042 return float32_default_nan(status); 3043 } 3044 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 3045 } 3046 if ( aExp == 0 ) { 3047 if ( aSig == 0 ) return a; 3048 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3049 } 3050 expDiff = aExp - bExp; 3051 aSig |= 0x00800000; 3052 bSig |= 0x00800000; 3053 if ( expDiff < 32 ) { 3054 aSig <<= 8; 3055 bSig <<= 8; 3056 if ( expDiff < 0 ) { 3057 if ( expDiff < -1 ) return a; 3058 aSig >>= 1; 3059 } 3060 q = ( bSig <= aSig ); 3061 if ( q ) aSig -= bSig; 3062 if ( 0 < expDiff ) { 3063 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 3064 q >>= 32 - expDiff; 3065 bSig >>= 2; 3066 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3067 } 3068 else { 3069 aSig >>= 2; 3070 bSig >>= 2; 3071 } 3072 } 3073 else { 3074 if ( bSig <= aSig ) aSig -= bSig; 3075 aSig64 = ( (uint64_t) aSig )<<40; 3076 bSig64 = ( (uint64_t) bSig )<<40; 3077 expDiff -= 64; 3078 while ( 0 < expDiff ) { 3079 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 3080 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 3081 aSig64 = - ( ( bSig * q64 )<<38 ); 3082 expDiff -= 62; 3083 } 3084 expDiff += 64; 3085 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 3086 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 3087 q = q64>>( 64 - expDiff ); 3088 bSig <<= 6; 3089 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 3090 } 3091 do { 3092 alternateASig = aSig; 3093 ++q; 3094 aSig -= bSig; 3095 } while ( 0 <= (int32_t) aSig ); 3096 sigMean = aSig + alternateASig; 3097 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3098 aSig = alternateASig; 3099 } 3100 zSign = ( (int32_t) aSig < 0 ); 3101 if ( zSign ) aSig = - aSig; 3102 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 3103 } 3104 3105 3106 /*---------------------------------------------------------------------------- 3107 | Returns the square root of the single-precision floating-point value `a'. 3108 | The operation is performed according to the IEC/IEEE Standard for Binary 3109 | Floating-Point Arithmetic. 3110 *----------------------------------------------------------------------------*/ 3111 3112 float32 float32_sqrt(float32 a, float_status *status) 3113 { 3114 flag aSign; 3115 int aExp, zExp; 3116 uint32_t aSig, zSig; 3117 uint64_t rem, term; 3118 a = float32_squash_input_denormal(a, status); 3119 3120 aSig = extractFloat32Frac( a ); 3121 aExp = extractFloat32Exp( a ); 3122 aSign = extractFloat32Sign( a ); 3123 if ( aExp == 0xFF ) { 3124 if (aSig) { 3125 return propagateFloat32NaN(a, float32_zero, status); 3126 } 3127 if ( ! aSign ) return a; 3128 float_raise(float_flag_invalid, status); 3129 return float32_default_nan(status); 3130 } 3131 if ( aSign ) { 3132 if ( ( aExp | aSig ) == 0 ) return a; 3133 float_raise(float_flag_invalid, status); 3134 return float32_default_nan(status); 3135 } 3136 if ( aExp == 0 ) { 3137 if ( aSig == 0 ) return float32_zero; 3138 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3139 } 3140 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 3141 aSig = ( aSig | 0x00800000 )<<8; 3142 zSig = estimateSqrt32( aExp, aSig ) + 2; 3143 if ( ( zSig & 0x7F ) <= 5 ) { 3144 if ( zSig < 2 ) { 3145 zSig = 0x7FFFFFFF; 3146 goto roundAndPack; 3147 } 3148 aSig >>= aExp & 1; 3149 term = ( (uint64_t) zSig ) * zSig; 3150 rem = ( ( (uint64_t) aSig )<<32 ) - term; 3151 while ( (int64_t) rem < 0 ) { 3152 --zSig; 3153 rem += ( ( (uint64_t) zSig )<<1 ) | 1; 3154 } 3155 zSig |= ( rem != 0 ); 3156 } 3157 shift32RightJamming( zSig, 1, &zSig ); 3158 roundAndPack: 3159 return roundAndPackFloat32(0, zExp, zSig, status); 3160 3161 } 3162 3163 /*---------------------------------------------------------------------------- 3164 | Returns the binary exponential of the single-precision floating-point value 3165 | `a'. The operation is performed according to the IEC/IEEE Standard for 3166 | Binary Floating-Point Arithmetic. 3167 | 3168 | Uses the following identities: 3169 | 3170 | 1. ------------------------------------------------------------------------- 3171 | x x*ln(2) 3172 | 2 = e 3173 | 3174 | 2. ------------------------------------------------------------------------- 3175 | 2 3 4 5 n 3176 | x x x x x x x 3177 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 3178 | 1! 2! 3! 4! 5! n! 3179 *----------------------------------------------------------------------------*/ 3180 3181 static const float64 float32_exp2_coefficients[15] = 3182 { 3183 const_float64( 0x3ff0000000000000ll ), /* 1 */ 3184 const_float64( 0x3fe0000000000000ll ), /* 2 */ 3185 const_float64( 0x3fc5555555555555ll ), /* 3 */ 3186 const_float64( 0x3fa5555555555555ll ), /* 4 */ 3187 const_float64( 0x3f81111111111111ll ), /* 5 */ 3188 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 3189 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 3190 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 3191 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 3192 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 3193 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 3194 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 3195 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 3196 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 3197 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 3198 }; 3199 3200 float32 float32_exp2(float32 a, float_status *status) 3201 { 3202 flag aSign; 3203 int aExp; 3204 uint32_t aSig; 3205 float64 r, x, xn; 3206 int i; 3207 a = float32_squash_input_denormal(a, status); 3208 3209 aSig = extractFloat32Frac( a ); 3210 aExp = extractFloat32Exp( a ); 3211 aSign = extractFloat32Sign( a ); 3212 3213 if ( aExp == 0xFF) { 3214 if (aSig) { 3215 return propagateFloat32NaN(a, float32_zero, status); 3216 } 3217 return (aSign) ? float32_zero : a; 3218 } 3219 if (aExp == 0) { 3220 if (aSig == 0) return float32_one; 3221 } 3222 3223 float_raise(float_flag_inexact, status); 3224 3225 /* ******************************* */ 3226 /* using float64 for approximation */ 3227 /* ******************************* */ 3228 x = float32_to_float64(a, status); 3229 x = float64_mul(x, float64_ln2, status); 3230 3231 xn = x; 3232 r = float64_one; 3233 for (i = 0 ; i < 15 ; i++) { 3234 float64 f; 3235 3236 f = float64_mul(xn, float32_exp2_coefficients[i], status); 3237 r = float64_add(r, f, status); 3238 3239 xn = float64_mul(xn, x, status); 3240 } 3241 3242 return float64_to_float32(r, status); 3243 } 3244 3245 /*---------------------------------------------------------------------------- 3246 | Returns the binary log of the single-precision floating-point value `a'. 3247 | The operation is performed according to the IEC/IEEE Standard for Binary 3248 | Floating-Point Arithmetic. 3249 *----------------------------------------------------------------------------*/ 3250 float32 float32_log2(float32 a, float_status *status) 3251 { 3252 flag aSign, zSign; 3253 int aExp; 3254 uint32_t aSig, zSig, i; 3255 3256 a = float32_squash_input_denormal(a, status); 3257 aSig = extractFloat32Frac( a ); 3258 aExp = extractFloat32Exp( a ); 3259 aSign = extractFloat32Sign( a ); 3260 3261 if ( aExp == 0 ) { 3262 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 3263 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3264 } 3265 if ( aSign ) { 3266 float_raise(float_flag_invalid, status); 3267 return float32_default_nan(status); 3268 } 3269 if ( aExp == 0xFF ) { 3270 if (aSig) { 3271 return propagateFloat32NaN(a, float32_zero, status); 3272 } 3273 return a; 3274 } 3275 3276 aExp -= 0x7F; 3277 aSig |= 0x00800000; 3278 zSign = aExp < 0; 3279 zSig = aExp << 23; 3280 3281 for (i = 1 << 22; i > 0; i >>= 1) { 3282 aSig = ( (uint64_t)aSig * aSig ) >> 23; 3283 if ( aSig & 0x01000000 ) { 3284 aSig >>= 1; 3285 zSig |= i; 3286 } 3287 } 3288 3289 if ( zSign ) 3290 zSig = -zSig; 3291 3292 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 3293 } 3294 3295 /*---------------------------------------------------------------------------- 3296 | Returns 1 if the single-precision floating-point value `a' is equal to 3297 | the corresponding value `b', and 0 otherwise. The invalid exception is 3298 | raised if either operand is a NaN. Otherwise, the comparison is performed 3299 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3300 *----------------------------------------------------------------------------*/ 3301 3302 int float32_eq(float32 a, float32 b, float_status *status) 3303 { 3304 uint32_t av, bv; 3305 a = float32_squash_input_denormal(a, status); 3306 b = float32_squash_input_denormal(b, status); 3307 3308 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3309 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3310 ) { 3311 float_raise(float_flag_invalid, status); 3312 return 0; 3313 } 3314 av = float32_val(a); 3315 bv = float32_val(b); 3316 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3317 } 3318 3319 /*---------------------------------------------------------------------------- 3320 | Returns 1 if the single-precision floating-point value `a' is less than 3321 | or equal to the corresponding value `b', and 0 otherwise. The invalid 3322 | exception is raised if either operand is a NaN. The comparison is performed 3323 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3324 *----------------------------------------------------------------------------*/ 3325 3326 int float32_le(float32 a, float32 b, float_status *status) 3327 { 3328 flag aSign, bSign; 3329 uint32_t av, bv; 3330 a = float32_squash_input_denormal(a, status); 3331 b = float32_squash_input_denormal(b, status); 3332 3333 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3334 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3335 ) { 3336 float_raise(float_flag_invalid, status); 3337 return 0; 3338 } 3339 aSign = extractFloat32Sign( a ); 3340 bSign = extractFloat32Sign( b ); 3341 av = float32_val(a); 3342 bv = float32_val(b); 3343 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3344 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3345 3346 } 3347 3348 /*---------------------------------------------------------------------------- 3349 | Returns 1 if the single-precision floating-point value `a' is less than 3350 | the corresponding value `b', and 0 otherwise. The invalid exception is 3351 | raised if either operand is a NaN. The comparison is performed according 3352 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3353 *----------------------------------------------------------------------------*/ 3354 3355 int float32_lt(float32 a, float32 b, float_status *status) 3356 { 3357 flag aSign, bSign; 3358 uint32_t av, bv; 3359 a = float32_squash_input_denormal(a, status); 3360 b = float32_squash_input_denormal(b, status); 3361 3362 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3363 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3364 ) { 3365 float_raise(float_flag_invalid, status); 3366 return 0; 3367 } 3368 aSign = extractFloat32Sign( a ); 3369 bSign = extractFloat32Sign( b ); 3370 av = float32_val(a); 3371 bv = float32_val(b); 3372 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3373 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3374 3375 } 3376 3377 /*---------------------------------------------------------------------------- 3378 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3379 | be compared, and 0 otherwise. The invalid exception is raised if either 3380 | operand is a NaN. The comparison is performed according to the IEC/IEEE 3381 | Standard for Binary Floating-Point Arithmetic. 3382 *----------------------------------------------------------------------------*/ 3383 3384 int float32_unordered(float32 a, float32 b, float_status *status) 3385 { 3386 a = float32_squash_input_denormal(a, status); 3387 b = float32_squash_input_denormal(b, status); 3388 3389 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3390 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3391 ) { 3392 float_raise(float_flag_invalid, status); 3393 return 1; 3394 } 3395 return 0; 3396 } 3397 3398 /*---------------------------------------------------------------------------- 3399 | Returns 1 if the single-precision floating-point value `a' is equal to 3400 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3401 | exception. The comparison is performed according to the IEC/IEEE Standard 3402 | for Binary Floating-Point Arithmetic. 3403 *----------------------------------------------------------------------------*/ 3404 3405 int float32_eq_quiet(float32 a, float32 b, float_status *status) 3406 { 3407 a = float32_squash_input_denormal(a, status); 3408 b = float32_squash_input_denormal(b, status); 3409 3410 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3411 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3412 ) { 3413 if (float32_is_signaling_nan(a, status) 3414 || float32_is_signaling_nan(b, status)) { 3415 float_raise(float_flag_invalid, status); 3416 } 3417 return 0; 3418 } 3419 return ( float32_val(a) == float32_val(b) ) || 3420 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 3421 } 3422 3423 /*---------------------------------------------------------------------------- 3424 | Returns 1 if the single-precision floating-point value `a' is less than or 3425 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3426 | cause an exception. Otherwise, the comparison is performed according to the 3427 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3428 *----------------------------------------------------------------------------*/ 3429 3430 int float32_le_quiet(float32 a, float32 b, float_status *status) 3431 { 3432 flag aSign, bSign; 3433 uint32_t av, bv; 3434 a = float32_squash_input_denormal(a, status); 3435 b = float32_squash_input_denormal(b, status); 3436 3437 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3438 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3439 ) { 3440 if (float32_is_signaling_nan(a, status) 3441 || float32_is_signaling_nan(b, status)) { 3442 float_raise(float_flag_invalid, status); 3443 } 3444 return 0; 3445 } 3446 aSign = extractFloat32Sign( a ); 3447 bSign = extractFloat32Sign( b ); 3448 av = float32_val(a); 3449 bv = float32_val(b); 3450 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3451 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3452 3453 } 3454 3455 /*---------------------------------------------------------------------------- 3456 | Returns 1 if the single-precision floating-point value `a' is less than 3457 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3458 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 3459 | Standard for Binary Floating-Point Arithmetic. 3460 *----------------------------------------------------------------------------*/ 3461 3462 int float32_lt_quiet(float32 a, float32 b, float_status *status) 3463 { 3464 flag aSign, bSign; 3465 uint32_t av, bv; 3466 a = float32_squash_input_denormal(a, status); 3467 b = float32_squash_input_denormal(b, status); 3468 3469 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3470 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3471 ) { 3472 if (float32_is_signaling_nan(a, status) 3473 || float32_is_signaling_nan(b, status)) { 3474 float_raise(float_flag_invalid, status); 3475 } 3476 return 0; 3477 } 3478 aSign = extractFloat32Sign( a ); 3479 bSign = extractFloat32Sign( b ); 3480 av = float32_val(a); 3481 bv = float32_val(b); 3482 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3483 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3484 3485 } 3486 3487 /*---------------------------------------------------------------------------- 3488 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3489 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 3490 | comparison is performed according to the IEC/IEEE Standard for Binary 3491 | Floating-Point Arithmetic. 3492 *----------------------------------------------------------------------------*/ 3493 3494 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 3495 { 3496 a = float32_squash_input_denormal(a, status); 3497 b = float32_squash_input_denormal(b, status); 3498 3499 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3500 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3501 ) { 3502 if (float32_is_signaling_nan(a, status) 3503 || float32_is_signaling_nan(b, status)) { 3504 float_raise(float_flag_invalid, status); 3505 } 3506 return 1; 3507 } 3508 return 0; 3509 } 3510 3511 3512 /*---------------------------------------------------------------------------- 3513 | Returns the result of converting the double-precision floating-point value 3514 | `a' to the single-precision floating-point format. The conversion is 3515 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3516 | Arithmetic. 3517 *----------------------------------------------------------------------------*/ 3518 3519 float32 float64_to_float32(float64 a, float_status *status) 3520 { 3521 flag aSign; 3522 int aExp; 3523 uint64_t aSig; 3524 uint32_t zSig; 3525 a = float64_squash_input_denormal(a, status); 3526 3527 aSig = extractFloat64Frac( a ); 3528 aExp = extractFloat64Exp( a ); 3529 aSign = extractFloat64Sign( a ); 3530 if ( aExp == 0x7FF ) { 3531 if (aSig) { 3532 return commonNaNToFloat32(float64ToCommonNaN(a, status), status); 3533 } 3534 return packFloat32( aSign, 0xFF, 0 ); 3535 } 3536 shift64RightJamming( aSig, 22, &aSig ); 3537 zSig = aSig; 3538 if ( aExp || zSig ) { 3539 zSig |= 0x40000000; 3540 aExp -= 0x381; 3541 } 3542 return roundAndPackFloat32(aSign, aExp, zSig, status); 3543 3544 } 3545 3546 3547 /*---------------------------------------------------------------------------- 3548 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3549 | half-precision floating-point value, returning the result. After being 3550 | shifted into the proper positions, the three fields are simply added 3551 | together to form the result. This means that any integer portion of `zSig' 3552 | will be added into the exponent. Since a properly normalized significand 3553 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3554 | than the desired result exponent whenever `zSig' is a complete, normalized 3555 | significand. 3556 *----------------------------------------------------------------------------*/ 3557 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig) 3558 { 3559 return make_float16( 3560 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig); 3561 } 3562 3563 /*---------------------------------------------------------------------------- 3564 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3565 | and significand `zSig', and returns the proper half-precision floating- 3566 | point value corresponding to the abstract input. Ordinarily, the abstract 3567 | value is simply rounded and packed into the half-precision format, with 3568 | the inexact exception raised if the abstract input cannot be represented 3569 | exactly. However, if the abstract value is too large, the overflow and 3570 | inexact exceptions are raised and an infinity or maximal finite value is 3571 | returned. If the abstract value is too small, the input value is rounded to 3572 | a subnormal number, and the underflow and inexact exceptions are raised if 3573 | the abstract input cannot be represented exactly as a subnormal half- 3574 | precision floating-point number. 3575 | The `ieee' flag indicates whether to use IEEE standard half precision, or 3576 | ARM-style "alternative representation", which omits the NaN and Inf 3577 | encodings in order to raise the maximum representable exponent by one. 3578 | The input significand `zSig' has its binary point between bits 22 3579 | and 23, which is 13 bits to the left of the usual location. This shifted 3580 | significand must be normalized or smaller. If `zSig' is not normalized, 3581 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3582 | and it must not require rounding. In the usual case that `zSig' is 3583 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3584 | Note the slightly odd position of the binary point in zSig compared with the 3585 | other roundAndPackFloat functions. This should probably be fixed if we 3586 | need to implement more float16 routines than just conversion. 3587 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3588 | Binary Floating-Point Arithmetic. 3589 *----------------------------------------------------------------------------*/ 3590 3591 static float16 roundAndPackFloat16(flag zSign, int zExp, 3592 uint32_t zSig, flag ieee, 3593 float_status *status) 3594 { 3595 int maxexp = ieee ? 29 : 30; 3596 uint32_t mask; 3597 uint32_t increment; 3598 bool rounding_bumps_exp; 3599 bool is_tiny = false; 3600 3601 /* Calculate the mask of bits of the mantissa which are not 3602 * representable in half-precision and will be lost. 3603 */ 3604 if (zExp < 1) { 3605 /* Will be denormal in halfprec */ 3606 mask = 0x00ffffff; 3607 if (zExp >= -11) { 3608 mask >>= 11 + zExp; 3609 } 3610 } else { 3611 /* Normal number in halfprec */ 3612 mask = 0x00001fff; 3613 } 3614 3615 switch (status->float_rounding_mode) { 3616 case float_round_nearest_even: 3617 increment = (mask + 1) >> 1; 3618 if ((zSig & mask) == increment) { 3619 increment = zSig & (increment << 1); 3620 } 3621 break; 3622 case float_round_ties_away: 3623 increment = (mask + 1) >> 1; 3624 break; 3625 case float_round_up: 3626 increment = zSign ? 0 : mask; 3627 break; 3628 case float_round_down: 3629 increment = zSign ? mask : 0; 3630 break; 3631 default: /* round_to_zero */ 3632 increment = 0; 3633 break; 3634 } 3635 3636 rounding_bumps_exp = (zSig + increment >= 0x01000000); 3637 3638 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) { 3639 if (ieee) { 3640 float_raise(float_flag_overflow | float_flag_inexact, status); 3641 return packFloat16(zSign, 0x1f, 0); 3642 } else { 3643 float_raise(float_flag_invalid, status); 3644 return packFloat16(zSign, 0x1f, 0x3ff); 3645 } 3646 } 3647 3648 if (zExp < 0) { 3649 /* Note that flush-to-zero does not affect half-precision results */ 3650 is_tiny = 3651 (status->float_detect_tininess == float_tininess_before_rounding) 3652 || (zExp < -1) 3653 || (!rounding_bumps_exp); 3654 } 3655 if (zSig & mask) { 3656 float_raise(float_flag_inexact, status); 3657 if (is_tiny) { 3658 float_raise(float_flag_underflow, status); 3659 } 3660 } 3661 3662 zSig += increment; 3663 if (rounding_bumps_exp) { 3664 zSig >>= 1; 3665 zExp++; 3666 } 3667 3668 if (zExp < -10) { 3669 return packFloat16(zSign, 0, 0); 3670 } 3671 if (zExp < 0) { 3672 zSig >>= -zExp; 3673 zExp = 0; 3674 } 3675 return packFloat16(zSign, zExp, zSig >> 13); 3676 } 3677 3678 /*---------------------------------------------------------------------------- 3679 | If `a' is denormal and we are in flush-to-zero mode then set the 3680 | input-denormal exception and return zero. Otherwise just return the value. 3681 *----------------------------------------------------------------------------*/ 3682 float16 float16_squash_input_denormal(float16 a, float_status *status) 3683 { 3684 if (status->flush_inputs_to_zero) { 3685 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) { 3686 float_raise(float_flag_input_denormal, status); 3687 return make_float16(float16_val(a) & 0x8000); 3688 } 3689 } 3690 return a; 3691 } 3692 3693 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr, 3694 uint32_t *zSigPtr) 3695 { 3696 int8_t shiftCount = countLeadingZeros32(aSig) - 21; 3697 *zSigPtr = aSig << shiftCount; 3698 *zExpPtr = 1 - shiftCount; 3699 } 3700 3701 /* Half precision floats come in two formats: standard IEEE and "ARM" format. 3702 The latter gains extra exponent range by omitting the NaN/Inf encodings. */ 3703 3704 float32 float16_to_float32(float16 a, flag ieee, float_status *status) 3705 { 3706 flag aSign; 3707 int aExp; 3708 uint32_t aSig; 3709 3710 aSign = extractFloat16Sign(a); 3711 aExp = extractFloat16Exp(a); 3712 aSig = extractFloat16Frac(a); 3713 3714 if (aExp == 0x1f && ieee) { 3715 if (aSig) { 3716 return commonNaNToFloat32(float16ToCommonNaN(a, status), status); 3717 } 3718 return packFloat32(aSign, 0xff, 0); 3719 } 3720 if (aExp == 0) { 3721 if (aSig == 0) { 3722 return packFloat32(aSign, 0, 0); 3723 } 3724 3725 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3726 aExp--; 3727 } 3728 return packFloat32( aSign, aExp + 0x70, aSig << 13); 3729 } 3730 3731 float16 float32_to_float16(float32 a, flag ieee, float_status *status) 3732 { 3733 flag aSign; 3734 int aExp; 3735 uint32_t aSig; 3736 3737 a = float32_squash_input_denormal(a, status); 3738 3739 aSig = extractFloat32Frac( a ); 3740 aExp = extractFloat32Exp( a ); 3741 aSign = extractFloat32Sign( a ); 3742 if ( aExp == 0xFF ) { 3743 if (aSig) { 3744 /* Input is a NaN */ 3745 if (!ieee) { 3746 float_raise(float_flag_invalid, status); 3747 return packFloat16(aSign, 0, 0); 3748 } 3749 return commonNaNToFloat16( 3750 float32ToCommonNaN(a, status), status); 3751 } 3752 /* Infinity */ 3753 if (!ieee) { 3754 float_raise(float_flag_invalid, status); 3755 return packFloat16(aSign, 0x1f, 0x3ff); 3756 } 3757 return packFloat16(aSign, 0x1f, 0); 3758 } 3759 if (aExp == 0 && aSig == 0) { 3760 return packFloat16(aSign, 0, 0); 3761 } 3762 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 3763 * even if the input is denormal; however this is harmless because 3764 * the largest possible single-precision denormal is still smaller 3765 * than the smallest representable half-precision denormal, and so we 3766 * will end up ignoring aSig and returning via the "always return zero" 3767 * codepath. 3768 */ 3769 aSig |= 0x00800000; 3770 aExp -= 0x71; 3771 3772 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status); 3773 } 3774 3775 float64 float16_to_float64(float16 a, flag ieee, float_status *status) 3776 { 3777 flag aSign; 3778 int aExp; 3779 uint32_t aSig; 3780 3781 aSign = extractFloat16Sign(a); 3782 aExp = extractFloat16Exp(a); 3783 aSig = extractFloat16Frac(a); 3784 3785 if (aExp == 0x1f && ieee) { 3786 if (aSig) { 3787 return commonNaNToFloat64( 3788 float16ToCommonNaN(a, status), status); 3789 } 3790 return packFloat64(aSign, 0x7ff, 0); 3791 } 3792 if (aExp == 0) { 3793 if (aSig == 0) { 3794 return packFloat64(aSign, 0, 0); 3795 } 3796 3797 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3798 aExp--; 3799 } 3800 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42); 3801 } 3802 3803 float16 float64_to_float16(float64 a, flag ieee, float_status *status) 3804 { 3805 flag aSign; 3806 int aExp; 3807 uint64_t aSig; 3808 uint32_t zSig; 3809 3810 a = float64_squash_input_denormal(a, status); 3811 3812 aSig = extractFloat64Frac(a); 3813 aExp = extractFloat64Exp(a); 3814 aSign = extractFloat64Sign(a); 3815 if (aExp == 0x7FF) { 3816 if (aSig) { 3817 /* Input is a NaN */ 3818 if (!ieee) { 3819 float_raise(float_flag_invalid, status); 3820 return packFloat16(aSign, 0, 0); 3821 } 3822 return commonNaNToFloat16( 3823 float64ToCommonNaN(a, status), status); 3824 } 3825 /* Infinity */ 3826 if (!ieee) { 3827 float_raise(float_flag_invalid, status); 3828 return packFloat16(aSign, 0x1f, 0x3ff); 3829 } 3830 return packFloat16(aSign, 0x1f, 0); 3831 } 3832 shift64RightJamming(aSig, 29, &aSig); 3833 zSig = aSig; 3834 if (aExp == 0 && zSig == 0) { 3835 return packFloat16(aSign, 0, 0); 3836 } 3837 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 3838 * even if the input is denormal; however this is harmless because 3839 * the largest possible single-precision denormal is still smaller 3840 * than the smallest representable half-precision denormal, and so we 3841 * will end up ignoring aSig and returning via the "always return zero" 3842 * codepath. 3843 */ 3844 zSig |= 0x00800000; 3845 aExp -= 0x3F1; 3846 3847 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status); 3848 } 3849 3850 /*---------------------------------------------------------------------------- 3851 | Returns the result of converting the double-precision floating-point value 3852 | `a' to the extended double-precision floating-point format. The conversion 3853 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3854 | Arithmetic. 3855 *----------------------------------------------------------------------------*/ 3856 3857 floatx80 float64_to_floatx80(float64 a, float_status *status) 3858 { 3859 flag aSign; 3860 int aExp; 3861 uint64_t aSig; 3862 3863 a = float64_squash_input_denormal(a, status); 3864 aSig = extractFloat64Frac( a ); 3865 aExp = extractFloat64Exp( a ); 3866 aSign = extractFloat64Sign( a ); 3867 if ( aExp == 0x7FF ) { 3868 if (aSig) { 3869 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 3870 } 3871 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3872 } 3873 if ( aExp == 0 ) { 3874 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 3875 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3876 } 3877 return 3878 packFloatx80( 3879 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 3880 3881 } 3882 3883 /*---------------------------------------------------------------------------- 3884 | Returns the result of converting the double-precision floating-point value 3885 | `a' to the quadruple-precision floating-point format. The conversion is 3886 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3887 | Arithmetic. 3888 *----------------------------------------------------------------------------*/ 3889 3890 float128 float64_to_float128(float64 a, float_status *status) 3891 { 3892 flag aSign; 3893 int aExp; 3894 uint64_t aSig, zSig0, zSig1; 3895 3896 a = float64_squash_input_denormal(a, status); 3897 aSig = extractFloat64Frac( a ); 3898 aExp = extractFloat64Exp( a ); 3899 aSign = extractFloat64Sign( a ); 3900 if ( aExp == 0x7FF ) { 3901 if (aSig) { 3902 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 3903 } 3904 return packFloat128( aSign, 0x7FFF, 0, 0 ); 3905 } 3906 if ( aExp == 0 ) { 3907 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 3908 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3909 --aExp; 3910 } 3911 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 3912 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 3913 3914 } 3915 3916 3917 /*---------------------------------------------------------------------------- 3918 | Returns the remainder of the double-precision floating-point value `a' 3919 | with respect to the corresponding value `b'. The operation is performed 3920 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3921 *----------------------------------------------------------------------------*/ 3922 3923 float64 float64_rem(float64 a, float64 b, float_status *status) 3924 { 3925 flag aSign, zSign; 3926 int aExp, bExp, expDiff; 3927 uint64_t aSig, bSig; 3928 uint64_t q, alternateASig; 3929 int64_t sigMean; 3930 3931 a = float64_squash_input_denormal(a, status); 3932 b = float64_squash_input_denormal(b, status); 3933 aSig = extractFloat64Frac( a ); 3934 aExp = extractFloat64Exp( a ); 3935 aSign = extractFloat64Sign( a ); 3936 bSig = extractFloat64Frac( b ); 3937 bExp = extractFloat64Exp( b ); 3938 if ( aExp == 0x7FF ) { 3939 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 3940 return propagateFloat64NaN(a, b, status); 3941 } 3942 float_raise(float_flag_invalid, status); 3943 return float64_default_nan(status); 3944 } 3945 if ( bExp == 0x7FF ) { 3946 if (bSig) { 3947 return propagateFloat64NaN(a, b, status); 3948 } 3949 return a; 3950 } 3951 if ( bExp == 0 ) { 3952 if ( bSig == 0 ) { 3953 float_raise(float_flag_invalid, status); 3954 return float64_default_nan(status); 3955 } 3956 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3957 } 3958 if ( aExp == 0 ) { 3959 if ( aSig == 0 ) return a; 3960 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3961 } 3962 expDiff = aExp - bExp; 3963 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 3964 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3965 if ( expDiff < 0 ) { 3966 if ( expDiff < -1 ) return a; 3967 aSig >>= 1; 3968 } 3969 q = ( bSig <= aSig ); 3970 if ( q ) aSig -= bSig; 3971 expDiff -= 64; 3972 while ( 0 < expDiff ) { 3973 q = estimateDiv128To64( aSig, 0, bSig ); 3974 q = ( 2 < q ) ? q - 2 : 0; 3975 aSig = - ( ( bSig>>2 ) * q ); 3976 expDiff -= 62; 3977 } 3978 expDiff += 64; 3979 if ( 0 < expDiff ) { 3980 q = estimateDiv128To64( aSig, 0, bSig ); 3981 q = ( 2 < q ) ? q - 2 : 0; 3982 q >>= 64 - expDiff; 3983 bSig >>= 2; 3984 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3985 } 3986 else { 3987 aSig >>= 2; 3988 bSig >>= 2; 3989 } 3990 do { 3991 alternateASig = aSig; 3992 ++q; 3993 aSig -= bSig; 3994 } while ( 0 <= (int64_t) aSig ); 3995 sigMean = aSig + alternateASig; 3996 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3997 aSig = alternateASig; 3998 } 3999 zSign = ( (int64_t) aSig < 0 ); 4000 if ( zSign ) aSig = - aSig; 4001 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 4002 4003 } 4004 4005 4006 /*---------------------------------------------------------------------------- 4007 | Returns the square root of the double-precision floating-point value `a'. 4008 | The operation is performed according to the IEC/IEEE Standard for Binary 4009 | Floating-Point Arithmetic. 4010 *----------------------------------------------------------------------------*/ 4011 4012 float64 float64_sqrt(float64 a, float_status *status) 4013 { 4014 flag aSign; 4015 int aExp, zExp; 4016 uint64_t aSig, zSig, doubleZSig; 4017 uint64_t rem0, rem1, term0, term1; 4018 a = float64_squash_input_denormal(a, status); 4019 4020 aSig = extractFloat64Frac( a ); 4021 aExp = extractFloat64Exp( a ); 4022 aSign = extractFloat64Sign( a ); 4023 if ( aExp == 0x7FF ) { 4024 if (aSig) { 4025 return propagateFloat64NaN(a, a, status); 4026 } 4027 if ( ! aSign ) return a; 4028 float_raise(float_flag_invalid, status); 4029 return float64_default_nan(status); 4030 } 4031 if ( aSign ) { 4032 if ( ( aExp | aSig ) == 0 ) return a; 4033 float_raise(float_flag_invalid, status); 4034 return float64_default_nan(status); 4035 } 4036 if ( aExp == 0 ) { 4037 if ( aSig == 0 ) return float64_zero; 4038 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4039 } 4040 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 4041 aSig |= LIT64( 0x0010000000000000 ); 4042 zSig = estimateSqrt32( aExp, aSig>>21 ); 4043 aSig <<= 9 - ( aExp & 1 ); 4044 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 4045 if ( ( zSig & 0x1FF ) <= 5 ) { 4046 doubleZSig = zSig<<1; 4047 mul64To128( zSig, zSig, &term0, &term1 ); 4048 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 4049 while ( (int64_t) rem0 < 0 ) { 4050 --zSig; 4051 doubleZSig -= 2; 4052 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 4053 } 4054 zSig |= ( ( rem0 | rem1 ) != 0 ); 4055 } 4056 return roundAndPackFloat64(0, zExp, zSig, status); 4057 4058 } 4059 4060 /*---------------------------------------------------------------------------- 4061 | Returns the binary log of the double-precision floating-point value `a'. 4062 | The operation is performed according to the IEC/IEEE Standard for Binary 4063 | Floating-Point Arithmetic. 4064 *----------------------------------------------------------------------------*/ 4065 float64 float64_log2(float64 a, float_status *status) 4066 { 4067 flag aSign, zSign; 4068 int aExp; 4069 uint64_t aSig, aSig0, aSig1, zSig, i; 4070 a = float64_squash_input_denormal(a, status); 4071 4072 aSig = extractFloat64Frac( a ); 4073 aExp = extractFloat64Exp( a ); 4074 aSign = extractFloat64Sign( a ); 4075 4076 if ( aExp == 0 ) { 4077 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4078 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4079 } 4080 if ( aSign ) { 4081 float_raise(float_flag_invalid, status); 4082 return float64_default_nan(status); 4083 } 4084 if ( aExp == 0x7FF ) { 4085 if (aSig) { 4086 return propagateFloat64NaN(a, float64_zero, status); 4087 } 4088 return a; 4089 } 4090 4091 aExp -= 0x3FF; 4092 aSig |= LIT64( 0x0010000000000000 ); 4093 zSign = aExp < 0; 4094 zSig = (uint64_t)aExp << 52; 4095 for (i = 1LL << 51; i > 0; i >>= 1) { 4096 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4097 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4098 if ( aSig & LIT64( 0x0020000000000000 ) ) { 4099 aSig >>= 1; 4100 zSig |= i; 4101 } 4102 } 4103 4104 if ( zSign ) 4105 zSig = -zSig; 4106 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 4107 } 4108 4109 /*---------------------------------------------------------------------------- 4110 | Returns 1 if the double-precision floating-point value `a' is equal to the 4111 | corresponding value `b', and 0 otherwise. The invalid exception is raised 4112 | if either operand is a NaN. Otherwise, the comparison is performed 4113 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4114 *----------------------------------------------------------------------------*/ 4115 4116 int float64_eq(float64 a, float64 b, float_status *status) 4117 { 4118 uint64_t av, bv; 4119 a = float64_squash_input_denormal(a, status); 4120 b = float64_squash_input_denormal(b, status); 4121 4122 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4123 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4124 ) { 4125 float_raise(float_flag_invalid, status); 4126 return 0; 4127 } 4128 av = float64_val(a); 4129 bv = float64_val(b); 4130 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4131 4132 } 4133 4134 /*---------------------------------------------------------------------------- 4135 | Returns 1 if the double-precision floating-point value `a' is less than or 4136 | equal to the corresponding value `b', and 0 otherwise. The invalid 4137 | exception is raised if either operand is a NaN. The comparison is performed 4138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4139 *----------------------------------------------------------------------------*/ 4140 4141 int float64_le(float64 a, float64 b, float_status *status) 4142 { 4143 flag aSign, bSign; 4144 uint64_t av, bv; 4145 a = float64_squash_input_denormal(a, status); 4146 b = float64_squash_input_denormal(b, status); 4147 4148 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4149 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4150 ) { 4151 float_raise(float_flag_invalid, status); 4152 return 0; 4153 } 4154 aSign = extractFloat64Sign( a ); 4155 bSign = extractFloat64Sign( b ); 4156 av = float64_val(a); 4157 bv = float64_val(b); 4158 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4159 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4160 4161 } 4162 4163 /*---------------------------------------------------------------------------- 4164 | Returns 1 if the double-precision floating-point value `a' is less than 4165 | the corresponding value `b', and 0 otherwise. The invalid exception is 4166 | raised if either operand is a NaN. The comparison is performed according 4167 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4168 *----------------------------------------------------------------------------*/ 4169 4170 int float64_lt(float64 a, float64 b, float_status *status) 4171 { 4172 flag aSign, bSign; 4173 uint64_t av, bv; 4174 4175 a = float64_squash_input_denormal(a, status); 4176 b = float64_squash_input_denormal(b, status); 4177 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4178 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4179 ) { 4180 float_raise(float_flag_invalid, status); 4181 return 0; 4182 } 4183 aSign = extractFloat64Sign( a ); 4184 bSign = extractFloat64Sign( b ); 4185 av = float64_val(a); 4186 bv = float64_val(b); 4187 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4188 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4189 4190 } 4191 4192 /*---------------------------------------------------------------------------- 4193 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4194 | be compared, and 0 otherwise. The invalid exception is raised if either 4195 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4196 | Standard for Binary Floating-Point Arithmetic. 4197 *----------------------------------------------------------------------------*/ 4198 4199 int float64_unordered(float64 a, float64 b, float_status *status) 4200 { 4201 a = float64_squash_input_denormal(a, status); 4202 b = float64_squash_input_denormal(b, status); 4203 4204 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4205 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4206 ) { 4207 float_raise(float_flag_invalid, status); 4208 return 1; 4209 } 4210 return 0; 4211 } 4212 4213 /*---------------------------------------------------------------------------- 4214 | Returns 1 if the double-precision floating-point value `a' is equal to the 4215 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4216 | exception.The comparison is performed according to the IEC/IEEE Standard 4217 | for Binary Floating-Point Arithmetic. 4218 *----------------------------------------------------------------------------*/ 4219 4220 int float64_eq_quiet(float64 a, float64 b, float_status *status) 4221 { 4222 uint64_t av, bv; 4223 a = float64_squash_input_denormal(a, status); 4224 b = float64_squash_input_denormal(b, status); 4225 4226 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4227 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4228 ) { 4229 if (float64_is_signaling_nan(a, status) 4230 || float64_is_signaling_nan(b, status)) { 4231 float_raise(float_flag_invalid, status); 4232 } 4233 return 0; 4234 } 4235 av = float64_val(a); 4236 bv = float64_val(b); 4237 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4238 4239 } 4240 4241 /*---------------------------------------------------------------------------- 4242 | Returns 1 if the double-precision floating-point value `a' is less than or 4243 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4244 | cause an exception. Otherwise, the comparison is performed according to the 4245 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4246 *----------------------------------------------------------------------------*/ 4247 4248 int float64_le_quiet(float64 a, float64 b, float_status *status) 4249 { 4250 flag aSign, bSign; 4251 uint64_t av, bv; 4252 a = float64_squash_input_denormal(a, status); 4253 b = float64_squash_input_denormal(b, status); 4254 4255 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4256 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4257 ) { 4258 if (float64_is_signaling_nan(a, status) 4259 || float64_is_signaling_nan(b, status)) { 4260 float_raise(float_flag_invalid, status); 4261 } 4262 return 0; 4263 } 4264 aSign = extractFloat64Sign( a ); 4265 bSign = extractFloat64Sign( b ); 4266 av = float64_val(a); 4267 bv = float64_val(b); 4268 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4269 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4270 4271 } 4272 4273 /*---------------------------------------------------------------------------- 4274 | Returns 1 if the double-precision floating-point value `a' is less than 4275 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4276 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4277 | Standard for Binary Floating-Point Arithmetic. 4278 *----------------------------------------------------------------------------*/ 4279 4280 int float64_lt_quiet(float64 a, float64 b, float_status *status) 4281 { 4282 flag aSign, bSign; 4283 uint64_t av, bv; 4284 a = float64_squash_input_denormal(a, status); 4285 b = float64_squash_input_denormal(b, status); 4286 4287 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4288 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4289 ) { 4290 if (float64_is_signaling_nan(a, status) 4291 || float64_is_signaling_nan(b, status)) { 4292 float_raise(float_flag_invalid, status); 4293 } 4294 return 0; 4295 } 4296 aSign = extractFloat64Sign( a ); 4297 bSign = extractFloat64Sign( b ); 4298 av = float64_val(a); 4299 bv = float64_val(b); 4300 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4301 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4302 4303 } 4304 4305 /*---------------------------------------------------------------------------- 4306 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4307 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4308 | comparison is performed according to the IEC/IEEE Standard for Binary 4309 | Floating-Point Arithmetic. 4310 *----------------------------------------------------------------------------*/ 4311 4312 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 4313 { 4314 a = float64_squash_input_denormal(a, status); 4315 b = float64_squash_input_denormal(b, status); 4316 4317 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4318 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4319 ) { 4320 if (float64_is_signaling_nan(a, status) 4321 || float64_is_signaling_nan(b, status)) { 4322 float_raise(float_flag_invalid, status); 4323 } 4324 return 1; 4325 } 4326 return 0; 4327 } 4328 4329 /*---------------------------------------------------------------------------- 4330 | Returns the result of converting the extended double-precision floating- 4331 | point value `a' to the 32-bit two's complement integer format. The 4332 | conversion is performed according to the IEC/IEEE Standard for Binary 4333 | Floating-Point Arithmetic---which means in particular that the conversion 4334 | is rounded according to the current rounding mode. If `a' is a NaN, the 4335 | largest positive integer is returned. Otherwise, if the conversion 4336 | overflows, the largest integer with the same sign as `a' is returned. 4337 *----------------------------------------------------------------------------*/ 4338 4339 int32_t floatx80_to_int32(floatx80 a, float_status *status) 4340 { 4341 flag aSign; 4342 int32_t aExp, shiftCount; 4343 uint64_t aSig; 4344 4345 if (floatx80_invalid_encoding(a)) { 4346 float_raise(float_flag_invalid, status); 4347 return 1 << 31; 4348 } 4349 aSig = extractFloatx80Frac( a ); 4350 aExp = extractFloatx80Exp( a ); 4351 aSign = extractFloatx80Sign( a ); 4352 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4353 shiftCount = 0x4037 - aExp; 4354 if ( shiftCount <= 0 ) shiftCount = 1; 4355 shift64RightJamming( aSig, shiftCount, &aSig ); 4356 return roundAndPackInt32(aSign, aSig, status); 4357 4358 } 4359 4360 /*---------------------------------------------------------------------------- 4361 | Returns the result of converting the extended double-precision floating- 4362 | point value `a' to the 32-bit two's complement integer format. The 4363 | conversion is performed according to the IEC/IEEE Standard for Binary 4364 | Floating-Point Arithmetic, except that the conversion is always rounded 4365 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4366 | Otherwise, if the conversion overflows, the largest integer with the same 4367 | sign as `a' is returned. 4368 *----------------------------------------------------------------------------*/ 4369 4370 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 4371 { 4372 flag aSign; 4373 int32_t aExp, shiftCount; 4374 uint64_t aSig, savedASig; 4375 int32_t z; 4376 4377 if (floatx80_invalid_encoding(a)) { 4378 float_raise(float_flag_invalid, status); 4379 return 1 << 31; 4380 } 4381 aSig = extractFloatx80Frac( a ); 4382 aExp = extractFloatx80Exp( a ); 4383 aSign = extractFloatx80Sign( a ); 4384 if ( 0x401E < aExp ) { 4385 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4386 goto invalid; 4387 } 4388 else if ( aExp < 0x3FFF ) { 4389 if (aExp || aSig) { 4390 status->float_exception_flags |= float_flag_inexact; 4391 } 4392 return 0; 4393 } 4394 shiftCount = 0x403E - aExp; 4395 savedASig = aSig; 4396 aSig >>= shiftCount; 4397 z = aSig; 4398 if ( aSign ) z = - z; 4399 if ( ( z < 0 ) ^ aSign ) { 4400 invalid: 4401 float_raise(float_flag_invalid, status); 4402 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 4403 } 4404 if ( ( aSig<<shiftCount ) != savedASig ) { 4405 status->float_exception_flags |= float_flag_inexact; 4406 } 4407 return z; 4408 4409 } 4410 4411 /*---------------------------------------------------------------------------- 4412 | Returns the result of converting the extended double-precision floating- 4413 | point value `a' to the 64-bit two's complement integer format. The 4414 | conversion is performed according to the IEC/IEEE Standard for Binary 4415 | Floating-Point Arithmetic---which means in particular that the conversion 4416 | is rounded according to the current rounding mode. If `a' is a NaN, 4417 | the largest positive integer is returned. Otherwise, if the conversion 4418 | overflows, the largest integer with the same sign as `a' is returned. 4419 *----------------------------------------------------------------------------*/ 4420 4421 int64_t floatx80_to_int64(floatx80 a, float_status *status) 4422 { 4423 flag aSign; 4424 int32_t aExp, shiftCount; 4425 uint64_t aSig, aSigExtra; 4426 4427 if (floatx80_invalid_encoding(a)) { 4428 float_raise(float_flag_invalid, status); 4429 return 1ULL << 63; 4430 } 4431 aSig = extractFloatx80Frac( a ); 4432 aExp = extractFloatx80Exp( a ); 4433 aSign = extractFloatx80Sign( a ); 4434 shiftCount = 0x403E - aExp; 4435 if ( shiftCount <= 0 ) { 4436 if ( shiftCount ) { 4437 float_raise(float_flag_invalid, status); 4438 if ( ! aSign 4439 || ( ( aExp == 0x7FFF ) 4440 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 4441 ) { 4442 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4443 } 4444 return (int64_t) LIT64( 0x8000000000000000 ); 4445 } 4446 aSigExtra = 0; 4447 } 4448 else { 4449 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 4450 } 4451 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 4452 4453 } 4454 4455 /*---------------------------------------------------------------------------- 4456 | Returns the result of converting the extended double-precision floating- 4457 | point value `a' to the 64-bit two's complement integer format. The 4458 | conversion is performed according to the IEC/IEEE Standard for Binary 4459 | Floating-Point Arithmetic, except that the conversion is always rounded 4460 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4461 | Otherwise, if the conversion overflows, the largest integer with the same 4462 | sign as `a' is returned. 4463 *----------------------------------------------------------------------------*/ 4464 4465 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 4466 { 4467 flag aSign; 4468 int32_t aExp, shiftCount; 4469 uint64_t aSig; 4470 int64_t z; 4471 4472 if (floatx80_invalid_encoding(a)) { 4473 float_raise(float_flag_invalid, status); 4474 return 1ULL << 63; 4475 } 4476 aSig = extractFloatx80Frac( a ); 4477 aExp = extractFloatx80Exp( a ); 4478 aSign = extractFloatx80Sign( a ); 4479 shiftCount = aExp - 0x403E; 4480 if ( 0 <= shiftCount ) { 4481 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 4482 if ( ( a.high != 0xC03E ) || aSig ) { 4483 float_raise(float_flag_invalid, status); 4484 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 4485 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4486 } 4487 } 4488 return (int64_t) LIT64( 0x8000000000000000 ); 4489 } 4490 else if ( aExp < 0x3FFF ) { 4491 if (aExp | aSig) { 4492 status->float_exception_flags |= float_flag_inexact; 4493 } 4494 return 0; 4495 } 4496 z = aSig>>( - shiftCount ); 4497 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 4498 status->float_exception_flags |= float_flag_inexact; 4499 } 4500 if ( aSign ) z = - z; 4501 return z; 4502 4503 } 4504 4505 /*---------------------------------------------------------------------------- 4506 | Returns the result of converting the extended double-precision floating- 4507 | point value `a' to the single-precision floating-point format. The 4508 | conversion is performed according to the IEC/IEEE Standard for Binary 4509 | Floating-Point Arithmetic. 4510 *----------------------------------------------------------------------------*/ 4511 4512 float32 floatx80_to_float32(floatx80 a, float_status *status) 4513 { 4514 flag aSign; 4515 int32_t aExp; 4516 uint64_t aSig; 4517 4518 if (floatx80_invalid_encoding(a)) { 4519 float_raise(float_flag_invalid, status); 4520 return float32_default_nan(status); 4521 } 4522 aSig = extractFloatx80Frac( a ); 4523 aExp = extractFloatx80Exp( a ); 4524 aSign = extractFloatx80Sign( a ); 4525 if ( aExp == 0x7FFF ) { 4526 if ( (uint64_t) ( aSig<<1 ) ) { 4527 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 4528 } 4529 return packFloat32( aSign, 0xFF, 0 ); 4530 } 4531 shift64RightJamming( aSig, 33, &aSig ); 4532 if ( aExp || aSig ) aExp -= 0x3F81; 4533 return roundAndPackFloat32(aSign, aExp, aSig, status); 4534 4535 } 4536 4537 /*---------------------------------------------------------------------------- 4538 | Returns the result of converting the extended double-precision floating- 4539 | point value `a' to the double-precision floating-point format. The 4540 | conversion is performed according to the IEC/IEEE Standard for Binary 4541 | Floating-Point Arithmetic. 4542 *----------------------------------------------------------------------------*/ 4543 4544 float64 floatx80_to_float64(floatx80 a, float_status *status) 4545 { 4546 flag aSign; 4547 int32_t aExp; 4548 uint64_t aSig, zSig; 4549 4550 if (floatx80_invalid_encoding(a)) { 4551 float_raise(float_flag_invalid, status); 4552 return float64_default_nan(status); 4553 } 4554 aSig = extractFloatx80Frac( a ); 4555 aExp = extractFloatx80Exp( a ); 4556 aSign = extractFloatx80Sign( a ); 4557 if ( aExp == 0x7FFF ) { 4558 if ( (uint64_t) ( aSig<<1 ) ) { 4559 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 4560 } 4561 return packFloat64( aSign, 0x7FF, 0 ); 4562 } 4563 shift64RightJamming( aSig, 1, &zSig ); 4564 if ( aExp || aSig ) aExp -= 0x3C01; 4565 return roundAndPackFloat64(aSign, aExp, zSig, status); 4566 4567 } 4568 4569 /*---------------------------------------------------------------------------- 4570 | Returns the result of converting the extended double-precision floating- 4571 | point value `a' to the quadruple-precision floating-point format. The 4572 | conversion is performed according to the IEC/IEEE Standard for Binary 4573 | Floating-Point Arithmetic. 4574 *----------------------------------------------------------------------------*/ 4575 4576 float128 floatx80_to_float128(floatx80 a, float_status *status) 4577 { 4578 flag aSign; 4579 int aExp; 4580 uint64_t aSig, zSig0, zSig1; 4581 4582 if (floatx80_invalid_encoding(a)) { 4583 float_raise(float_flag_invalid, status); 4584 return float128_default_nan(status); 4585 } 4586 aSig = extractFloatx80Frac( a ); 4587 aExp = extractFloatx80Exp( a ); 4588 aSign = extractFloatx80Sign( a ); 4589 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 4590 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 4591 } 4592 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 4593 return packFloat128( aSign, aExp, zSig0, zSig1 ); 4594 4595 } 4596 4597 /*---------------------------------------------------------------------------- 4598 | Rounds the extended double-precision floating-point value `a' 4599 | to the precision provided by floatx80_rounding_precision and returns the 4600 | result as an extended double-precision floating-point value. 4601 | The operation is performed according to the IEC/IEEE Standard for Binary 4602 | Floating-Point Arithmetic. 4603 *----------------------------------------------------------------------------*/ 4604 4605 floatx80 floatx80_round(floatx80 a, float_status *status) 4606 { 4607 return roundAndPackFloatx80(status->floatx80_rounding_precision, 4608 extractFloatx80Sign(a), 4609 extractFloatx80Exp(a), 4610 extractFloatx80Frac(a), 0, status); 4611 } 4612 4613 /*---------------------------------------------------------------------------- 4614 | Rounds the extended double-precision floating-point value `a' to an integer, 4615 | and returns the result as an extended quadruple-precision floating-point 4616 | value. The operation is performed according to the IEC/IEEE Standard for 4617 | Binary Floating-Point Arithmetic. 4618 *----------------------------------------------------------------------------*/ 4619 4620 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 4621 { 4622 flag aSign; 4623 int32_t aExp; 4624 uint64_t lastBitMask, roundBitsMask; 4625 floatx80 z; 4626 4627 if (floatx80_invalid_encoding(a)) { 4628 float_raise(float_flag_invalid, status); 4629 return floatx80_default_nan(status); 4630 } 4631 aExp = extractFloatx80Exp( a ); 4632 if ( 0x403E <= aExp ) { 4633 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 4634 return propagateFloatx80NaN(a, a, status); 4635 } 4636 return a; 4637 } 4638 if ( aExp < 0x3FFF ) { 4639 if ( ( aExp == 0 ) 4640 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 4641 return a; 4642 } 4643 status->float_exception_flags |= float_flag_inexact; 4644 aSign = extractFloatx80Sign( a ); 4645 switch (status->float_rounding_mode) { 4646 case float_round_nearest_even: 4647 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 4648 ) { 4649 return 4650 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4651 } 4652 break; 4653 case float_round_ties_away: 4654 if (aExp == 0x3FFE) { 4655 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 4656 } 4657 break; 4658 case float_round_down: 4659 return 4660 aSign ? 4661 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 4662 : packFloatx80( 0, 0, 0 ); 4663 case float_round_up: 4664 return 4665 aSign ? packFloatx80( 1, 0, 0 ) 4666 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4667 } 4668 return packFloatx80( aSign, 0, 0 ); 4669 } 4670 lastBitMask = 1; 4671 lastBitMask <<= 0x403E - aExp; 4672 roundBitsMask = lastBitMask - 1; 4673 z = a; 4674 switch (status->float_rounding_mode) { 4675 case float_round_nearest_even: 4676 z.low += lastBitMask>>1; 4677 if ((z.low & roundBitsMask) == 0) { 4678 z.low &= ~lastBitMask; 4679 } 4680 break; 4681 case float_round_ties_away: 4682 z.low += lastBitMask >> 1; 4683 break; 4684 case float_round_to_zero: 4685 break; 4686 case float_round_up: 4687 if (!extractFloatx80Sign(z)) { 4688 z.low += roundBitsMask; 4689 } 4690 break; 4691 case float_round_down: 4692 if (extractFloatx80Sign(z)) { 4693 z.low += roundBitsMask; 4694 } 4695 break; 4696 default: 4697 abort(); 4698 } 4699 z.low &= ~ roundBitsMask; 4700 if ( z.low == 0 ) { 4701 ++z.high; 4702 z.low = LIT64( 0x8000000000000000 ); 4703 } 4704 if (z.low != a.low) { 4705 status->float_exception_flags |= float_flag_inexact; 4706 } 4707 return z; 4708 4709 } 4710 4711 /*---------------------------------------------------------------------------- 4712 | Returns the result of adding the absolute values of the extended double- 4713 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 4714 | negated before being returned. `zSign' is ignored if the result is a NaN. 4715 | The addition is performed according to the IEC/IEEE Standard for Binary 4716 | Floating-Point Arithmetic. 4717 *----------------------------------------------------------------------------*/ 4718 4719 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 4720 float_status *status) 4721 { 4722 int32_t aExp, bExp, zExp; 4723 uint64_t aSig, bSig, zSig0, zSig1; 4724 int32_t expDiff; 4725 4726 aSig = extractFloatx80Frac( a ); 4727 aExp = extractFloatx80Exp( a ); 4728 bSig = extractFloatx80Frac( b ); 4729 bExp = extractFloatx80Exp( b ); 4730 expDiff = aExp - bExp; 4731 if ( 0 < expDiff ) { 4732 if ( aExp == 0x7FFF ) { 4733 if ((uint64_t)(aSig << 1)) { 4734 return propagateFloatx80NaN(a, b, status); 4735 } 4736 return a; 4737 } 4738 if ( bExp == 0 ) --expDiff; 4739 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 4740 zExp = aExp; 4741 } 4742 else if ( expDiff < 0 ) { 4743 if ( bExp == 0x7FFF ) { 4744 if ((uint64_t)(bSig << 1)) { 4745 return propagateFloatx80NaN(a, b, status); 4746 } 4747 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4748 } 4749 if ( aExp == 0 ) ++expDiff; 4750 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 4751 zExp = bExp; 4752 } 4753 else { 4754 if ( aExp == 0x7FFF ) { 4755 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 4756 return propagateFloatx80NaN(a, b, status); 4757 } 4758 return a; 4759 } 4760 zSig1 = 0; 4761 zSig0 = aSig + bSig; 4762 if ( aExp == 0 ) { 4763 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 4764 goto roundAndPack; 4765 } 4766 zExp = aExp; 4767 goto shiftRight1; 4768 } 4769 zSig0 = aSig + bSig; 4770 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 4771 shiftRight1: 4772 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 4773 zSig0 |= LIT64( 0x8000000000000000 ); 4774 ++zExp; 4775 roundAndPack: 4776 return roundAndPackFloatx80(status->floatx80_rounding_precision, 4777 zSign, zExp, zSig0, zSig1, status); 4778 } 4779 4780 /*---------------------------------------------------------------------------- 4781 | Returns the result of subtracting the absolute values of the extended 4782 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 4783 | difference is negated before being returned. `zSign' is ignored if the 4784 | result is a NaN. The subtraction is performed according to the IEC/IEEE 4785 | Standard for Binary Floating-Point Arithmetic. 4786 *----------------------------------------------------------------------------*/ 4787 4788 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 4789 float_status *status) 4790 { 4791 int32_t aExp, bExp, zExp; 4792 uint64_t aSig, bSig, zSig0, zSig1; 4793 int32_t expDiff; 4794 4795 aSig = extractFloatx80Frac( a ); 4796 aExp = extractFloatx80Exp( a ); 4797 bSig = extractFloatx80Frac( b ); 4798 bExp = extractFloatx80Exp( b ); 4799 expDiff = aExp - bExp; 4800 if ( 0 < expDiff ) goto aExpBigger; 4801 if ( expDiff < 0 ) goto bExpBigger; 4802 if ( aExp == 0x7FFF ) { 4803 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 4804 return propagateFloatx80NaN(a, b, status); 4805 } 4806 float_raise(float_flag_invalid, status); 4807 return floatx80_default_nan(status); 4808 } 4809 if ( aExp == 0 ) { 4810 aExp = 1; 4811 bExp = 1; 4812 } 4813 zSig1 = 0; 4814 if ( bSig < aSig ) goto aBigger; 4815 if ( aSig < bSig ) goto bBigger; 4816 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 4817 bExpBigger: 4818 if ( bExp == 0x7FFF ) { 4819 if ((uint64_t)(bSig << 1)) { 4820 return propagateFloatx80NaN(a, b, status); 4821 } 4822 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4823 } 4824 if ( aExp == 0 ) ++expDiff; 4825 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 4826 bBigger: 4827 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 4828 zExp = bExp; 4829 zSign ^= 1; 4830 goto normalizeRoundAndPack; 4831 aExpBigger: 4832 if ( aExp == 0x7FFF ) { 4833 if ((uint64_t)(aSig << 1)) { 4834 return propagateFloatx80NaN(a, b, status); 4835 } 4836 return a; 4837 } 4838 if ( bExp == 0 ) --expDiff; 4839 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 4840 aBigger: 4841 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 4842 zExp = aExp; 4843 normalizeRoundAndPack: 4844 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 4845 zSign, zExp, zSig0, zSig1, status); 4846 } 4847 4848 /*---------------------------------------------------------------------------- 4849 | Returns the result of adding the extended double-precision floating-point 4850 | values `a' and `b'. The operation is performed according to the IEC/IEEE 4851 | Standard for Binary Floating-Point Arithmetic. 4852 *----------------------------------------------------------------------------*/ 4853 4854 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 4855 { 4856 flag aSign, bSign; 4857 4858 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 4859 float_raise(float_flag_invalid, status); 4860 return floatx80_default_nan(status); 4861 } 4862 aSign = extractFloatx80Sign( a ); 4863 bSign = extractFloatx80Sign( b ); 4864 if ( aSign == bSign ) { 4865 return addFloatx80Sigs(a, b, aSign, status); 4866 } 4867 else { 4868 return subFloatx80Sigs(a, b, aSign, status); 4869 } 4870 4871 } 4872 4873 /*---------------------------------------------------------------------------- 4874 | Returns the result of subtracting the extended double-precision floating- 4875 | point values `a' and `b'. The operation is performed according to the 4876 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4877 *----------------------------------------------------------------------------*/ 4878 4879 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 4880 { 4881 flag aSign, bSign; 4882 4883 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 4884 float_raise(float_flag_invalid, status); 4885 return floatx80_default_nan(status); 4886 } 4887 aSign = extractFloatx80Sign( a ); 4888 bSign = extractFloatx80Sign( b ); 4889 if ( aSign == bSign ) { 4890 return subFloatx80Sigs(a, b, aSign, status); 4891 } 4892 else { 4893 return addFloatx80Sigs(a, b, aSign, status); 4894 } 4895 4896 } 4897 4898 /*---------------------------------------------------------------------------- 4899 | Returns the result of multiplying the extended double-precision floating- 4900 | point values `a' and `b'. The operation is performed according to the 4901 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4902 *----------------------------------------------------------------------------*/ 4903 4904 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 4905 { 4906 flag aSign, bSign, zSign; 4907 int32_t aExp, bExp, zExp; 4908 uint64_t aSig, bSig, zSig0, zSig1; 4909 4910 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 4911 float_raise(float_flag_invalid, status); 4912 return floatx80_default_nan(status); 4913 } 4914 aSig = extractFloatx80Frac( a ); 4915 aExp = extractFloatx80Exp( a ); 4916 aSign = extractFloatx80Sign( a ); 4917 bSig = extractFloatx80Frac( b ); 4918 bExp = extractFloatx80Exp( b ); 4919 bSign = extractFloatx80Sign( b ); 4920 zSign = aSign ^ bSign; 4921 if ( aExp == 0x7FFF ) { 4922 if ( (uint64_t) ( aSig<<1 ) 4923 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 4924 return propagateFloatx80NaN(a, b, status); 4925 } 4926 if ( ( bExp | bSig ) == 0 ) goto invalid; 4927 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4928 } 4929 if ( bExp == 0x7FFF ) { 4930 if ((uint64_t)(bSig << 1)) { 4931 return propagateFloatx80NaN(a, b, status); 4932 } 4933 if ( ( aExp | aSig ) == 0 ) { 4934 invalid: 4935 float_raise(float_flag_invalid, status); 4936 return floatx80_default_nan(status); 4937 } 4938 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4939 } 4940 if ( aExp == 0 ) { 4941 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 4942 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 4943 } 4944 if ( bExp == 0 ) { 4945 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 4946 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 4947 } 4948 zExp = aExp + bExp - 0x3FFE; 4949 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 4950 if ( 0 < (int64_t) zSig0 ) { 4951 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 4952 --zExp; 4953 } 4954 return roundAndPackFloatx80(status->floatx80_rounding_precision, 4955 zSign, zExp, zSig0, zSig1, status); 4956 } 4957 4958 /*---------------------------------------------------------------------------- 4959 | Returns the result of dividing the extended double-precision floating-point 4960 | value `a' by the corresponding value `b'. The operation is performed 4961 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4962 *----------------------------------------------------------------------------*/ 4963 4964 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 4965 { 4966 flag aSign, bSign, zSign; 4967 int32_t aExp, bExp, zExp; 4968 uint64_t aSig, bSig, zSig0, zSig1; 4969 uint64_t rem0, rem1, rem2, term0, term1, term2; 4970 4971 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 4972 float_raise(float_flag_invalid, status); 4973 return floatx80_default_nan(status); 4974 } 4975 aSig = extractFloatx80Frac( a ); 4976 aExp = extractFloatx80Exp( a ); 4977 aSign = extractFloatx80Sign( a ); 4978 bSig = extractFloatx80Frac( b ); 4979 bExp = extractFloatx80Exp( b ); 4980 bSign = extractFloatx80Sign( b ); 4981 zSign = aSign ^ bSign; 4982 if ( aExp == 0x7FFF ) { 4983 if ((uint64_t)(aSig << 1)) { 4984 return propagateFloatx80NaN(a, b, status); 4985 } 4986 if ( bExp == 0x7FFF ) { 4987 if ((uint64_t)(bSig << 1)) { 4988 return propagateFloatx80NaN(a, b, status); 4989 } 4990 goto invalid; 4991 } 4992 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4993 } 4994 if ( bExp == 0x7FFF ) { 4995 if ((uint64_t)(bSig << 1)) { 4996 return propagateFloatx80NaN(a, b, status); 4997 } 4998 return packFloatx80( zSign, 0, 0 ); 4999 } 5000 if ( bExp == 0 ) { 5001 if ( bSig == 0 ) { 5002 if ( ( aExp | aSig ) == 0 ) { 5003 invalid: 5004 float_raise(float_flag_invalid, status); 5005 return floatx80_default_nan(status); 5006 } 5007 float_raise(float_flag_divbyzero, status); 5008 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5009 } 5010 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5011 } 5012 if ( aExp == 0 ) { 5013 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5014 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5015 } 5016 zExp = aExp - bExp + 0x3FFE; 5017 rem1 = 0; 5018 if ( bSig <= aSig ) { 5019 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 5020 ++zExp; 5021 } 5022 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 5023 mul64To128( bSig, zSig0, &term0, &term1 ); 5024 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 5025 while ( (int64_t) rem0 < 0 ) { 5026 --zSig0; 5027 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 5028 } 5029 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 5030 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 5031 mul64To128( bSig, zSig1, &term1, &term2 ); 5032 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5033 while ( (int64_t) rem1 < 0 ) { 5034 --zSig1; 5035 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5036 } 5037 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5038 } 5039 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5040 zSign, zExp, zSig0, zSig1, status); 5041 } 5042 5043 /*---------------------------------------------------------------------------- 5044 | Returns the remainder of the extended double-precision floating-point value 5045 | `a' with respect to the corresponding value `b'. The operation is performed 5046 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5047 *----------------------------------------------------------------------------*/ 5048 5049 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 5050 { 5051 flag aSign, zSign; 5052 int32_t aExp, bExp, expDiff; 5053 uint64_t aSig0, aSig1, bSig; 5054 uint64_t q, term0, term1, alternateASig0, alternateASig1; 5055 5056 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5057 float_raise(float_flag_invalid, status); 5058 return floatx80_default_nan(status); 5059 } 5060 aSig0 = extractFloatx80Frac( a ); 5061 aExp = extractFloatx80Exp( a ); 5062 aSign = extractFloatx80Sign( a ); 5063 bSig = extractFloatx80Frac( b ); 5064 bExp = extractFloatx80Exp( b ); 5065 if ( aExp == 0x7FFF ) { 5066 if ( (uint64_t) ( aSig0<<1 ) 5067 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5068 return propagateFloatx80NaN(a, b, status); 5069 } 5070 goto invalid; 5071 } 5072 if ( bExp == 0x7FFF ) { 5073 if ((uint64_t)(bSig << 1)) { 5074 return propagateFloatx80NaN(a, b, status); 5075 } 5076 return a; 5077 } 5078 if ( bExp == 0 ) { 5079 if ( bSig == 0 ) { 5080 invalid: 5081 float_raise(float_flag_invalid, status); 5082 return floatx80_default_nan(status); 5083 } 5084 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5085 } 5086 if ( aExp == 0 ) { 5087 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5088 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5089 } 5090 bSig |= LIT64( 0x8000000000000000 ); 5091 zSign = aSign; 5092 expDiff = aExp - bExp; 5093 aSig1 = 0; 5094 if ( expDiff < 0 ) { 5095 if ( expDiff < -1 ) return a; 5096 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 5097 expDiff = 0; 5098 } 5099 q = ( bSig <= aSig0 ); 5100 if ( q ) aSig0 -= bSig; 5101 expDiff -= 64; 5102 while ( 0 < expDiff ) { 5103 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5104 q = ( 2 < q ) ? q - 2 : 0; 5105 mul64To128( bSig, q, &term0, &term1 ); 5106 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5107 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 5108 expDiff -= 62; 5109 } 5110 expDiff += 64; 5111 if ( 0 < expDiff ) { 5112 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5113 q = ( 2 < q ) ? q - 2 : 0; 5114 q >>= 64 - expDiff; 5115 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 5116 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5117 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 5118 while ( le128( term0, term1, aSig0, aSig1 ) ) { 5119 ++q; 5120 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5121 } 5122 } 5123 else { 5124 term1 = 0; 5125 term0 = bSig; 5126 } 5127 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 5128 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5129 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5130 && ( q & 1 ) ) 5131 ) { 5132 aSig0 = alternateASig0; 5133 aSig1 = alternateASig1; 5134 zSign = ! zSign; 5135 } 5136 return 5137 normalizeRoundAndPackFloatx80( 5138 80, zSign, bExp + expDiff, aSig0, aSig1, status); 5139 5140 } 5141 5142 /*---------------------------------------------------------------------------- 5143 | Returns the square root of the extended double-precision floating-point 5144 | value `a'. The operation is performed according to the IEC/IEEE Standard 5145 | for Binary Floating-Point Arithmetic. 5146 *----------------------------------------------------------------------------*/ 5147 5148 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 5149 { 5150 flag aSign; 5151 int32_t aExp, zExp; 5152 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 5153 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5154 5155 if (floatx80_invalid_encoding(a)) { 5156 float_raise(float_flag_invalid, status); 5157 return floatx80_default_nan(status); 5158 } 5159 aSig0 = extractFloatx80Frac( a ); 5160 aExp = extractFloatx80Exp( a ); 5161 aSign = extractFloatx80Sign( a ); 5162 if ( aExp == 0x7FFF ) { 5163 if ((uint64_t)(aSig0 << 1)) { 5164 return propagateFloatx80NaN(a, a, status); 5165 } 5166 if ( ! aSign ) return a; 5167 goto invalid; 5168 } 5169 if ( aSign ) { 5170 if ( ( aExp | aSig0 ) == 0 ) return a; 5171 invalid: 5172 float_raise(float_flag_invalid, status); 5173 return floatx80_default_nan(status); 5174 } 5175 if ( aExp == 0 ) { 5176 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 5177 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5178 } 5179 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 5180 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 5181 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 5182 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5183 doubleZSig0 = zSig0<<1; 5184 mul64To128( zSig0, zSig0, &term0, &term1 ); 5185 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5186 while ( (int64_t) rem0 < 0 ) { 5187 --zSig0; 5188 doubleZSig0 -= 2; 5189 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5190 } 5191 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5192 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 5193 if ( zSig1 == 0 ) zSig1 = 1; 5194 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5195 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5196 mul64To128( zSig1, zSig1, &term2, &term3 ); 5197 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5198 while ( (int64_t) rem1 < 0 ) { 5199 --zSig1; 5200 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5201 term3 |= 1; 5202 term2 |= doubleZSig0; 5203 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5204 } 5205 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5206 } 5207 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 5208 zSig0 |= doubleZSig0; 5209 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5210 0, zExp, zSig0, zSig1, status); 5211 } 5212 5213 /*---------------------------------------------------------------------------- 5214 | Returns 1 if the extended double-precision floating-point value `a' is equal 5215 | to the corresponding value `b', and 0 otherwise. The invalid exception is 5216 | raised if either operand is a NaN. Otherwise, the comparison is performed 5217 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5218 *----------------------------------------------------------------------------*/ 5219 5220 int floatx80_eq(floatx80 a, floatx80 b, float_status *status) 5221 { 5222 5223 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5224 || (extractFloatx80Exp(a) == 0x7FFF 5225 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5226 || (extractFloatx80Exp(b) == 0x7FFF 5227 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5228 ) { 5229 float_raise(float_flag_invalid, status); 5230 return 0; 5231 } 5232 return 5233 ( a.low == b.low ) 5234 && ( ( a.high == b.high ) 5235 || ( ( a.low == 0 ) 5236 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5237 ); 5238 5239 } 5240 5241 /*---------------------------------------------------------------------------- 5242 | Returns 1 if the extended double-precision floating-point value `a' is 5243 | less than or equal to the corresponding value `b', and 0 otherwise. The 5244 | invalid exception is raised if either operand is a NaN. The comparison is 5245 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5246 | Arithmetic. 5247 *----------------------------------------------------------------------------*/ 5248 5249 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 5250 { 5251 flag aSign, bSign; 5252 5253 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5254 || (extractFloatx80Exp(a) == 0x7FFF 5255 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5256 || (extractFloatx80Exp(b) == 0x7FFF 5257 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5258 ) { 5259 float_raise(float_flag_invalid, status); 5260 return 0; 5261 } 5262 aSign = extractFloatx80Sign( a ); 5263 bSign = extractFloatx80Sign( b ); 5264 if ( aSign != bSign ) { 5265 return 5266 aSign 5267 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5268 == 0 ); 5269 } 5270 return 5271 aSign ? le128( b.high, b.low, a.high, a.low ) 5272 : le128( a.high, a.low, b.high, b.low ); 5273 5274 } 5275 5276 /*---------------------------------------------------------------------------- 5277 | Returns 1 if the extended double-precision floating-point value `a' is 5278 | less than the corresponding value `b', and 0 otherwise. The invalid 5279 | exception is raised if either operand is a NaN. The comparison is performed 5280 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5281 *----------------------------------------------------------------------------*/ 5282 5283 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 5284 { 5285 flag aSign, bSign; 5286 5287 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5288 || (extractFloatx80Exp(a) == 0x7FFF 5289 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5290 || (extractFloatx80Exp(b) == 0x7FFF 5291 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5292 ) { 5293 float_raise(float_flag_invalid, status); 5294 return 0; 5295 } 5296 aSign = extractFloatx80Sign( a ); 5297 bSign = extractFloatx80Sign( b ); 5298 if ( aSign != bSign ) { 5299 return 5300 aSign 5301 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5302 != 0 ); 5303 } 5304 return 5305 aSign ? lt128( b.high, b.low, a.high, a.low ) 5306 : lt128( a.high, a.low, b.high, b.low ); 5307 5308 } 5309 5310 /*---------------------------------------------------------------------------- 5311 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5312 | cannot be compared, and 0 otherwise. The invalid exception is raised if 5313 | either operand is a NaN. The comparison is performed according to the 5314 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5315 *----------------------------------------------------------------------------*/ 5316 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 5317 { 5318 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5319 || (extractFloatx80Exp(a) == 0x7FFF 5320 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5321 || (extractFloatx80Exp(b) == 0x7FFF 5322 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5323 ) { 5324 float_raise(float_flag_invalid, status); 5325 return 1; 5326 } 5327 return 0; 5328 } 5329 5330 /*---------------------------------------------------------------------------- 5331 | Returns 1 if the extended double-precision floating-point value `a' is 5332 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5333 | cause an exception. The comparison is performed according to the IEC/IEEE 5334 | Standard for Binary Floating-Point Arithmetic. 5335 *----------------------------------------------------------------------------*/ 5336 5337 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 5338 { 5339 5340 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5341 float_raise(float_flag_invalid, status); 5342 return 0; 5343 } 5344 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5345 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5346 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5347 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5348 ) { 5349 if (floatx80_is_signaling_nan(a, status) 5350 || floatx80_is_signaling_nan(b, status)) { 5351 float_raise(float_flag_invalid, status); 5352 } 5353 return 0; 5354 } 5355 return 5356 ( a.low == b.low ) 5357 && ( ( a.high == b.high ) 5358 || ( ( a.low == 0 ) 5359 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5360 ); 5361 5362 } 5363 5364 /*---------------------------------------------------------------------------- 5365 | Returns 1 if the extended double-precision floating-point value `a' is less 5366 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 5367 | do not cause an exception. Otherwise, the comparison is performed according 5368 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5369 *----------------------------------------------------------------------------*/ 5370 5371 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status) 5372 { 5373 flag aSign, bSign; 5374 5375 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5376 float_raise(float_flag_invalid, status); 5377 return 0; 5378 } 5379 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5380 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5381 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5382 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5383 ) { 5384 if (floatx80_is_signaling_nan(a, status) 5385 || floatx80_is_signaling_nan(b, status)) { 5386 float_raise(float_flag_invalid, status); 5387 } 5388 return 0; 5389 } 5390 aSign = extractFloatx80Sign( a ); 5391 bSign = extractFloatx80Sign( b ); 5392 if ( aSign != bSign ) { 5393 return 5394 aSign 5395 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5396 == 0 ); 5397 } 5398 return 5399 aSign ? le128( b.high, b.low, a.high, a.low ) 5400 : le128( a.high, a.low, b.high, b.low ); 5401 5402 } 5403 5404 /*---------------------------------------------------------------------------- 5405 | Returns 1 if the extended double-precision floating-point value `a' is less 5406 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 5407 | an exception. Otherwise, the comparison is performed according to the 5408 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5409 *----------------------------------------------------------------------------*/ 5410 5411 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 5412 { 5413 flag aSign, bSign; 5414 5415 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5416 float_raise(float_flag_invalid, status); 5417 return 0; 5418 } 5419 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5420 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5421 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5422 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5423 ) { 5424 if (floatx80_is_signaling_nan(a, status) 5425 || floatx80_is_signaling_nan(b, status)) { 5426 float_raise(float_flag_invalid, status); 5427 } 5428 return 0; 5429 } 5430 aSign = extractFloatx80Sign( a ); 5431 bSign = extractFloatx80Sign( b ); 5432 if ( aSign != bSign ) { 5433 return 5434 aSign 5435 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5436 != 0 ); 5437 } 5438 return 5439 aSign ? lt128( b.high, b.low, a.high, a.low ) 5440 : lt128( a.high, a.low, b.high, b.low ); 5441 5442 } 5443 5444 /*---------------------------------------------------------------------------- 5445 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5446 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 5447 | The comparison is performed according to the IEC/IEEE Standard for Binary 5448 | Floating-Point Arithmetic. 5449 *----------------------------------------------------------------------------*/ 5450 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 5451 { 5452 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5453 float_raise(float_flag_invalid, status); 5454 return 1; 5455 } 5456 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5457 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5458 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5459 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5460 ) { 5461 if (floatx80_is_signaling_nan(a, status) 5462 || floatx80_is_signaling_nan(b, status)) { 5463 float_raise(float_flag_invalid, status); 5464 } 5465 return 1; 5466 } 5467 return 0; 5468 } 5469 5470 /*---------------------------------------------------------------------------- 5471 | Returns the result of converting the quadruple-precision floating-point 5472 | value `a' to the 32-bit two's complement integer format. The conversion 5473 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5474 | Arithmetic---which means in particular that the conversion is rounded 5475 | according to the current rounding mode. If `a' is a NaN, the largest 5476 | positive integer is returned. Otherwise, if the conversion overflows, the 5477 | largest integer with the same sign as `a' is returned. 5478 *----------------------------------------------------------------------------*/ 5479 5480 int32_t float128_to_int32(float128 a, float_status *status) 5481 { 5482 flag aSign; 5483 int32_t aExp, shiftCount; 5484 uint64_t aSig0, aSig1; 5485 5486 aSig1 = extractFloat128Frac1( a ); 5487 aSig0 = extractFloat128Frac0( a ); 5488 aExp = extractFloat128Exp( a ); 5489 aSign = extractFloat128Sign( a ); 5490 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 5491 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5492 aSig0 |= ( aSig1 != 0 ); 5493 shiftCount = 0x4028 - aExp; 5494 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 5495 return roundAndPackInt32(aSign, aSig0, status); 5496 5497 } 5498 5499 /*---------------------------------------------------------------------------- 5500 | Returns the result of converting the quadruple-precision floating-point 5501 | value `a' to the 32-bit two's complement integer format. The conversion 5502 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5503 | Arithmetic, except that the conversion is always rounded toward zero. If 5504 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 5505 | conversion overflows, the largest integer with the same sign as `a' is 5506 | returned. 5507 *----------------------------------------------------------------------------*/ 5508 5509 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 5510 { 5511 flag aSign; 5512 int32_t aExp, shiftCount; 5513 uint64_t aSig0, aSig1, savedASig; 5514 int32_t z; 5515 5516 aSig1 = extractFloat128Frac1( a ); 5517 aSig0 = extractFloat128Frac0( a ); 5518 aExp = extractFloat128Exp( a ); 5519 aSign = extractFloat128Sign( a ); 5520 aSig0 |= ( aSig1 != 0 ); 5521 if ( 0x401E < aExp ) { 5522 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 5523 goto invalid; 5524 } 5525 else if ( aExp < 0x3FFF ) { 5526 if (aExp || aSig0) { 5527 status->float_exception_flags |= float_flag_inexact; 5528 } 5529 return 0; 5530 } 5531 aSig0 |= LIT64( 0x0001000000000000 ); 5532 shiftCount = 0x402F - aExp; 5533 savedASig = aSig0; 5534 aSig0 >>= shiftCount; 5535 z = aSig0; 5536 if ( aSign ) z = - z; 5537 if ( ( z < 0 ) ^ aSign ) { 5538 invalid: 5539 float_raise(float_flag_invalid, status); 5540 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5541 } 5542 if ( ( aSig0<<shiftCount ) != savedASig ) { 5543 status->float_exception_flags |= float_flag_inexact; 5544 } 5545 return z; 5546 5547 } 5548 5549 /*---------------------------------------------------------------------------- 5550 | Returns the result of converting the quadruple-precision floating-point 5551 | value `a' to the 64-bit two's complement integer format. The conversion 5552 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5553 | Arithmetic---which means in particular that the conversion is rounded 5554 | according to the current rounding mode. If `a' is a NaN, the largest 5555 | positive integer is returned. Otherwise, if the conversion overflows, the 5556 | largest integer with the same sign as `a' is returned. 5557 *----------------------------------------------------------------------------*/ 5558 5559 int64_t float128_to_int64(float128 a, float_status *status) 5560 { 5561 flag aSign; 5562 int32_t aExp, shiftCount; 5563 uint64_t aSig0, aSig1; 5564 5565 aSig1 = extractFloat128Frac1( a ); 5566 aSig0 = extractFloat128Frac0( a ); 5567 aExp = extractFloat128Exp( a ); 5568 aSign = extractFloat128Sign( a ); 5569 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5570 shiftCount = 0x402F - aExp; 5571 if ( shiftCount <= 0 ) { 5572 if ( 0x403E < aExp ) { 5573 float_raise(float_flag_invalid, status); 5574 if ( ! aSign 5575 || ( ( aExp == 0x7FFF ) 5576 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 5577 ) 5578 ) { 5579 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5580 } 5581 return (int64_t) LIT64( 0x8000000000000000 ); 5582 } 5583 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 5584 } 5585 else { 5586 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 5587 } 5588 return roundAndPackInt64(aSign, aSig0, aSig1, status); 5589 5590 } 5591 5592 /*---------------------------------------------------------------------------- 5593 | Returns the result of converting the quadruple-precision floating-point 5594 | value `a' to the 64-bit two's complement integer format. The conversion 5595 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5596 | Arithmetic, except that the conversion is always rounded toward zero. 5597 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 5598 | the conversion overflows, the largest integer with the same sign as `a' is 5599 | returned. 5600 *----------------------------------------------------------------------------*/ 5601 5602 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 5603 { 5604 flag aSign; 5605 int32_t aExp, shiftCount; 5606 uint64_t aSig0, aSig1; 5607 int64_t z; 5608 5609 aSig1 = extractFloat128Frac1( a ); 5610 aSig0 = extractFloat128Frac0( a ); 5611 aExp = extractFloat128Exp( a ); 5612 aSign = extractFloat128Sign( a ); 5613 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5614 shiftCount = aExp - 0x402F; 5615 if ( 0 < shiftCount ) { 5616 if ( 0x403E <= aExp ) { 5617 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 5618 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 5619 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 5620 if (aSig1) { 5621 status->float_exception_flags |= float_flag_inexact; 5622 } 5623 } 5624 else { 5625 float_raise(float_flag_invalid, status); 5626 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 5627 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5628 } 5629 } 5630 return (int64_t) LIT64( 0x8000000000000000 ); 5631 } 5632 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 5633 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 5634 status->float_exception_flags |= float_flag_inexact; 5635 } 5636 } 5637 else { 5638 if ( aExp < 0x3FFF ) { 5639 if ( aExp | aSig0 | aSig1 ) { 5640 status->float_exception_flags |= float_flag_inexact; 5641 } 5642 return 0; 5643 } 5644 z = aSig0>>( - shiftCount ); 5645 if ( aSig1 5646 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 5647 status->float_exception_flags |= float_flag_inexact; 5648 } 5649 } 5650 if ( aSign ) z = - z; 5651 return z; 5652 5653 } 5654 5655 /*---------------------------------------------------------------------------- 5656 | Returns the result of converting the quadruple-precision floating-point value 5657 | `a' to the 64-bit unsigned integer format. The conversion is 5658 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5659 | Arithmetic---which means in particular that the conversion is rounded 5660 | according to the current rounding mode. If `a' is a NaN, the largest 5661 | positive integer is returned. If the conversion overflows, the 5662 | largest unsigned integer is returned. If 'a' is negative, the value is 5663 | rounded and zero is returned; negative values that do not round to zero 5664 | will raise the inexact exception. 5665 *----------------------------------------------------------------------------*/ 5666 5667 uint64_t float128_to_uint64(float128 a, float_status *status) 5668 { 5669 flag aSign; 5670 int aExp; 5671 int shiftCount; 5672 uint64_t aSig0, aSig1; 5673 5674 aSig0 = extractFloat128Frac0(a); 5675 aSig1 = extractFloat128Frac1(a); 5676 aExp = extractFloat128Exp(a); 5677 aSign = extractFloat128Sign(a); 5678 if (aSign && (aExp > 0x3FFE)) { 5679 float_raise(float_flag_invalid, status); 5680 if (float128_is_any_nan(a)) { 5681 return LIT64(0xFFFFFFFFFFFFFFFF); 5682 } else { 5683 return 0; 5684 } 5685 } 5686 if (aExp) { 5687 aSig0 |= LIT64(0x0001000000000000); 5688 } 5689 shiftCount = 0x402F - aExp; 5690 if (shiftCount <= 0) { 5691 if (0x403E < aExp) { 5692 float_raise(float_flag_invalid, status); 5693 return LIT64(0xFFFFFFFFFFFFFFFF); 5694 } 5695 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 5696 } else { 5697 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 5698 } 5699 return roundAndPackUint64(aSign, aSig0, aSig1, status); 5700 } 5701 5702 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 5703 { 5704 uint64_t v; 5705 signed char current_rounding_mode = status->float_rounding_mode; 5706 5707 set_float_rounding_mode(float_round_to_zero, status); 5708 v = float128_to_uint64(a, status); 5709 set_float_rounding_mode(current_rounding_mode, status); 5710 5711 return v; 5712 } 5713 5714 /*---------------------------------------------------------------------------- 5715 | Returns the result of converting the quadruple-precision floating-point 5716 | value `a' to the 32-bit unsigned integer format. The conversion 5717 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5718 | Arithmetic except that the conversion is always rounded toward zero. 5719 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 5720 | if the conversion overflows, the largest unsigned integer is returned. 5721 | If 'a' is negative, the value is rounded and zero is returned; negative 5722 | values that do not round to zero will raise the inexact exception. 5723 *----------------------------------------------------------------------------*/ 5724 5725 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 5726 { 5727 uint64_t v; 5728 uint32_t res; 5729 int old_exc_flags = get_float_exception_flags(status); 5730 5731 v = float128_to_uint64_round_to_zero(a, status); 5732 if (v > 0xffffffff) { 5733 res = 0xffffffff; 5734 } else { 5735 return v; 5736 } 5737 set_float_exception_flags(old_exc_flags, status); 5738 float_raise(float_flag_invalid, status); 5739 return res; 5740 } 5741 5742 /*---------------------------------------------------------------------------- 5743 | Returns the result of converting the quadruple-precision floating-point 5744 | value `a' to the single-precision floating-point format. The conversion 5745 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5746 | Arithmetic. 5747 *----------------------------------------------------------------------------*/ 5748 5749 float32 float128_to_float32(float128 a, float_status *status) 5750 { 5751 flag aSign; 5752 int32_t aExp; 5753 uint64_t aSig0, aSig1; 5754 uint32_t zSig; 5755 5756 aSig1 = extractFloat128Frac1( a ); 5757 aSig0 = extractFloat128Frac0( a ); 5758 aExp = extractFloat128Exp( a ); 5759 aSign = extractFloat128Sign( a ); 5760 if ( aExp == 0x7FFF ) { 5761 if ( aSig0 | aSig1 ) { 5762 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 5763 } 5764 return packFloat32( aSign, 0xFF, 0 ); 5765 } 5766 aSig0 |= ( aSig1 != 0 ); 5767 shift64RightJamming( aSig0, 18, &aSig0 ); 5768 zSig = aSig0; 5769 if ( aExp || zSig ) { 5770 zSig |= 0x40000000; 5771 aExp -= 0x3F81; 5772 } 5773 return roundAndPackFloat32(aSign, aExp, zSig, status); 5774 5775 } 5776 5777 /*---------------------------------------------------------------------------- 5778 | Returns the result of converting the quadruple-precision floating-point 5779 | value `a' to the double-precision floating-point format. The conversion 5780 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5781 | Arithmetic. 5782 *----------------------------------------------------------------------------*/ 5783 5784 float64 float128_to_float64(float128 a, float_status *status) 5785 { 5786 flag aSign; 5787 int32_t aExp; 5788 uint64_t aSig0, aSig1; 5789 5790 aSig1 = extractFloat128Frac1( a ); 5791 aSig0 = extractFloat128Frac0( a ); 5792 aExp = extractFloat128Exp( a ); 5793 aSign = extractFloat128Sign( a ); 5794 if ( aExp == 0x7FFF ) { 5795 if ( aSig0 | aSig1 ) { 5796 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 5797 } 5798 return packFloat64( aSign, 0x7FF, 0 ); 5799 } 5800 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 5801 aSig0 |= ( aSig1 != 0 ); 5802 if ( aExp || aSig0 ) { 5803 aSig0 |= LIT64( 0x4000000000000000 ); 5804 aExp -= 0x3C01; 5805 } 5806 return roundAndPackFloat64(aSign, aExp, aSig0, status); 5807 5808 } 5809 5810 /*---------------------------------------------------------------------------- 5811 | Returns the result of converting the quadruple-precision floating-point 5812 | value `a' to the extended double-precision floating-point format. The 5813 | conversion is performed according to the IEC/IEEE Standard for Binary 5814 | Floating-Point Arithmetic. 5815 *----------------------------------------------------------------------------*/ 5816 5817 floatx80 float128_to_floatx80(float128 a, float_status *status) 5818 { 5819 flag aSign; 5820 int32_t aExp; 5821 uint64_t aSig0, aSig1; 5822 5823 aSig1 = extractFloat128Frac1( a ); 5824 aSig0 = extractFloat128Frac0( a ); 5825 aExp = extractFloat128Exp( a ); 5826 aSign = extractFloat128Sign( a ); 5827 if ( aExp == 0x7FFF ) { 5828 if ( aSig0 | aSig1 ) { 5829 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 5830 } 5831 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5832 } 5833 if ( aExp == 0 ) { 5834 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 5835 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5836 } 5837 else { 5838 aSig0 |= LIT64( 0x0001000000000000 ); 5839 } 5840 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 5841 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 5842 5843 } 5844 5845 /*---------------------------------------------------------------------------- 5846 | Rounds the quadruple-precision floating-point value `a' to an integer, and 5847 | returns the result as a quadruple-precision floating-point value. The 5848 | operation is performed according to the IEC/IEEE Standard for Binary 5849 | Floating-Point Arithmetic. 5850 *----------------------------------------------------------------------------*/ 5851 5852 float128 float128_round_to_int(float128 a, float_status *status) 5853 { 5854 flag aSign; 5855 int32_t aExp; 5856 uint64_t lastBitMask, roundBitsMask; 5857 float128 z; 5858 5859 aExp = extractFloat128Exp( a ); 5860 if ( 0x402F <= aExp ) { 5861 if ( 0x406F <= aExp ) { 5862 if ( ( aExp == 0x7FFF ) 5863 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 5864 ) { 5865 return propagateFloat128NaN(a, a, status); 5866 } 5867 return a; 5868 } 5869 lastBitMask = 1; 5870 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 5871 roundBitsMask = lastBitMask - 1; 5872 z = a; 5873 switch (status->float_rounding_mode) { 5874 case float_round_nearest_even: 5875 if ( lastBitMask ) { 5876 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 5877 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 5878 } 5879 else { 5880 if ( (int64_t) z.low < 0 ) { 5881 ++z.high; 5882 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 5883 } 5884 } 5885 break; 5886 case float_round_ties_away: 5887 if (lastBitMask) { 5888 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 5889 } else { 5890 if ((int64_t) z.low < 0) { 5891 ++z.high; 5892 } 5893 } 5894 break; 5895 case float_round_to_zero: 5896 break; 5897 case float_round_up: 5898 if (!extractFloat128Sign(z)) { 5899 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 5900 } 5901 break; 5902 case float_round_down: 5903 if (extractFloat128Sign(z)) { 5904 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 5905 } 5906 break; 5907 default: 5908 abort(); 5909 } 5910 z.low &= ~ roundBitsMask; 5911 } 5912 else { 5913 if ( aExp < 0x3FFF ) { 5914 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 5915 status->float_exception_flags |= float_flag_inexact; 5916 aSign = extractFloat128Sign( a ); 5917 switch (status->float_rounding_mode) { 5918 case float_round_nearest_even: 5919 if ( ( aExp == 0x3FFE ) 5920 && ( extractFloat128Frac0( a ) 5921 | extractFloat128Frac1( a ) ) 5922 ) { 5923 return packFloat128( aSign, 0x3FFF, 0, 0 ); 5924 } 5925 break; 5926 case float_round_ties_away: 5927 if (aExp == 0x3FFE) { 5928 return packFloat128(aSign, 0x3FFF, 0, 0); 5929 } 5930 break; 5931 case float_round_down: 5932 return 5933 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 5934 : packFloat128( 0, 0, 0, 0 ); 5935 case float_round_up: 5936 return 5937 aSign ? packFloat128( 1, 0, 0, 0 ) 5938 : packFloat128( 0, 0x3FFF, 0, 0 ); 5939 } 5940 return packFloat128( aSign, 0, 0, 0 ); 5941 } 5942 lastBitMask = 1; 5943 lastBitMask <<= 0x402F - aExp; 5944 roundBitsMask = lastBitMask - 1; 5945 z.low = 0; 5946 z.high = a.high; 5947 switch (status->float_rounding_mode) { 5948 case float_round_nearest_even: 5949 z.high += lastBitMask>>1; 5950 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 5951 z.high &= ~ lastBitMask; 5952 } 5953 break; 5954 case float_round_ties_away: 5955 z.high += lastBitMask>>1; 5956 break; 5957 case float_round_to_zero: 5958 break; 5959 case float_round_up: 5960 if (!extractFloat128Sign(z)) { 5961 z.high |= ( a.low != 0 ); 5962 z.high += roundBitsMask; 5963 } 5964 break; 5965 case float_round_down: 5966 if (extractFloat128Sign(z)) { 5967 z.high |= (a.low != 0); 5968 z.high += roundBitsMask; 5969 } 5970 break; 5971 default: 5972 abort(); 5973 } 5974 z.high &= ~ roundBitsMask; 5975 } 5976 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 5977 status->float_exception_flags |= float_flag_inexact; 5978 } 5979 return z; 5980 5981 } 5982 5983 /*---------------------------------------------------------------------------- 5984 | Returns the result of adding the absolute values of the quadruple-precision 5985 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 5986 | before being returned. `zSign' is ignored if the result is a NaN. 5987 | The addition is performed according to the IEC/IEEE Standard for Binary 5988 | Floating-Point Arithmetic. 5989 *----------------------------------------------------------------------------*/ 5990 5991 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign, 5992 float_status *status) 5993 { 5994 int32_t aExp, bExp, zExp; 5995 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 5996 int32_t expDiff; 5997 5998 aSig1 = extractFloat128Frac1( a ); 5999 aSig0 = extractFloat128Frac0( a ); 6000 aExp = extractFloat128Exp( a ); 6001 bSig1 = extractFloat128Frac1( b ); 6002 bSig0 = extractFloat128Frac0( b ); 6003 bExp = extractFloat128Exp( b ); 6004 expDiff = aExp - bExp; 6005 if ( 0 < expDiff ) { 6006 if ( aExp == 0x7FFF ) { 6007 if (aSig0 | aSig1) { 6008 return propagateFloat128NaN(a, b, status); 6009 } 6010 return a; 6011 } 6012 if ( bExp == 0 ) { 6013 --expDiff; 6014 } 6015 else { 6016 bSig0 |= LIT64( 0x0001000000000000 ); 6017 } 6018 shift128ExtraRightJamming( 6019 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 6020 zExp = aExp; 6021 } 6022 else if ( expDiff < 0 ) { 6023 if ( bExp == 0x7FFF ) { 6024 if (bSig0 | bSig1) { 6025 return propagateFloat128NaN(a, b, status); 6026 } 6027 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6028 } 6029 if ( aExp == 0 ) { 6030 ++expDiff; 6031 } 6032 else { 6033 aSig0 |= LIT64( 0x0001000000000000 ); 6034 } 6035 shift128ExtraRightJamming( 6036 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 6037 zExp = bExp; 6038 } 6039 else { 6040 if ( aExp == 0x7FFF ) { 6041 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6042 return propagateFloat128NaN(a, b, status); 6043 } 6044 return a; 6045 } 6046 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6047 if ( aExp == 0 ) { 6048 if (status->flush_to_zero) { 6049 if (zSig0 | zSig1) { 6050 float_raise(float_flag_output_denormal, status); 6051 } 6052 return packFloat128(zSign, 0, 0, 0); 6053 } 6054 return packFloat128( zSign, 0, zSig0, zSig1 ); 6055 } 6056 zSig2 = 0; 6057 zSig0 |= LIT64( 0x0002000000000000 ); 6058 zExp = aExp; 6059 goto shiftRight1; 6060 } 6061 aSig0 |= LIT64( 0x0001000000000000 ); 6062 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6063 --zExp; 6064 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 6065 ++zExp; 6066 shiftRight1: 6067 shift128ExtraRightJamming( 6068 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6069 roundAndPack: 6070 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6071 6072 } 6073 6074 /*---------------------------------------------------------------------------- 6075 | Returns the result of subtracting the absolute values of the quadruple- 6076 | precision floating-point values `a' and `b'. If `zSign' is 1, the 6077 | difference is negated before being returned. `zSign' is ignored if the 6078 | result is a NaN. The subtraction is performed according to the IEC/IEEE 6079 | Standard for Binary Floating-Point Arithmetic. 6080 *----------------------------------------------------------------------------*/ 6081 6082 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign, 6083 float_status *status) 6084 { 6085 int32_t aExp, bExp, zExp; 6086 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 6087 int32_t expDiff; 6088 6089 aSig1 = extractFloat128Frac1( a ); 6090 aSig0 = extractFloat128Frac0( a ); 6091 aExp = extractFloat128Exp( a ); 6092 bSig1 = extractFloat128Frac1( b ); 6093 bSig0 = extractFloat128Frac0( b ); 6094 bExp = extractFloat128Exp( b ); 6095 expDiff = aExp - bExp; 6096 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6097 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 6098 if ( 0 < expDiff ) goto aExpBigger; 6099 if ( expDiff < 0 ) goto bExpBigger; 6100 if ( aExp == 0x7FFF ) { 6101 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6102 return propagateFloat128NaN(a, b, status); 6103 } 6104 float_raise(float_flag_invalid, status); 6105 return float128_default_nan(status); 6106 } 6107 if ( aExp == 0 ) { 6108 aExp = 1; 6109 bExp = 1; 6110 } 6111 if ( bSig0 < aSig0 ) goto aBigger; 6112 if ( aSig0 < bSig0 ) goto bBigger; 6113 if ( bSig1 < aSig1 ) goto aBigger; 6114 if ( aSig1 < bSig1 ) goto bBigger; 6115 return packFloat128(status->float_rounding_mode == float_round_down, 6116 0, 0, 0); 6117 bExpBigger: 6118 if ( bExp == 0x7FFF ) { 6119 if (bSig0 | bSig1) { 6120 return propagateFloat128NaN(a, b, status); 6121 } 6122 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 6123 } 6124 if ( aExp == 0 ) { 6125 ++expDiff; 6126 } 6127 else { 6128 aSig0 |= LIT64( 0x4000000000000000 ); 6129 } 6130 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6131 bSig0 |= LIT64( 0x4000000000000000 ); 6132 bBigger: 6133 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6134 zExp = bExp; 6135 zSign ^= 1; 6136 goto normalizeRoundAndPack; 6137 aExpBigger: 6138 if ( aExp == 0x7FFF ) { 6139 if (aSig0 | aSig1) { 6140 return propagateFloat128NaN(a, b, status); 6141 } 6142 return a; 6143 } 6144 if ( bExp == 0 ) { 6145 --expDiff; 6146 } 6147 else { 6148 bSig0 |= LIT64( 0x4000000000000000 ); 6149 } 6150 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 6151 aSig0 |= LIT64( 0x4000000000000000 ); 6152 aBigger: 6153 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6154 zExp = aExp; 6155 normalizeRoundAndPack: 6156 --zExp; 6157 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, 6158 status); 6159 6160 } 6161 6162 /*---------------------------------------------------------------------------- 6163 | Returns the result of adding the quadruple-precision floating-point values 6164 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 6165 | for Binary Floating-Point Arithmetic. 6166 *----------------------------------------------------------------------------*/ 6167 6168 float128 float128_add(float128 a, float128 b, float_status *status) 6169 { 6170 flag aSign, bSign; 6171 6172 aSign = extractFloat128Sign( a ); 6173 bSign = extractFloat128Sign( b ); 6174 if ( aSign == bSign ) { 6175 return addFloat128Sigs(a, b, aSign, status); 6176 } 6177 else { 6178 return subFloat128Sigs(a, b, aSign, status); 6179 } 6180 6181 } 6182 6183 /*---------------------------------------------------------------------------- 6184 | Returns the result of subtracting the quadruple-precision floating-point 6185 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6186 | Standard for Binary Floating-Point Arithmetic. 6187 *----------------------------------------------------------------------------*/ 6188 6189 float128 float128_sub(float128 a, float128 b, float_status *status) 6190 { 6191 flag aSign, bSign; 6192 6193 aSign = extractFloat128Sign( a ); 6194 bSign = extractFloat128Sign( b ); 6195 if ( aSign == bSign ) { 6196 return subFloat128Sigs(a, b, aSign, status); 6197 } 6198 else { 6199 return addFloat128Sigs(a, b, aSign, status); 6200 } 6201 6202 } 6203 6204 /*---------------------------------------------------------------------------- 6205 | Returns the result of multiplying the quadruple-precision floating-point 6206 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6207 | Standard for Binary Floating-Point Arithmetic. 6208 *----------------------------------------------------------------------------*/ 6209 6210 float128 float128_mul(float128 a, float128 b, float_status *status) 6211 { 6212 flag aSign, bSign, zSign; 6213 int32_t aExp, bExp, zExp; 6214 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 6215 6216 aSig1 = extractFloat128Frac1( a ); 6217 aSig0 = extractFloat128Frac0( a ); 6218 aExp = extractFloat128Exp( a ); 6219 aSign = extractFloat128Sign( a ); 6220 bSig1 = extractFloat128Frac1( b ); 6221 bSig0 = extractFloat128Frac0( b ); 6222 bExp = extractFloat128Exp( b ); 6223 bSign = extractFloat128Sign( b ); 6224 zSign = aSign ^ bSign; 6225 if ( aExp == 0x7FFF ) { 6226 if ( ( aSig0 | aSig1 ) 6227 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6228 return propagateFloat128NaN(a, b, status); 6229 } 6230 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 6231 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6232 } 6233 if ( bExp == 0x7FFF ) { 6234 if (bSig0 | bSig1) { 6235 return propagateFloat128NaN(a, b, status); 6236 } 6237 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6238 invalid: 6239 float_raise(float_flag_invalid, status); 6240 return float128_default_nan(status); 6241 } 6242 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6243 } 6244 if ( aExp == 0 ) { 6245 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6246 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6247 } 6248 if ( bExp == 0 ) { 6249 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6250 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6251 } 6252 zExp = aExp + bExp - 0x4000; 6253 aSig0 |= LIT64( 0x0001000000000000 ); 6254 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 6255 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 6256 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6257 zSig2 |= ( zSig3 != 0 ); 6258 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 6259 shift128ExtraRightJamming( 6260 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6261 ++zExp; 6262 } 6263 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6264 6265 } 6266 6267 /*---------------------------------------------------------------------------- 6268 | Returns the result of dividing the quadruple-precision floating-point value 6269 | `a' by the corresponding value `b'. The operation is performed according to 6270 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6271 *----------------------------------------------------------------------------*/ 6272 6273 float128 float128_div(float128 a, float128 b, float_status *status) 6274 { 6275 flag aSign, bSign, zSign; 6276 int32_t aExp, bExp, zExp; 6277 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6278 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6279 6280 aSig1 = extractFloat128Frac1( a ); 6281 aSig0 = extractFloat128Frac0( a ); 6282 aExp = extractFloat128Exp( a ); 6283 aSign = extractFloat128Sign( a ); 6284 bSig1 = extractFloat128Frac1( b ); 6285 bSig0 = extractFloat128Frac0( b ); 6286 bExp = extractFloat128Exp( b ); 6287 bSign = extractFloat128Sign( b ); 6288 zSign = aSign ^ bSign; 6289 if ( aExp == 0x7FFF ) { 6290 if (aSig0 | aSig1) { 6291 return propagateFloat128NaN(a, b, status); 6292 } 6293 if ( bExp == 0x7FFF ) { 6294 if (bSig0 | bSig1) { 6295 return propagateFloat128NaN(a, b, status); 6296 } 6297 goto invalid; 6298 } 6299 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6300 } 6301 if ( bExp == 0x7FFF ) { 6302 if (bSig0 | bSig1) { 6303 return propagateFloat128NaN(a, b, status); 6304 } 6305 return packFloat128( zSign, 0, 0, 0 ); 6306 } 6307 if ( bExp == 0 ) { 6308 if ( ( bSig0 | bSig1 ) == 0 ) { 6309 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6310 invalid: 6311 float_raise(float_flag_invalid, status); 6312 return float128_default_nan(status); 6313 } 6314 float_raise(float_flag_divbyzero, status); 6315 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6316 } 6317 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6318 } 6319 if ( aExp == 0 ) { 6320 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6321 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6322 } 6323 zExp = aExp - bExp + 0x3FFD; 6324 shortShift128Left( 6325 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 6326 shortShift128Left( 6327 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6328 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 6329 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 6330 ++zExp; 6331 } 6332 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6333 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 6334 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 6335 while ( (int64_t) rem0 < 0 ) { 6336 --zSig0; 6337 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 6338 } 6339 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 6340 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 6341 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 6342 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 6343 while ( (int64_t) rem1 < 0 ) { 6344 --zSig1; 6345 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 6346 } 6347 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6348 } 6349 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 6350 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6351 6352 } 6353 6354 /*---------------------------------------------------------------------------- 6355 | Returns the remainder of the quadruple-precision floating-point value `a' 6356 | with respect to the corresponding value `b'. The operation is performed 6357 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6358 *----------------------------------------------------------------------------*/ 6359 6360 float128 float128_rem(float128 a, float128 b, float_status *status) 6361 { 6362 flag aSign, zSign; 6363 int32_t aExp, bExp, expDiff; 6364 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 6365 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 6366 int64_t sigMean0; 6367 6368 aSig1 = extractFloat128Frac1( a ); 6369 aSig0 = extractFloat128Frac0( a ); 6370 aExp = extractFloat128Exp( a ); 6371 aSign = extractFloat128Sign( a ); 6372 bSig1 = extractFloat128Frac1( b ); 6373 bSig0 = extractFloat128Frac0( b ); 6374 bExp = extractFloat128Exp( b ); 6375 if ( aExp == 0x7FFF ) { 6376 if ( ( aSig0 | aSig1 ) 6377 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6378 return propagateFloat128NaN(a, b, status); 6379 } 6380 goto invalid; 6381 } 6382 if ( bExp == 0x7FFF ) { 6383 if (bSig0 | bSig1) { 6384 return propagateFloat128NaN(a, b, status); 6385 } 6386 return a; 6387 } 6388 if ( bExp == 0 ) { 6389 if ( ( bSig0 | bSig1 ) == 0 ) { 6390 invalid: 6391 float_raise(float_flag_invalid, status); 6392 return float128_default_nan(status); 6393 } 6394 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6395 } 6396 if ( aExp == 0 ) { 6397 if ( ( aSig0 | aSig1 ) == 0 ) return a; 6398 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6399 } 6400 expDiff = aExp - bExp; 6401 if ( expDiff < -1 ) return a; 6402 shortShift128Left( 6403 aSig0 | LIT64( 0x0001000000000000 ), 6404 aSig1, 6405 15 - ( expDiff < 0 ), 6406 &aSig0, 6407 &aSig1 6408 ); 6409 shortShift128Left( 6410 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6411 q = le128( bSig0, bSig1, aSig0, aSig1 ); 6412 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6413 expDiff -= 64; 6414 while ( 0 < expDiff ) { 6415 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6416 q = ( 4 < q ) ? q - 4 : 0; 6417 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6418 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 6419 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 6420 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 6421 expDiff -= 61; 6422 } 6423 if ( -64 < expDiff ) { 6424 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6425 q = ( 4 < q ) ? q - 4 : 0; 6426 q >>= - expDiff; 6427 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6428 expDiff += 52; 6429 if ( expDiff < 0 ) { 6430 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6431 } 6432 else { 6433 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 6434 } 6435 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6436 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 6437 } 6438 else { 6439 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 6440 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6441 } 6442 do { 6443 alternateASig0 = aSig0; 6444 alternateASig1 = aSig1; 6445 ++q; 6446 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6447 } while ( 0 <= (int64_t) aSig0 ); 6448 add128( 6449 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 6450 if ( ( sigMean0 < 0 ) 6451 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 6452 aSig0 = alternateASig0; 6453 aSig1 = alternateASig1; 6454 } 6455 zSign = ( (int64_t) aSig0 < 0 ); 6456 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 6457 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 6458 status); 6459 } 6460 6461 /*---------------------------------------------------------------------------- 6462 | Returns the square root of the quadruple-precision floating-point value `a'. 6463 | The operation is performed according to the IEC/IEEE Standard for Binary 6464 | Floating-Point Arithmetic. 6465 *----------------------------------------------------------------------------*/ 6466 6467 float128 float128_sqrt(float128 a, float_status *status) 6468 { 6469 flag aSign; 6470 int32_t aExp, zExp; 6471 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 6472 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6473 6474 aSig1 = extractFloat128Frac1( a ); 6475 aSig0 = extractFloat128Frac0( a ); 6476 aExp = extractFloat128Exp( a ); 6477 aSign = extractFloat128Sign( a ); 6478 if ( aExp == 0x7FFF ) { 6479 if (aSig0 | aSig1) { 6480 return propagateFloat128NaN(a, a, status); 6481 } 6482 if ( ! aSign ) return a; 6483 goto invalid; 6484 } 6485 if ( aSign ) { 6486 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 6487 invalid: 6488 float_raise(float_flag_invalid, status); 6489 return float128_default_nan(status); 6490 } 6491 if ( aExp == 0 ) { 6492 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 6493 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6494 } 6495 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 6496 aSig0 |= LIT64( 0x0001000000000000 ); 6497 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 6498 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 6499 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6500 doubleZSig0 = zSig0<<1; 6501 mul64To128( zSig0, zSig0, &term0, &term1 ); 6502 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6503 while ( (int64_t) rem0 < 0 ) { 6504 --zSig0; 6505 doubleZSig0 -= 2; 6506 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6507 } 6508 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6509 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 6510 if ( zSig1 == 0 ) zSig1 = 1; 6511 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6512 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6513 mul64To128( zSig1, zSig1, &term2, &term3 ); 6514 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6515 while ( (int64_t) rem1 < 0 ) { 6516 --zSig1; 6517 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6518 term3 |= 1; 6519 term2 |= doubleZSig0; 6520 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6521 } 6522 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6523 } 6524 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 6525 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 6526 6527 } 6528 6529 /*---------------------------------------------------------------------------- 6530 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6531 | the corresponding value `b', and 0 otherwise. The invalid exception is 6532 | raised if either operand is a NaN. Otherwise, the comparison is performed 6533 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6534 *----------------------------------------------------------------------------*/ 6535 6536 int float128_eq(float128 a, float128 b, float_status *status) 6537 { 6538 6539 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6540 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6541 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6542 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6543 ) { 6544 float_raise(float_flag_invalid, status); 6545 return 0; 6546 } 6547 return 6548 ( a.low == b.low ) 6549 && ( ( a.high == b.high ) 6550 || ( ( a.low == 0 ) 6551 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6552 ); 6553 6554 } 6555 6556 /*---------------------------------------------------------------------------- 6557 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6558 | or equal to the corresponding value `b', and 0 otherwise. The invalid 6559 | exception is raised if either operand is a NaN. The comparison is performed 6560 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6561 *----------------------------------------------------------------------------*/ 6562 6563 int float128_le(float128 a, float128 b, float_status *status) 6564 { 6565 flag aSign, bSign; 6566 6567 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6568 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6569 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6570 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6571 ) { 6572 float_raise(float_flag_invalid, status); 6573 return 0; 6574 } 6575 aSign = extractFloat128Sign( a ); 6576 bSign = extractFloat128Sign( b ); 6577 if ( aSign != bSign ) { 6578 return 6579 aSign 6580 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6581 == 0 ); 6582 } 6583 return 6584 aSign ? le128( b.high, b.low, a.high, a.low ) 6585 : le128( a.high, a.low, b.high, b.low ); 6586 6587 } 6588 6589 /*---------------------------------------------------------------------------- 6590 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6591 | the corresponding value `b', and 0 otherwise. The invalid exception is 6592 | raised if either operand is a NaN. The comparison is performed according 6593 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6594 *----------------------------------------------------------------------------*/ 6595 6596 int float128_lt(float128 a, float128 b, float_status *status) 6597 { 6598 flag aSign, bSign; 6599 6600 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6601 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6602 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6603 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6604 ) { 6605 float_raise(float_flag_invalid, status); 6606 return 0; 6607 } 6608 aSign = extractFloat128Sign( a ); 6609 bSign = extractFloat128Sign( b ); 6610 if ( aSign != bSign ) { 6611 return 6612 aSign 6613 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6614 != 0 ); 6615 } 6616 return 6617 aSign ? lt128( b.high, b.low, a.high, a.low ) 6618 : lt128( a.high, a.low, b.high, b.low ); 6619 6620 } 6621 6622 /*---------------------------------------------------------------------------- 6623 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6624 | be compared, and 0 otherwise. The invalid exception is raised if either 6625 | operand is a NaN. The comparison is performed according to the IEC/IEEE 6626 | Standard for Binary Floating-Point Arithmetic. 6627 *----------------------------------------------------------------------------*/ 6628 6629 int float128_unordered(float128 a, float128 b, float_status *status) 6630 { 6631 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6632 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6633 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6634 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6635 ) { 6636 float_raise(float_flag_invalid, status); 6637 return 1; 6638 } 6639 return 0; 6640 } 6641 6642 /*---------------------------------------------------------------------------- 6643 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6644 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6645 | exception. The comparison is performed according to the IEC/IEEE Standard 6646 | for Binary Floating-Point Arithmetic. 6647 *----------------------------------------------------------------------------*/ 6648 6649 int float128_eq_quiet(float128 a, float128 b, float_status *status) 6650 { 6651 6652 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6653 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6654 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6655 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6656 ) { 6657 if (float128_is_signaling_nan(a, status) 6658 || float128_is_signaling_nan(b, status)) { 6659 float_raise(float_flag_invalid, status); 6660 } 6661 return 0; 6662 } 6663 return 6664 ( a.low == b.low ) 6665 && ( ( a.high == b.high ) 6666 || ( ( a.low == 0 ) 6667 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6668 ); 6669 6670 } 6671 6672 /*---------------------------------------------------------------------------- 6673 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6674 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 6675 | cause an exception. Otherwise, the comparison is performed according to the 6676 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6677 *----------------------------------------------------------------------------*/ 6678 6679 int float128_le_quiet(float128 a, float128 b, float_status *status) 6680 { 6681 flag aSign, bSign; 6682 6683 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6684 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6685 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6686 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6687 ) { 6688 if (float128_is_signaling_nan(a, status) 6689 || float128_is_signaling_nan(b, status)) { 6690 float_raise(float_flag_invalid, status); 6691 } 6692 return 0; 6693 } 6694 aSign = extractFloat128Sign( a ); 6695 bSign = extractFloat128Sign( b ); 6696 if ( aSign != bSign ) { 6697 return 6698 aSign 6699 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6700 == 0 ); 6701 } 6702 return 6703 aSign ? le128( b.high, b.low, a.high, a.low ) 6704 : le128( a.high, a.low, b.high, b.low ); 6705 6706 } 6707 6708 /*---------------------------------------------------------------------------- 6709 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6710 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6711 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 6712 | Standard for Binary Floating-Point Arithmetic. 6713 *----------------------------------------------------------------------------*/ 6714 6715 int float128_lt_quiet(float128 a, float128 b, float_status *status) 6716 { 6717 flag aSign, bSign; 6718 6719 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6720 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6721 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6722 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6723 ) { 6724 if (float128_is_signaling_nan(a, status) 6725 || float128_is_signaling_nan(b, status)) { 6726 float_raise(float_flag_invalid, status); 6727 } 6728 return 0; 6729 } 6730 aSign = extractFloat128Sign( a ); 6731 bSign = extractFloat128Sign( b ); 6732 if ( aSign != bSign ) { 6733 return 6734 aSign 6735 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6736 != 0 ); 6737 } 6738 return 6739 aSign ? lt128( b.high, b.low, a.high, a.low ) 6740 : lt128( a.high, a.low, b.high, b.low ); 6741 6742 } 6743 6744 /*---------------------------------------------------------------------------- 6745 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6746 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 6747 | comparison is performed according to the IEC/IEEE Standard for Binary 6748 | Floating-Point Arithmetic. 6749 *----------------------------------------------------------------------------*/ 6750 6751 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 6752 { 6753 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6754 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6755 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6756 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6757 ) { 6758 if (float128_is_signaling_nan(a, status) 6759 || float128_is_signaling_nan(b, status)) { 6760 float_raise(float_flag_invalid, status); 6761 } 6762 return 1; 6763 } 6764 return 0; 6765 } 6766 6767 #define COMPARE(s, nan_exp) \ 6768 static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\ 6769 int is_quiet, float_status *status) \ 6770 { \ 6771 flag aSign, bSign; \ 6772 uint ## s ## _t av, bv; \ 6773 a = float ## s ## _squash_input_denormal(a, status); \ 6774 b = float ## s ## _squash_input_denormal(b, status); \ 6775 \ 6776 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \ 6777 extractFloat ## s ## Frac( a ) ) || \ 6778 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \ 6779 extractFloat ## s ## Frac( b ) )) { \ 6780 if (!is_quiet || \ 6781 float ## s ## _is_signaling_nan(a, status) || \ 6782 float ## s ## _is_signaling_nan(b, status)) { \ 6783 float_raise(float_flag_invalid, status); \ 6784 } \ 6785 return float_relation_unordered; \ 6786 } \ 6787 aSign = extractFloat ## s ## Sign( a ); \ 6788 bSign = extractFloat ## s ## Sign( b ); \ 6789 av = float ## s ## _val(a); \ 6790 bv = float ## s ## _val(b); \ 6791 if ( aSign != bSign ) { \ 6792 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \ 6793 /* zero case */ \ 6794 return float_relation_equal; \ 6795 } else { \ 6796 return 1 - (2 * aSign); \ 6797 } \ 6798 } else { \ 6799 if (av == bv) { \ 6800 return float_relation_equal; \ 6801 } else { \ 6802 return 1 - 2 * (aSign ^ ( av < bv )); \ 6803 } \ 6804 } \ 6805 } \ 6806 \ 6807 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \ 6808 { \ 6809 return float ## s ## _compare_internal(a, b, 0, status); \ 6810 } \ 6811 \ 6812 int float ## s ## _compare_quiet(float ## s a, float ## s b, \ 6813 float_status *status) \ 6814 { \ 6815 return float ## s ## _compare_internal(a, b, 1, status); \ 6816 } 6817 6818 COMPARE(32, 0xff) 6819 COMPARE(64, 0x7ff) 6820 6821 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 6822 int is_quiet, float_status *status) 6823 { 6824 flag aSign, bSign; 6825 6826 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6827 float_raise(float_flag_invalid, status); 6828 return float_relation_unordered; 6829 } 6830 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 6831 ( extractFloatx80Frac( a )<<1 ) ) || 6832 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 6833 ( extractFloatx80Frac( b )<<1 ) )) { 6834 if (!is_quiet || 6835 floatx80_is_signaling_nan(a, status) || 6836 floatx80_is_signaling_nan(b, status)) { 6837 float_raise(float_flag_invalid, status); 6838 } 6839 return float_relation_unordered; 6840 } 6841 aSign = extractFloatx80Sign( a ); 6842 bSign = extractFloatx80Sign( b ); 6843 if ( aSign != bSign ) { 6844 6845 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 6846 ( ( a.low | b.low ) == 0 ) ) { 6847 /* zero case */ 6848 return float_relation_equal; 6849 } else { 6850 return 1 - (2 * aSign); 6851 } 6852 } else { 6853 if (a.low == b.low && a.high == b.high) { 6854 return float_relation_equal; 6855 } else { 6856 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 6857 } 6858 } 6859 } 6860 6861 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 6862 { 6863 return floatx80_compare_internal(a, b, 0, status); 6864 } 6865 6866 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 6867 { 6868 return floatx80_compare_internal(a, b, 1, status); 6869 } 6870 6871 static inline int float128_compare_internal(float128 a, float128 b, 6872 int is_quiet, float_status *status) 6873 { 6874 flag aSign, bSign; 6875 6876 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 6877 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 6878 ( ( extractFloat128Exp( b ) == 0x7fff ) && 6879 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 6880 if (!is_quiet || 6881 float128_is_signaling_nan(a, status) || 6882 float128_is_signaling_nan(b, status)) { 6883 float_raise(float_flag_invalid, status); 6884 } 6885 return float_relation_unordered; 6886 } 6887 aSign = extractFloat128Sign( a ); 6888 bSign = extractFloat128Sign( b ); 6889 if ( aSign != bSign ) { 6890 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 6891 /* zero case */ 6892 return float_relation_equal; 6893 } else { 6894 return 1 - (2 * aSign); 6895 } 6896 } else { 6897 if (a.low == b.low && a.high == b.high) { 6898 return float_relation_equal; 6899 } else { 6900 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 6901 } 6902 } 6903 } 6904 6905 int float128_compare(float128 a, float128 b, float_status *status) 6906 { 6907 return float128_compare_internal(a, b, 0, status); 6908 } 6909 6910 int float128_compare_quiet(float128 a, float128 b, float_status *status) 6911 { 6912 return float128_compare_internal(a, b, 1, status); 6913 } 6914 6915 /* min() and max() functions. These can't be implemented as 6916 * 'compare and pick one input' because that would mishandle 6917 * NaNs and +0 vs -0. 6918 * 6919 * minnum() and maxnum() functions. These are similar to the min() 6920 * and max() functions but if one of the arguments is a QNaN and 6921 * the other is numerical then the numerical argument is returned. 6922 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 6923 * and maxNum() operations. min() and max() are the typical min/max 6924 * semantics provided by many CPUs which predate that specification. 6925 * 6926 * minnummag() and maxnummag() functions correspond to minNumMag() 6927 * and minNumMag() from the IEEE-754 2008. 6928 */ 6929 #define MINMAX(s) \ 6930 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \ 6931 int ismin, int isieee, \ 6932 int ismag, \ 6933 float_status *status) \ 6934 { \ 6935 flag aSign, bSign; \ 6936 uint ## s ## _t av, bv, aav, abv; \ 6937 a = float ## s ## _squash_input_denormal(a, status); \ 6938 b = float ## s ## _squash_input_denormal(b, status); \ 6939 if (float ## s ## _is_any_nan(a) || \ 6940 float ## s ## _is_any_nan(b)) { \ 6941 if (isieee) { \ 6942 if (float ## s ## _is_quiet_nan(a, status) && \ 6943 !float ## s ##_is_any_nan(b)) { \ 6944 return b; \ 6945 } else if (float ## s ## _is_quiet_nan(b, status) && \ 6946 !float ## s ## _is_any_nan(a)) { \ 6947 return a; \ 6948 } \ 6949 } \ 6950 return propagateFloat ## s ## NaN(a, b, status); \ 6951 } \ 6952 aSign = extractFloat ## s ## Sign(a); \ 6953 bSign = extractFloat ## s ## Sign(b); \ 6954 av = float ## s ## _val(a); \ 6955 bv = float ## s ## _val(b); \ 6956 if (ismag) { \ 6957 aav = float ## s ## _abs(av); \ 6958 abv = float ## s ## _abs(bv); \ 6959 if (aav != abv) { \ 6960 if (ismin) { \ 6961 return (aav < abv) ? a : b; \ 6962 } else { \ 6963 return (aav < abv) ? b : a; \ 6964 } \ 6965 } \ 6966 } \ 6967 if (aSign != bSign) { \ 6968 if (ismin) { \ 6969 return aSign ? a : b; \ 6970 } else { \ 6971 return aSign ? b : a; \ 6972 } \ 6973 } else { \ 6974 if (ismin) { \ 6975 return (aSign ^ (av < bv)) ? a : b; \ 6976 } else { \ 6977 return (aSign ^ (av < bv)) ? b : a; \ 6978 } \ 6979 } \ 6980 } \ 6981 \ 6982 float ## s float ## s ## _min(float ## s a, float ## s b, \ 6983 float_status *status) \ 6984 { \ 6985 return float ## s ## _minmax(a, b, 1, 0, 0, status); \ 6986 } \ 6987 \ 6988 float ## s float ## s ## _max(float ## s a, float ## s b, \ 6989 float_status *status) \ 6990 { \ 6991 return float ## s ## _minmax(a, b, 0, 0, 0, status); \ 6992 } \ 6993 \ 6994 float ## s float ## s ## _minnum(float ## s a, float ## s b, \ 6995 float_status *status) \ 6996 { \ 6997 return float ## s ## _minmax(a, b, 1, 1, 0, status); \ 6998 } \ 6999 \ 7000 float ## s float ## s ## _maxnum(float ## s a, float ## s b, \ 7001 float_status *status) \ 7002 { \ 7003 return float ## s ## _minmax(a, b, 0, 1, 0, status); \ 7004 } \ 7005 \ 7006 float ## s float ## s ## _minnummag(float ## s a, float ## s b, \ 7007 float_status *status) \ 7008 { \ 7009 return float ## s ## _minmax(a, b, 1, 1, 1, status); \ 7010 } \ 7011 \ 7012 float ## s float ## s ## _maxnummag(float ## s a, float ## s b, \ 7013 float_status *status) \ 7014 { \ 7015 return float ## s ## _minmax(a, b, 0, 1, 1, status); \ 7016 } 7017 7018 MINMAX(32) 7019 MINMAX(64) 7020 7021 7022 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7023 { 7024 flag aSign; 7025 int32_t aExp; 7026 uint64_t aSig; 7027 7028 if (floatx80_invalid_encoding(a)) { 7029 float_raise(float_flag_invalid, status); 7030 return floatx80_default_nan(status); 7031 } 7032 aSig = extractFloatx80Frac( a ); 7033 aExp = extractFloatx80Exp( a ); 7034 aSign = extractFloatx80Sign( a ); 7035 7036 if ( aExp == 0x7FFF ) { 7037 if ( aSig<<1 ) { 7038 return propagateFloatx80NaN(a, a, status); 7039 } 7040 return a; 7041 } 7042 7043 if (aExp == 0) { 7044 if (aSig == 0) { 7045 return a; 7046 } 7047 aExp++; 7048 } 7049 7050 if (n > 0x10000) { 7051 n = 0x10000; 7052 } else if (n < -0x10000) { 7053 n = -0x10000; 7054 } 7055 7056 aExp += n; 7057 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7058 aSign, aExp, aSig, 0, status); 7059 } 7060 7061 float128 float128_scalbn(float128 a, int n, float_status *status) 7062 { 7063 flag aSign; 7064 int32_t aExp; 7065 uint64_t aSig0, aSig1; 7066 7067 aSig1 = extractFloat128Frac1( a ); 7068 aSig0 = extractFloat128Frac0( a ); 7069 aExp = extractFloat128Exp( a ); 7070 aSign = extractFloat128Sign( a ); 7071 if ( aExp == 0x7FFF ) { 7072 if ( aSig0 | aSig1 ) { 7073 return propagateFloat128NaN(a, a, status); 7074 } 7075 return a; 7076 } 7077 if (aExp != 0) { 7078 aSig0 |= LIT64( 0x0001000000000000 ); 7079 } else if (aSig0 == 0 && aSig1 == 0) { 7080 return a; 7081 } else { 7082 aExp++; 7083 } 7084 7085 if (n > 0x10000) { 7086 n = 0x10000; 7087 } else if (n < -0x10000) { 7088 n = -0x10000; 7089 } 7090 7091 aExp += n - 1; 7092 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7093 , status); 7094 7095 } 7096