1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include "qemu/bitops.h" 87 #include "fpu/softfloat.h" 88 89 /* We only need stdlib for abort() */ 90 91 /*---------------------------------------------------------------------------- 92 | Primitive arithmetic functions, including multi-word arithmetic, and 93 | division and square root approximations. (Can be specialized to target if 94 | desired.) 95 *----------------------------------------------------------------------------*/ 96 #include "softfloat-macros.h" 97 98 /*---------------------------------------------------------------------------- 99 | Functions and definitions to determine: (1) whether tininess for underflow 100 | is detected before or after rounding by default, (2) what (if anything) 101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 103 | are propagated from function inputs to output. These details are target- 104 | specific. 
105 *----------------------------------------------------------------------------*/ 106 #include "softfloat-specialize.h" 107 108 /*---------------------------------------------------------------------------- 109 | Returns the fraction bits of the half-precision floating-point value `a'. 110 *----------------------------------------------------------------------------*/ 111 112 static inline uint32_t extractFloat16Frac(float16 a) 113 { 114 return float16_val(a) & 0x3ff; 115 } 116 117 /*---------------------------------------------------------------------------- 118 | Returns the exponent bits of the half-precision floating-point value `a'. 119 *----------------------------------------------------------------------------*/ 120 121 static inline int extractFloat16Exp(float16 a) 122 { 123 return (float16_val(a) >> 10) & 0x1f; 124 } 125 126 /*---------------------------------------------------------------------------- 127 | Returns the sign bit of the single-precision floating-point value `a'. 128 *----------------------------------------------------------------------------*/ 129 130 static inline flag extractFloat16Sign(float16 a) 131 { 132 return float16_val(a)>>15; 133 } 134 135 /*---------------------------------------------------------------------------- 136 | Returns the fraction bits of the single-precision floating-point value `a'. 137 *----------------------------------------------------------------------------*/ 138 139 static inline uint32_t extractFloat32Frac(float32 a) 140 { 141 return float32_val(a) & 0x007FFFFF; 142 } 143 144 /*---------------------------------------------------------------------------- 145 | Returns the exponent bits of the single-precision floating-point value `a'. 
146 *----------------------------------------------------------------------------*/ 147 148 static inline int extractFloat32Exp(float32 a) 149 { 150 return (float32_val(a) >> 23) & 0xFF; 151 } 152 153 /*---------------------------------------------------------------------------- 154 | Returns the sign bit of the single-precision floating-point value `a'. 155 *----------------------------------------------------------------------------*/ 156 157 static inline flag extractFloat32Sign(float32 a) 158 { 159 return float32_val(a) >> 31; 160 } 161 162 /*---------------------------------------------------------------------------- 163 | Returns the fraction bits of the double-precision floating-point value `a'. 164 *----------------------------------------------------------------------------*/ 165 166 static inline uint64_t extractFloat64Frac(float64 a) 167 { 168 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF); 169 } 170 171 /*---------------------------------------------------------------------------- 172 | Returns the exponent bits of the double-precision floating-point value `a'. 173 *----------------------------------------------------------------------------*/ 174 175 static inline int extractFloat64Exp(float64 a) 176 { 177 return (float64_val(a) >> 52) & 0x7FF; 178 } 179 180 /*---------------------------------------------------------------------------- 181 | Returns the sign bit of the double-precision floating-point value `a'. 182 *----------------------------------------------------------------------------*/ 183 184 static inline flag extractFloat64Sign(float64 a) 185 { 186 return float64_val(a) >> 63; 187 } 188 189 /* 190 * Classify a floating point number. Everything above float_class_qnan 191 * is a NaN so cls >= float_class_qnan is any NaN. 
 */

typedef enum __attribute__ ((__packed__)) {
    /* set by unpack_raw(); replaced once the parts are canonicalized */
    float_class_unclassified,
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan, /* all NaNs from here */
    float_class_snan,
    /* packed as the format's default NaN by *_round_pack_canonical() */
    float_class_dnan,
    /* packed via *_maybe_silence_nan() by *_round_pack_canonical() */
    float_class_msnan, /* maybe silenced */
} FloatClass;

/*
 * Structure holding all of the decomposed parts of a float. The
 * exponent is unbiased and the fraction is normalized. All
 * calculations are done with a 64 bit fraction and then rounded as
 * appropriate for the final format.
 *
 * Thanks to the packed FloatClass a decent compiler should be able to
 * fit the whole structure into registers and avoid using the stack
 * for parameter passing.
 */

typedef struct {
    uint64_t frac;      /* fraction bits */
    int32_t exp;        /* exponent */
    FloatClass cls;     /* classification, see FloatClass */
    bool sign;          /* sign bit */
} FloatParts;

/*
 * Layout of a canonical 64-bit fraction: the implicit leading one of a
 * normal number sits at bit 62 and bit 63 is left free so that a carry
 * out of an addition or rounding increment can be detected.
 */
#define DECOMPOSED_BINARY_POINT (64 - 2)
#define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
#define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)

/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the largest (all-ones) value of the exponent field
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
} FloatFmt;

/* Expand fields based on the size of exponent and fraction.
 * E is the width of the exponent field and F the width of the fraction
 * field, both in bits.  round_mask covers every bit below frac_lsb;
 * roundeven_mask covers those bits plus frac_lsb itself.
 */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size       = E,                                             \
    .exp_bias       = ((1 << E) - 1) >> 1,                           \
    .exp_max        = (1 << E) - 1,                                  \
    .frac_size      = F,                                             \
    .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1

/* Half precision: 5 exponent bits, 10 fraction bits. */
static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

/* Single precision: 8 exponent bits, 23 fraction bits. */
static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

/* Double precision: 11 exponent bits, 52 fraction bits. */
static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};

/* Unpack a float to parts, but do not canonicalize.
*/ 275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) 276 { 277 const int sign_pos = fmt.frac_size + fmt.exp_size; 278 279 return (FloatParts) { 280 .cls = float_class_unclassified, 281 .sign = extract64(raw, sign_pos, 1), 282 .exp = extract64(raw, fmt.frac_size, fmt.exp_size), 283 .frac = extract64(raw, 0, fmt.frac_size), 284 }; 285 } 286 287 static inline FloatParts float16_unpack_raw(float16 f) 288 { 289 return unpack_raw(float16_params, f); 290 } 291 292 static inline FloatParts float32_unpack_raw(float32 f) 293 { 294 return unpack_raw(float32_params, f); 295 } 296 297 static inline FloatParts float64_unpack_raw(float64 f) 298 { 299 return unpack_raw(float64_params, f); 300 } 301 302 /* Pack a float from parts, but do not canonicalize. */ 303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p) 304 { 305 const int sign_pos = fmt.frac_size + fmt.exp_size; 306 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp); 307 return deposit64(ret, sign_pos, 1, p.sign); 308 } 309 310 static inline float16 float16_pack_raw(FloatParts p) 311 { 312 return make_float16(pack_raw(float16_params, p)); 313 } 314 315 static inline float32 float32_pack_raw(FloatParts p) 316 { 317 return make_float32(pack_raw(float32_params, p)); 318 } 319 320 static inline float64 float64_pack_raw(FloatParts p) 321 { 322 return make_float64(pack_raw(float64_params, p)); 323 } 324 325 /* Canonicalize EXP and FRAC, setting CLS. 
*/ 326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm, 327 float_status *status) 328 { 329 if (part.exp == parm->exp_max) { 330 if (part.frac == 0) { 331 part.cls = float_class_inf; 332 } else { 333 #ifdef NO_SIGNALING_NANS 334 part.cls = float_class_qnan; 335 #else 336 int64_t msb = part.frac << (parm->frac_shift + 2); 337 if ((msb < 0) == status->snan_bit_is_one) { 338 part.cls = float_class_snan; 339 } else { 340 part.cls = float_class_qnan; 341 } 342 #endif 343 } 344 } else if (part.exp == 0) { 345 if (likely(part.frac == 0)) { 346 part.cls = float_class_zero; 347 } else if (status->flush_inputs_to_zero) { 348 float_raise(float_flag_input_denormal, status); 349 part.cls = float_class_zero; 350 part.frac = 0; 351 } else { 352 int shift = clz64(part.frac) - 1; 353 part.cls = float_class_normal; 354 part.exp = parm->frac_shift - parm->exp_bias - shift + 1; 355 part.frac <<= shift; 356 } 357 } else { 358 part.cls = float_class_normal; 359 part.exp -= parm->exp_bias; 360 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift); 361 } 362 return part; 363 } 364 365 /* Round and uncanonicalize a floating-point number by parts. There 366 * are FRAC_SHIFT bits that may require rounding at the bottom of the 367 * fraction; these bits will be removed. The exponent will be biased 368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0]. 
369 */ 370 371 static FloatParts round_canonical(FloatParts p, float_status *s, 372 const FloatFmt *parm) 373 { 374 const uint64_t frac_lsbm1 = parm->frac_lsbm1; 375 const uint64_t round_mask = parm->round_mask; 376 const uint64_t roundeven_mask = parm->roundeven_mask; 377 const int exp_max = parm->exp_max; 378 const int frac_shift = parm->frac_shift; 379 uint64_t frac, inc; 380 int exp, flags = 0; 381 bool overflow_norm; 382 383 frac = p.frac; 384 exp = p.exp; 385 386 switch (p.cls) { 387 case float_class_normal: 388 switch (s->float_rounding_mode) { 389 case float_round_nearest_even: 390 overflow_norm = false; 391 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 392 break; 393 case float_round_ties_away: 394 overflow_norm = false; 395 inc = frac_lsbm1; 396 break; 397 case float_round_to_zero: 398 overflow_norm = true; 399 inc = 0; 400 break; 401 case float_round_up: 402 inc = p.sign ? 0 : round_mask; 403 overflow_norm = p.sign; 404 break; 405 case float_round_down: 406 inc = p.sign ? round_mask : 0; 407 overflow_norm = !p.sign; 408 break; 409 default: 410 g_assert_not_reached(); 411 } 412 413 exp += parm->exp_bias; 414 if (likely(exp > 0)) { 415 if (frac & round_mask) { 416 flags |= float_flag_inexact; 417 frac += inc; 418 if (frac & DECOMPOSED_OVERFLOW_BIT) { 419 frac >>= 1; 420 exp++; 421 } 422 } 423 frac >>= frac_shift; 424 425 if (unlikely(exp >= exp_max)) { 426 flags |= float_flag_overflow | float_flag_inexact; 427 if (overflow_norm) { 428 exp = exp_max - 1; 429 frac = -1; 430 } else { 431 p.cls = float_class_inf; 432 goto do_inf; 433 } 434 } 435 } else if (s->flush_to_zero) { 436 flags |= float_flag_output_denormal; 437 p.cls = float_class_zero; 438 goto do_zero; 439 } else { 440 bool is_tiny = (s->float_detect_tininess 441 == float_tininess_before_rounding) 442 || (exp < 0) 443 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT); 444 445 shift64RightJamming(frac, 1 - exp, &frac); 446 if (frac & round_mask) { 447 /* Need to recompute round-to-even. 
*/ 448 if (s->float_rounding_mode == float_round_nearest_even) { 449 inc = ((frac & roundeven_mask) != frac_lsbm1 450 ? frac_lsbm1 : 0); 451 } 452 flags |= float_flag_inexact; 453 frac += inc; 454 } 455 456 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0); 457 frac >>= frac_shift; 458 459 if (is_tiny && (flags & float_flag_inexact)) { 460 flags |= float_flag_underflow; 461 } 462 if (exp == 0 && frac == 0) { 463 p.cls = float_class_zero; 464 } 465 } 466 break; 467 468 case float_class_zero: 469 do_zero: 470 exp = 0; 471 frac = 0; 472 break; 473 474 case float_class_inf: 475 do_inf: 476 exp = exp_max; 477 frac = 0; 478 break; 479 480 case float_class_qnan: 481 case float_class_snan: 482 exp = exp_max; 483 break; 484 485 default: 486 g_assert_not_reached(); 487 } 488 489 float_raise(flags, s); 490 p.exp = exp; 491 p.frac = frac; 492 return p; 493 } 494 495 static FloatParts float16_unpack_canonical(float16 f, float_status *s) 496 { 497 return canonicalize(float16_unpack_raw(f), &float16_params, s); 498 } 499 500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s) 501 { 502 switch (p.cls) { 503 case float_class_dnan: 504 return float16_default_nan(s); 505 case float_class_msnan: 506 return float16_maybe_silence_nan(float16_pack_raw(p), s); 507 default: 508 p = round_canonical(p, s, &float16_params); 509 return float16_pack_raw(p); 510 } 511 } 512 513 static FloatParts float32_unpack_canonical(float32 f, float_status *s) 514 { 515 return canonicalize(float32_unpack_raw(f), &float32_params, s); 516 } 517 518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s) 519 { 520 switch (p.cls) { 521 case float_class_dnan: 522 return float32_default_nan(s); 523 case float_class_msnan: 524 return float32_maybe_silence_nan(float32_pack_raw(p), s); 525 default: 526 p = round_canonical(p, s, &float32_params); 527 return float32_pack_raw(p); 528 } 529 } 530 531 static FloatParts float64_unpack_canonical(float64 f, float_status *s) 532 { 533 
return canonicalize(float64_unpack_raw(f), &float64_params, s); 534 } 535 536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s) 537 { 538 switch (p.cls) { 539 case float_class_dnan: 540 return float64_default_nan(s); 541 case float_class_msnan: 542 return float64_maybe_silence_nan(float64_pack_raw(p), s); 543 default: 544 p = round_canonical(p, s, &float64_params); 545 return float64_pack_raw(p); 546 } 547 } 548 549 /* Simple helpers for checking if what NaN we have */ 550 static bool is_nan(FloatClass c) 551 { 552 return unlikely(c >= float_class_qnan); 553 } 554 static bool is_snan(FloatClass c) 555 { 556 return c == float_class_snan; 557 } 558 static bool is_qnan(FloatClass c) 559 { 560 return c == float_class_qnan; 561 } 562 563 static FloatParts return_nan(FloatParts a, float_status *s) 564 { 565 switch (a.cls) { 566 case float_class_snan: 567 s->float_exception_flags |= float_flag_invalid; 568 a.cls = float_class_msnan; 569 /* fall through */ 570 case float_class_qnan: 571 if (s->default_nan_mode) { 572 a.cls = float_class_dnan; 573 } 574 break; 575 576 default: 577 g_assert_not_reached(); 578 } 579 return a; 580 } 581 582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s) 583 { 584 if (is_snan(a.cls) || is_snan(b.cls)) { 585 s->float_exception_flags |= float_flag_invalid; 586 } 587 588 if (s->default_nan_mode) { 589 a.cls = float_class_dnan; 590 } else { 591 if (pickNaN(is_qnan(a.cls), is_snan(a.cls), 592 is_qnan(b.cls), is_snan(b.cls), 593 a.frac > b.frac || 594 (a.frac == b.frac && a.sign < b.sign))) { 595 a = b; 596 } 597 a.cls = float_class_msnan; 598 } 599 return a; 600 } 601 602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c, 603 bool inf_zero, float_status *s) 604 { 605 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) { 606 s->float_exception_flags |= float_flag_invalid; 607 } 608 609 if (s->default_nan_mode) { 610 a.cls = float_class_dnan; 611 } else { 612 switch 
(pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls), 613 is_qnan(b.cls), is_snan(b.cls), 614 is_qnan(c.cls), is_snan(c.cls), 615 inf_zero, s)) { 616 case 0: 617 break; 618 case 1: 619 a = b; 620 break; 621 case 2: 622 a = c; 623 break; 624 case 3: 625 a.cls = float_class_dnan; 626 return a; 627 default: 628 g_assert_not_reached(); 629 } 630 631 a.cls = float_class_msnan; 632 } 633 return a; 634 } 635 636 /* 637 * Returns the result of adding or subtracting the values of the 638 * floating-point values `a' and `b'. The operation is performed 639 * according to the IEC/IEEE Standard for Binary Floating-Point 640 * Arithmetic. 641 */ 642 643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract, 644 float_status *s) 645 { 646 bool a_sign = a.sign; 647 bool b_sign = b.sign ^ subtract; 648 649 if (a_sign != b_sign) { 650 /* Subtraction */ 651 652 if (a.cls == float_class_normal && b.cls == float_class_normal) { 653 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) { 654 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 655 a.frac = a.frac - b.frac; 656 } else { 657 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 658 a.frac = b.frac - a.frac; 659 a.exp = b.exp; 660 a_sign ^= 1; 661 } 662 663 if (a.frac == 0) { 664 a.cls = float_class_zero; 665 a.sign = s->float_rounding_mode == float_round_down; 666 } else { 667 int shift = clz64(a.frac) - 1; 668 a.frac = a.frac << shift; 669 a.exp = a.exp - shift; 670 a.sign = a_sign; 671 } 672 return a; 673 } 674 if (is_nan(a.cls) || is_nan(b.cls)) { 675 return pick_nan(a, b, s); 676 } 677 if (a.cls == float_class_inf) { 678 if (b.cls == float_class_inf) { 679 float_raise(float_flag_invalid, s); 680 a.cls = float_class_dnan; 681 } 682 return a; 683 } 684 if (a.cls == float_class_zero && b.cls == float_class_zero) { 685 a.sign = s->float_rounding_mode == float_round_down; 686 return a; 687 } 688 if (a.cls == float_class_zero || b.cls == float_class_inf) { 689 b.sign = a_sign ^ 1; 690 return b; 691 } 692 if 
(b.cls == float_class_zero) { 693 return a; 694 } 695 } else { 696 /* Addition */ 697 if (a.cls == float_class_normal && b.cls == float_class_normal) { 698 if (a.exp > b.exp) { 699 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 700 } else if (a.exp < b.exp) { 701 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 702 a.exp = b.exp; 703 } 704 a.frac += b.frac; 705 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 706 a.frac >>= 1; 707 a.exp += 1; 708 } 709 return a; 710 } 711 if (is_nan(a.cls) || is_nan(b.cls)) { 712 return pick_nan(a, b, s); 713 } 714 if (a.cls == float_class_inf || b.cls == float_class_zero) { 715 return a; 716 } 717 if (b.cls == float_class_inf || a.cls == float_class_zero) { 718 b.sign = b_sign; 719 return b; 720 } 721 } 722 g_assert_not_reached(); 723 } 724 725 /* 726 * Returns the result of adding or subtracting the floating-point 727 * values `a' and `b'. The operation is performed according to the 728 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 729 */ 730 731 float16 __attribute__((flatten)) float16_add(float16 a, float16 b, 732 float_status *status) 733 { 734 FloatParts pa = float16_unpack_canonical(a, status); 735 FloatParts pb = float16_unpack_canonical(b, status); 736 FloatParts pr = addsub_floats(pa, pb, false, status); 737 738 return float16_round_pack_canonical(pr, status); 739 } 740 741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b, 742 float_status *status) 743 { 744 FloatParts pa = float32_unpack_canonical(a, status); 745 FloatParts pb = float32_unpack_canonical(b, status); 746 FloatParts pr = addsub_floats(pa, pb, false, status); 747 748 return float32_round_pack_canonical(pr, status); 749 } 750 751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b, 752 float_status *status) 753 { 754 FloatParts pa = float64_unpack_canonical(a, status); 755 FloatParts pb = float64_unpack_canonical(b, status); 756 FloatParts pr = addsub_floats(pa, pb, false, status); 757 758 return 
float64_round_pack_canonical(pr, status); 759 } 760 761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b, 762 float_status *status) 763 { 764 FloatParts pa = float16_unpack_canonical(a, status); 765 FloatParts pb = float16_unpack_canonical(b, status); 766 FloatParts pr = addsub_floats(pa, pb, true, status); 767 768 return float16_round_pack_canonical(pr, status); 769 } 770 771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b, 772 float_status *status) 773 { 774 FloatParts pa = float32_unpack_canonical(a, status); 775 FloatParts pb = float32_unpack_canonical(b, status); 776 FloatParts pr = addsub_floats(pa, pb, true, status); 777 778 return float32_round_pack_canonical(pr, status); 779 } 780 781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b, 782 float_status *status) 783 { 784 FloatParts pa = float64_unpack_canonical(a, status); 785 FloatParts pb = float64_unpack_canonical(b, status); 786 FloatParts pr = addsub_floats(pa, pb, true, status); 787 788 return float64_round_pack_canonical(pr, status); 789 } 790 791 /* 792 * Returns the result of multiplying the floating-point values `a' and 793 * `b'. The operation is performed according to the IEC/IEEE Standard 794 * for Binary Floating-Point Arithmetic. 
 */

/* Operates on canonical parts; the caller rounds and packs the result. */
static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t hi, lo;
        int exp = a.exp + b.exp;

        mul64To128(a.frac, b.frac, &hi, &lo);
        /* Bring the 128-bit product back to a 64-bit fraction in lo */
        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
        if (lo & DECOMPOSED_OVERFLOW_BIT) {
            shift64RightJamming(lo, 1, &lo);
            exp += 1;
        }

        /* Re-use a */
        a.exp = exp;
        a.sign = sign;
        a.frac = lo;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return pick_nan(a, b, s);
    }
    /* Inf * Zero == NaN */
    if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
        (a.cls == float_class_zero && b.cls == float_class_inf)) {
        s->float_exception_flags |= float_flag_invalid;
        a.cls = float_class_dnan;
        a.sign = sign;
        return a;
    }
    /* Multiply by 0 or Inf */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    if (b.cls == float_class_inf || b.cls == float_class_zero) {
        b.sign = sign;
        return b;
    }
    g_assert_not_reached();
}

/* Half-precision multiply: unpack, mul_floats(), round and pack. */
float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
                                             float_status *status)
{
    FloatParts pa = float16_unpack_canonical(a, status);
    FloatParts pb = float16_unpack_canonical(b, status);
    FloatParts pr = mul_floats(pa, pb, status);

    return float16_round_pack_canonical(pr, status);
}

/* Single-precision multiply: unpack, mul_floats(), round and pack. */
float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
                                             float_status *status)
{
    FloatParts pa = float32_unpack_canonical(a, status);
    FloatParts pb = float32_unpack_canonical(b, status);
    FloatParts pr = mul_floats(pa, pb, status);

    return float32_round_pack_canonical(pr, status);
}

/* Double-precision multiply: unpack, mul_floats(), round and pack. */
float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
                                             float_status *status)
{
    FloatParts pa = float64_unpack_canonical(a, status);
    FloatParts pb = float64_unpack_canonical(b, status);
    FloatParts pr = mul_floats(pa, pb, status);

    return float64_round_pack_canonical(pr, status);
}

/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b' then adding 'c', with no intermediate rounding step after the
 * multiplication. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 * The flags argument allows the caller to select negation of the
 * addend, the intermediate product, or the final result. (The
 * difference between this and having the caller do a separate
 * negation is that negating externally will flip the sign bit on
 * NaNs.)
 *
 * The flags tested here are float_muladd_negate_c,
 * float_muladd_negate_product, float_muladd_negate_result and
 * float_muladd_halve_result.  Operands and result are canonical parts.
 */

static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
                                int flags, float_status *s)
{
    /* True exactly when one multiplicand is Inf and the other is zero */
    bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
                    ((1 << float_class_inf) | (1 << float_class_zero));
    bool p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
        return pick_nan_muladd(a, b, c, inf_zero, s);
    }

    if (inf_zero) {
        s->float_exception_flags |= float_flag_invalid;
        a.cls = float_class_dnan;
        return a;
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Classify the product a * b before looking at the addend */
    if (a.cls == float_class_inf || b.cls == float_class_inf) {
        p_class = float_class_inf;
    } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    if (c.cls == float_class_inf) {
        if (p_class == float_class_inf && p_sign != c.sign) {
            /* Inf + -Inf */
            s->float_exception_flags |= float_flag_invalid;
            a.cls = float_class_dnan;
        } else {
            a.cls = float_class_inf;
            a.sign = c.sign ^ sign_flip;
        }
        return a;
    }

    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    if (p_class == float_class_zero) {
        /* The product is zero, so the result is the addend */
        if (c.cls == float_class_zero) {
            if (p_sign != c.sign) {
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
     * result.
     */
    mul64To128(a.frac, b.frac, &hi, &lo);
    /* binary point now at bit 124 */

    /* check for overflow */
    if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
        shift128RightJamming(hi, lo, 1, &hi, &lo);
        p_exp += 1;
    }

    /* + add/sub */
    if (c.cls == float_class_zero) {
        /* move binary point back to 62 */
        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
    } else {
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                /* Align the product to the addend and narrow it to
                 * 64 bits in a single jamming shift.
                 */
                shift128RightJamming(hi, lo,
                                     DECOMPOSED_BINARY_POINT - exp_diff,
                                     &hi, &lo);
                lo += c.frac;
                p_exp = c.exp;
            } else {
                uint64_t c_hi, c_lo;
                /* shift c to the same binary point as the product (124) */
                c_hi = c.frac >> 2;
                c_lo = 0;
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                add128(hi, lo, c_hi, c_lo, &hi, &lo);
                /* move binary point back to 62 */
                shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
            }

            if (lo & DECOMPOSED_OVERFLOW_BIT) {
                shift64RightJamming(lo, 1, &lo);
                p_exp += 1;
            }

        } else {
            /* Subtraction */
            uint64_t c_hi, c_lo;
            /* make C binary point match product at bit 124 */
            c_hi = c.frac >> 2;
            c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    /* The addend is the minuend, so the result takes
                     * its sign and exponent.
                     */
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            if (hi == 0 && lo == 0) {
                /* Exact cancellation */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                int shift;
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Normalizing to a binary point of 124 is the
                   correct adjust for the exponent.  However since we're
                   shifting, we might as well put the binary point back
                   at 62 where we really want it.  Therefore shift as
                   if we're leaving 1 bit at the top of the word, but
                   adjust the exponent as if we're leaving 3 bits.  */
                shift -= 1;
                if (shift >= 64) {
                    lo = lo << (shift - 64);
                } else {
                    hi = (hi << shift) | (lo >> (64 - shift));
                    /* Fold any bits left below into a sticky LSB */
                    lo = hi | ((lo << shift) != 0);
                }
                p_exp -= shift - 2;
            }
        }
    }

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = lo;

    return a;
}

/* Half-precision fused multiply-add: unpack, muladd_floats(), round, pack. */
float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
                                                int flags, float_status *status)
{
    FloatParts pa = float16_unpack_canonical(a, status);
    FloatParts pb = float16_unpack_canonical(b, status);
    FloatParts pc = float16_unpack_canonical(c, status);
    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);

    return float16_round_pack_canonical(pr, status);
}

/* Single-precision fused multiply-add: unpack, muladd_floats(), round, pack. */
float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
                                                int flags, float_status *status)
{
    FloatParts pa = float32_unpack_canonical(a, status);
    FloatParts pb = float32_unpack_canonical(b, status);
    FloatParts pc = float32_unpack_canonical(c, status);
    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);

    return float32_round_pack_canonical(pr, status);
}

/* Double-precision fused multiply-add: unpack, muladd_floats(), round, pack. */
float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
                                                int flags, float_status *status)
{
    FloatParts pa = float64_unpack_canonical(a, status);
    FloatParts pb = float64_unpack_canonical(b, status);
    FloatParts pc = float64_unpack_canonical(c, status);
    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);

    return float64_round_pack_canonical(pr, status);
}

/*
 * Returns the result of dividing the floating-point value `a' by the
 * corresponding value `b'. The operation is performed according to
 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t temp_lo, temp_hi;
        int exp = a.exp - b.exp;
        if (a.frac < b.frac) {
            exp -= 1;
            shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
                              &temp_hi, &temp_lo);
        } else {
            shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
                              &temp_hi, &temp_lo);
        }
        /* LSB of quot is set if inexact which roundandpack will use
         * to set flags.
Yet again we re-use a for the result */ 1132 a.frac = div128To64(temp_lo, temp_hi, b.frac); 1133 a.sign = sign; 1134 a.exp = exp; 1135 return a; 1136 } 1137 /* handle all the NaN cases */ 1138 if (is_nan(a.cls) || is_nan(b.cls)) { 1139 return pick_nan(a, b, s); 1140 } 1141 /* 0/0 or Inf/Inf */ 1142 if (a.cls == b.cls 1143 && 1144 (a.cls == float_class_inf || a.cls == float_class_zero)) { 1145 s->float_exception_flags |= float_flag_invalid; 1146 a.cls = float_class_dnan; 1147 return a; 1148 } 1149 /* Div 0 => Inf */ 1150 if (b.cls == float_class_zero) { 1151 s->float_exception_flags |= float_flag_divbyzero; 1152 a.cls = float_class_inf; 1153 a.sign = sign; 1154 return a; 1155 } 1156 /* Inf / x or 0 / x */ 1157 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1158 a.sign = sign; 1159 return a; 1160 } 1161 /* Div by Inf */ 1162 if (b.cls == float_class_inf) { 1163 a.cls = float_class_zero; 1164 a.sign = sign; 1165 return a; 1166 } 1167 g_assert_not_reached(); 1168 } 1169 1170 float16 float16_div(float16 a, float16 b, float_status *status) 1171 { 1172 FloatParts pa = float16_unpack_canonical(a, status); 1173 FloatParts pb = float16_unpack_canonical(b, status); 1174 FloatParts pr = div_floats(pa, pb, status); 1175 1176 return float16_round_pack_canonical(pr, status); 1177 } 1178 1179 float32 float32_div(float32 a, float32 b, float_status *status) 1180 { 1181 FloatParts pa = float32_unpack_canonical(a, status); 1182 FloatParts pb = float32_unpack_canonical(b, status); 1183 FloatParts pr = div_floats(pa, pb, status); 1184 1185 return float32_round_pack_canonical(pr, status); 1186 } 1187 1188 float64 float64_div(float64 a, float64 b, float_status *status) 1189 { 1190 FloatParts pa = float64_unpack_canonical(a, status); 1191 FloatParts pb = float64_unpack_canonical(b, status); 1192 FloatParts pr = div_floats(pa, pb, status); 1193 1194 return float64_round_pack_canonical(pr, status); 1195 } 1196 1197 /* 1198 * Rounds the floating-point value `a' to an integer, 
and returns the
 * result as a floating-point value. The operation is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic.
 *
 * `rounding_mode' is passed explicitly so that callers can either use
 * the mode stored in the status or force a specific one.
 * float_flag_inexact is raised in `s' whenever fraction bits are
 * discarded.
 */

static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
{
    /* NaNs are dealt with up front by return_nan() */
    if (is_nan(a.cls)) {
        return return_nan(a, s);
    }

    switch (a.cls) {
    case float_class_zero:
    case float_class_inf:
    case float_class_qnan:
        /* already "integral" */
        /* NOTE(review): the qnan label looks unreachable given the
         * is_nan() early return above -- confirm against is_nan(). */
        break;
    case float_class_normal:
        if (a.exp >= DECOMPOSED_BINARY_POINT) {
            /* already integral */
            break;
        }
        if (a.exp < 0) {
            /* |a| < 1: the result is either zero or +/-1; `one' selects
             * which */
            bool one;
            /* all fractional */
            s->float_exception_flags |= float_flag_inexact;
            switch (rounding_mode) {
            case float_round_nearest_even:
                /* only values strictly above one half round to 1;
                 * exactly one half goes to 0 */
                one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_ties_away:
                /* one half and above round to 1 */
                one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_to_zero:
                one = false;
                break;
            case float_round_up:
                one = !a.sign;
                break;
            case float_round_down:
                one = a.sign;
                break;
            default:
                g_assert_not_reached();
            }

            if (one) {
                a.frac = DECOMPOSED_IMPLICIT_BIT;
                a.exp = 0;
            } else {
                a.cls = float_class_zero;
            }
        } else {
            /* 0 <= a.exp < DECOMPOSED_BINARY_POINT here.
             * frac_lsb      : the implicit bit shifted right by a.exp
             * frac_lsbm1    : the bit just below frac_lsb
             * rnd_even_mask : frac_lsb and every bit below it
             * rnd_mask      : every bit below frac_lsb (cleared below) */
            uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
            uint64_t frac_lsbm1 = frac_lsb >> 1;
            uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
            uint64_t rnd_mask = rnd_even_mask >> 1;
            uint64_t inc;

            switch (rounding_mode) {
            case float_round_nearest_even:
                /* no increment when frac_lsb is clear and the discarded
                 * bits are exactly frac_lsbm1 (a tie on an even value) */
                inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
                break;
            case float_round_ties_away:
                inc = frac_lsbm1;
                break;
            case float_round_to_zero:
                inc = 0;
                break;
            case float_round_up:
                inc = a.sign ? 0 : rnd_mask;
                break;
            case float_round_down:
                inc = a.sign ? rnd_mask : 0;
                break;
            default:
                g_assert_not_reached();
            }

            /* only touch the value (and raise inexact) when some of the
             * bits under rnd_mask are actually set */
            if (a.frac & rnd_mask) {
                s->float_exception_flags |= float_flag_inexact;
                a.frac += inc;
                a.frac &= ~rnd_mask;
                /* the increment may carry out of the implicit bit */
                if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
                    a.frac >>= 1;
                    a.exp++;
                }
            }
        }
        break;
    default:
        g_assert_not_reached();
    }
    return a;
}

/* Round to integral using the rounding mode stored in the status. */
float16 float16_round_to_int(float16 a, float_status *s)
{
    FloatParts pa = float16_unpack_canonical(a, s);
    FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
    return float16_round_pack_canonical(pr, s);
}

/* Round to integral using the rounding mode stored in the status. */
float32 float32_round_to_int(float32 a, float_status *s)
{
    FloatParts pa = float32_unpack_canonical(a, s);
    FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
    return float32_round_pack_canonical(pr, s);
}

/* Round to integral using the rounding mode stored in the status. */
float64 float64_round_to_int(float64 a, float_status *s)
{
    FloatParts pa = float64_unpack_canonical(a, s);
    FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
    return float64_round_pack_canonical(pr, s);
}

/* Round to integral, always using float_round_to_zero. */
float64 float64_trunc_to_int(float64 a, float_status *s)
{
    FloatParts pa = float64_unpack_canonical(a, s);
    FloatParts pr = round_to_int(pa, float_round_to_zero, s);
    return float64_round_pack_canonical(pr, s);
}

/*
 * Returns the result of converting the floating-point value `a' to
 * the two's complement integer format. The conversion is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic---which means in particular that the conversion is
 * rounded according to the current rounding mode. If `a' is a NaN,
 * the largest positive integer is returned. Otherwise, if the
 * conversion overflows, the largest integer with the same sign as `a'
 * is returned.
1332 */ 1333 1334 static int64_t round_to_int_and_pack(FloatParts in, int rmode, 1335 int64_t min, int64_t max, 1336 float_status *s) 1337 { 1338 uint64_t r; 1339 int orig_flags = get_float_exception_flags(s); 1340 FloatParts p = round_to_int(in, rmode, s); 1341 1342 switch (p.cls) { 1343 case float_class_snan: 1344 case float_class_qnan: 1345 return max; 1346 case float_class_inf: 1347 return p.sign ? min : max; 1348 case float_class_zero: 1349 return 0; 1350 case float_class_normal: 1351 if (p.exp < DECOMPOSED_BINARY_POINT) { 1352 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 1353 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 1354 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 1355 } else { 1356 r = UINT64_MAX; 1357 } 1358 if (p.sign) { 1359 if (r < -(uint64_t) min) { 1360 return -r; 1361 } else { 1362 s->float_exception_flags = orig_flags | float_flag_invalid; 1363 return min; 1364 } 1365 } else { 1366 if (r < max) { 1367 return r; 1368 } else { 1369 s->float_exception_flags = orig_flags | float_flag_invalid; 1370 return max; 1371 } 1372 } 1373 default: 1374 g_assert_not_reached(); 1375 } 1376 } 1377 1378 #define FLOAT_TO_INT(fsz, isz) \ 1379 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \ 1380 float_status *s) \ 1381 { \ 1382 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1383 return round_to_int_and_pack(p, s->float_rounding_mode, \ 1384 INT ## isz ## _MIN, INT ## isz ## _MAX,\ 1385 s); \ 1386 } \ 1387 \ 1388 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \ 1389 (float ## fsz a, float_status *s) \ 1390 { \ 1391 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1392 return round_to_int_and_pack(p, float_round_to_zero, \ 1393 INT ## isz ## _MIN, INT ## isz ## _MAX,\ 1394 s); \ 1395 } 1396 1397 FLOAT_TO_INT(16, 16) 1398 FLOAT_TO_INT(16, 32) 1399 FLOAT_TO_INT(16, 64) 1400 1401 FLOAT_TO_INT(32, 16) 1402 FLOAT_TO_INT(32, 32) 1403 FLOAT_TO_INT(32, 64) 1404 1405 FLOAT_TO_INT(64, 16) 1406 FLOAT_TO_INT(64, 
32) 1407 FLOAT_TO_INT(64, 64) 1408 1409 #undef FLOAT_TO_INT 1410 1411 /* 1412 * Returns the result of converting the floating-point value `a' to 1413 * the unsigned integer format. The conversion is performed according 1414 * to the IEC/IEEE Standard for Binary Floating-Point 1415 * Arithmetic---which means in particular that the conversion is 1416 * rounded according to the current rounding mode. If `a' is a NaN, 1417 * the largest unsigned integer is returned. Otherwise, if the 1418 * conversion overflows, the largest unsigned integer is returned. If 1419 * the 'a' is negative, the result is rounded and zero is returned; 1420 * values that do not round to zero will raise the inexact exception 1421 * flag. 1422 */ 1423 1424 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max, 1425 float_status *s) 1426 { 1427 int orig_flags = get_float_exception_flags(s); 1428 FloatParts p = round_to_int(in, rmode, s); 1429 1430 switch (p.cls) { 1431 case float_class_snan: 1432 case float_class_qnan: 1433 s->float_exception_flags = orig_flags | float_flag_invalid; 1434 return max; 1435 case float_class_inf: 1436 return p.sign ? 0 : max; 1437 case float_class_zero: 1438 return 0; 1439 case float_class_normal: 1440 { 1441 uint64_t r; 1442 if (p.sign) { 1443 s->float_exception_flags = orig_flags | float_flag_invalid; 1444 return 0; 1445 } 1446 1447 if (p.exp < DECOMPOSED_BINARY_POINT) { 1448 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 1449 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 1450 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 1451 } else { 1452 s->float_exception_flags = orig_flags | float_flag_invalid; 1453 return max; 1454 } 1455 1456 /* For uint64 this will never trip, but if p.exp is too large 1457 * to shift a decomposed fraction we shall have exited via the 1458 * 3rd leg above. 
1459 */ 1460 if (r > max) { 1461 s->float_exception_flags = orig_flags | float_flag_invalid; 1462 return max; 1463 } else { 1464 return r; 1465 } 1466 } 1467 default: 1468 g_assert_not_reached(); 1469 } 1470 } 1471 1472 #define FLOAT_TO_UINT(fsz, isz) \ 1473 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \ 1474 float_status *s) \ 1475 { \ 1476 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1477 return round_to_uint_and_pack(p, s->float_rounding_mode, \ 1478 UINT ## isz ## _MAX, s); \ 1479 } \ 1480 \ 1481 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \ 1482 (float ## fsz a, float_status *s) \ 1483 { \ 1484 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1485 return round_to_uint_and_pack(p, s->float_rounding_mode, \ 1486 UINT ## isz ## _MAX, s); \ 1487 } 1488 1489 FLOAT_TO_UINT(16, 16) 1490 FLOAT_TO_UINT(16, 32) 1491 FLOAT_TO_UINT(16, 64) 1492 1493 FLOAT_TO_UINT(32, 16) 1494 FLOAT_TO_UINT(32, 32) 1495 FLOAT_TO_UINT(32, 64) 1496 1497 FLOAT_TO_UINT(64, 16) 1498 FLOAT_TO_UINT(64, 32) 1499 FLOAT_TO_UINT(64, 64) 1500 1501 #undef FLOAT_TO_UINT 1502 1503 /* 1504 * Integer to float conversions 1505 * 1506 * Returns the result of converting the two's complement integer `a' 1507 * to the floating-point format. The conversion is performed according 1508 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1509 */ 1510 1511 static FloatParts int_to_float(int64_t a, float_status *status) 1512 { 1513 FloatParts r; 1514 if (a == 0) { 1515 r.cls = float_class_zero; 1516 r.sign = false; 1517 } else if (a == (1ULL << 63)) { 1518 r.cls = float_class_normal; 1519 r.sign = true; 1520 r.frac = DECOMPOSED_IMPLICIT_BIT; 1521 r.exp = 63; 1522 } else { 1523 uint64_t f; 1524 if (a < 0) { 1525 f = -a; 1526 r.sign = true; 1527 } else { 1528 f = a; 1529 r.sign = false; 1530 } 1531 int shift = clz64(f) - 1; 1532 r.cls = float_class_normal; 1533 r.exp = (DECOMPOSED_BINARY_POINT - shift); 1534 r.frac = f << shift; 1535 } 1536 1537 return r; 1538 } 1539 1540 float16 int64_to_float16(int64_t a, float_status *status) 1541 { 1542 FloatParts pa = int_to_float(a, status); 1543 return float16_round_pack_canonical(pa, status); 1544 } 1545 1546 float16 int32_to_float16(int32_t a, float_status *status) 1547 { 1548 return int64_to_float16(a, status); 1549 } 1550 1551 float16 int16_to_float16(int16_t a, float_status *status) 1552 { 1553 return int64_to_float16(a, status); 1554 } 1555 1556 float32 int64_to_float32(int64_t a, float_status *status) 1557 { 1558 FloatParts pa = int_to_float(a, status); 1559 return float32_round_pack_canonical(pa, status); 1560 } 1561 1562 float32 int32_to_float32(int32_t a, float_status *status) 1563 { 1564 return int64_to_float32(a, status); 1565 } 1566 1567 float32 int16_to_float32(int16_t a, float_status *status) 1568 { 1569 return int64_to_float32(a, status); 1570 } 1571 1572 float64 int64_to_float64(int64_t a, float_status *status) 1573 { 1574 FloatParts pa = int_to_float(a, status); 1575 return float64_round_pack_canonical(pa, status); 1576 } 1577 1578 float64 int32_to_float64(int32_t a, float_status *status) 1579 { 1580 return int64_to_float64(a, status); 1581 } 1582 1583 float64 int16_to_float64(int16_t a, float_status *status) 1584 { 1585 return int64_to_float64(a, status); 1586 } 1587 1588 1589 /* 1590 * Unsigned Integer to float conversions 1591 * 1592 * 
Returns the result of converting the unsigned integer `a' to the
 * floating-point format. The conversion is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 *
 * `status' is not used by uint_to_float() itself; rounding to the
 * destination format is done by the *_round_pack_canonical() call in
 * the wrappers below.
 */

static FloatParts uint_to_float(uint64_t a, float_status *status)
{
    FloatParts r = { .sign = false};

    if (a == 0) {
        r.cls = float_class_zero;
    } else {
        /* spare_bits is negative when clz64() reports no leading zeros;
         * in that case the value is shifted right instead of left, with
         * shift64RightJamming() handling the bit that is shifted out */
        int spare_bits = clz64(a) - 1;
        r.cls = float_class_normal;
        r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
        if (spare_bits < 0) {
            shift64RightJamming(a, -spare_bits, &a);
            r.frac = a;
        } else {
            r.frac = a << spare_bits;
        }
    }

    return r;
}

/* Unsigned integer -> float16; narrower inputs widen to uint64_t first. */
float16 uint64_to_float16(uint64_t a, float_status *status)
{
    FloatParts pa = uint_to_float(a, status);
    return float16_round_pack_canonical(pa, status);
}

float16 uint32_to_float16(uint32_t a, float_status *status)
{
    return uint64_to_float16(a, status);
}

float16 uint16_to_float16(uint16_t a, float_status *status)
{
    return uint64_to_float16(a, status);
}

/* Unsigned integer -> float32; narrower inputs widen to uint64_t first. */
float32 uint64_to_float32(uint64_t a, float_status *status)
{
    FloatParts pa = uint_to_float(a, status);
    return float32_round_pack_canonical(pa, status);
}

float32 uint32_to_float32(uint32_t a, float_status *status)
{
    return uint64_to_float32(a, status);
}

float32 uint16_to_float32(uint16_t a, float_status *status)
{
    return uint64_to_float32(a, status);
}

/* Unsigned integer -> float64; narrower inputs widen to uint64_t first. */
float64 uint64_to_float64(uint64_t a, float_status *status)
{
    FloatParts pa = uint_to_float(a, status);
    return float64_round_pack_canonical(pa, status);
}

float64 uint32_to_float64(uint32_t a, float_status *status)
{
    return uint64_to_float64(a, status);
}

float64 uint16_to_float64(uint16_t a, float_status *status)
{
    return uint64_to_float64(a, status);
}

/*----------------------------------------------------------------------------
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
| and 7, and returns the properly rounded 32-bit integer corresponding to the
| input. If `zSign' is 1, the input is negated before being converted to an
| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
| is simply rounded to an integer, with the inexact exception raised if the
| input cannot be represented exactly as an integer. However, if the fixed-
| point input is too large, the invalid exception is raised and the largest
| positive or negative integer is returned.
*----------------------------------------------------------------------------*/

static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven;
    int8_t roundIncrement, roundBits;
    int32_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* pick the value added to the 7 fraction bits before truncation */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    default:
        abort();
    }
    roundBits = absZ & 0x7F;
    absZ = ( absZ + roundIncrement )>>7;
    /* nearest-even only: on an exact tie (fraction == 0x40) clear the
     * low bit of the result */
    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    z = absZ;
    if ( zSign ) z = - z;
    /* invalid if the magnitude needs more than 32 bits or the sign of z
     * does not match the requested sign */
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
        float_raise(float_flag_invalid, status);
        return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return z;

}

/*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit integer corresponding to the input.
| If `zSign' is 1, the input is negated before being converted to an integer.
| Ordinarily, the fixed-point input is simply rounded to an integer, with
| the inexact exception raised if the input cannot be represented exactly as
| an integer. However, if the fixed-point input is too large, the invalid
| exception is raised and the largest positive or negative integer is
| returned.
*----------------------------------------------------------------------------*/

static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
                               float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment;
    int64_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* decide whether the integer part absZ0 is bumped by one */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* top bit of the fraction word set: fraction >= one half */
        increment = ((int64_t) absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    default:
        abort();
    }
    if ( increment ) {
        ++absZ0;
        /* wrapped to zero: magnitude no longer fits in 64 bits */
        if ( absZ0 == 0 ) goto overflow;
        /* nearest-even only: when the bits below the top fraction bit
         * are all zero (an exact tie) clear the low result bit */
        absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
    }
    z = absZ0;
    if ( zSign ) z = - z;
    /* sign of z disagreeing with zSign means the value did not fit */
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 overflow:
        float_raise(float_flag_invalid, status);
        return
              zSign ? (int64_t) LIT64( 0x8000000000000000 )
            : LIT64( 0x7FFFFFFFFFFFFFFF );
    }
    if (absZ1) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return z;

}

/*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit unsigned integer corresponding to the
| input. Ordinarily, the fixed-point input is simply rounded to an integer,
| with the inexact exception raised if the input cannot be represented exactly
| as an integer. However, if the fixed-point input is too large, the invalid
| exception is raised and the largest unsigned integer is returned.
| A negative input (`zSign' set) whose rounded magnitude is non-zero raises
| the invalid exception and returns zero.
*----------------------------------------------------------------------------*/

/* NOTE(review): the declared return type is int64_t although the values
 * returned are unsigned 64-bit quantities -- confirm with the callers. */
static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
                                uint64_t absZ1, float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = (roundingMode == float_round_nearest_even);
    /* decide whether the integer part absZ0 is bumped by one */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    default:
        abort();
    }
    if (increment) {
        ++absZ0;
        /* wrapped to zero: the rounded value needs a 65th bit */
        if (absZ0 == 0) {
            float_raise(float_flag_invalid, status);
            return LIT64(0xFFFFFFFFFFFFFFFF);
        }
        /* nearest-even only: exact tie clears the low result bit */
        absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
    }

    if (zSign && absZ0) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    if (absZ1) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return absZ0;
}

/*----------------------------------------------------------------------------
| If `a' is denormal and we are in flush-to-zero mode then set the
| input-denormal exception and return zero. Otherwise just return the value.
*----------------------------------------------------------------------------*/
float32 float32_squash_input_denormal(float32 a, float_status *status)
{
    if (status->flush_inputs_to_zero) {
        /* zero exponent field with a non-zero fraction field */
        if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
            float_raise(float_flag_input_denormal, status);
            /* keep only bit 31, so the returned zero has the sign of `a' */
            return make_float32(float32_val(a) & 0x80000000);
        }
    }
    return a;
}

/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision floating-point value represented
| by the denormalized significand `aSig'. The normalized exponent and
| significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    int8_t shiftCount;

    /* shift left by (leading zero count - 8) and lower the exponent by
     * the same amount */
    shiftCount = countLeadingZeros32( aSig ) - 8;
    *zSigPtr = aSig<<shiftCount;
    *zExpPtr = 1 - shiftCount;

}

/*----------------------------------------------------------------------------
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
| single-precision floating-point value, returning the result. After being
| shifted into the proper positions, the three fields are simply added
| together to form the result. This means that any integer portion of `zSig'
| will be added into the exponent.
Since a properly normalized significand 1872 | will have an integer portion equal to 1, the `zExp' input should be 1 less 1873 | than the desired result exponent whenever `zSig' is a complete, normalized 1874 | significand. 1875 *----------------------------------------------------------------------------*/ 1876 1877 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig) 1878 { 1879 1880 return make_float32( 1881 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig); 1882 1883 } 1884 1885 /*---------------------------------------------------------------------------- 1886 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1887 | and significand `zSig', and returns the proper single-precision floating- 1888 | point value corresponding to the abstract input. Ordinarily, the abstract 1889 | value is simply rounded and packed into the single-precision format, with 1890 | the inexact exception raised if the abstract input cannot be represented 1891 | exactly. However, if the abstract value is too large, the overflow and 1892 | inexact exceptions are raised and an infinity or maximal finite value is 1893 | returned. If the abstract value is too small, the input value is rounded to 1894 | a subnormal number, and the underflow and inexact exceptions are raised if 1895 | the abstract input cannot be represented exactly as a subnormal single- 1896 | precision floating-point number. 1897 | The input significand `zSig' has its binary point between bits 30 1898 | and 29, which is 7 bits to the left of the usual location. This shifted 1899 | significand must be normalized or smaller. If `zSig' is not normalized, 1900 | `zExp' must be 0; in that case, the result returned is a subnormal number, 1901 | and it must not require rounding. In the usual case that `zSig' is 1902 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven;
    int8_t roundIncrement, roundBits;
    flag isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* pick the value added to the 7 guard bits before they are dropped */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    default:
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    /* the unsigned cast lets one comparison catch both zExp >= 0xFD and
     * negative zExp */
    if ( 0xFD <= (uint16_t) zExp ) {
        if (    ( 0xFD < zExp )
             || (    ( zExp == 0xFD )
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* with no rounding increment the all-ones value is added to
             * the 0xFF exponent field, which yields the largest finite
             * number; otherwise 0 is added and infinity is returned */
            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            isTiny =
                (status->float_detect_tininess
                 == float_tininess_before_rounding)
                || ( zExp < -1 )
                || ( zSig + roundIncrement < 0x80000000 );
            /* denormalise: shift right by -zExp, then recompute the
             * guard bits from the shifted significand */
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig = ( zSig + roundIncrement )>>7;
    /* nearest-even only: on an exact tie (guard bits == 0x40) clear the
     * low bit of the result */
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper single-precision floating-
| point value corresponding to the abstract input. This routine is just like
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
| floating-point exponent.
*----------------------------------------------------------------------------*/

static float32
 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
                              float_status *status)
{
    int8_t shiftCount;

    /* shift left by (leading zero count - 1) and lower the exponent by
     * the same amount before rounding */
    shiftCount = countLeadingZeros32( zSig ) - 1;
    return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
                               status);

}

/*----------------------------------------------------------------------------
| If `a' is denormal and we are in flush-to-zero mode then set the
| input-denormal exception and return zero. Otherwise just return the value.
*----------------------------------------------------------------------------*/
float64 float64_squash_input_denormal(float64 a, float_status *status)
{
    if (status->flush_inputs_to_zero) {
        /* zero exponent field with a non-zero fraction field */
        if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
            float_raise(float_flag_input_denormal, status);
            /* keep only bit 63, so the returned zero has the sign of `a' */
            return make_float64(float64_val(a) & (1ULL << 63));
        }
    }
    return a;
}

/*----------------------------------------------------------------------------
| Normalizes the subnormal double-precision floating-point value represented
| by the denormalized significand `aSig'.
The normalized exponent and 2011 | significand are stored at the locations pointed to by `zExpPtr' and 2012 | `zSigPtr', respectively. 2013 *----------------------------------------------------------------------------*/ 2014 2015 static void 2016 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 2017 { 2018 int8_t shiftCount; 2019 2020 shiftCount = countLeadingZeros64( aSig ) - 11; 2021 *zSigPtr = aSig<<shiftCount; 2022 *zExpPtr = 1 - shiftCount; 2023 2024 } 2025 2026 /*---------------------------------------------------------------------------- 2027 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 2028 | double-precision floating-point value, returning the result. After being 2029 | shifted into the proper positions, the three fields are simply added 2030 | together to form the result. This means that any integer portion of `zSig' 2031 | will be added into the exponent. Since a properly normalized significand 2032 | will have an integer portion equal to 1, the `zExp' input should be 1 less 2033 | than the desired result exponent whenever `zSig' is a complete, normalized 2034 | significand. 2035 *----------------------------------------------------------------------------*/ 2036 2037 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 2038 { 2039 2040 return make_float64( 2041 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 2042 2043 } 2044 2045 /*---------------------------------------------------------------------------- 2046 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2047 | and significand `zSig', and returns the proper double-precision floating- 2048 | point value corresponding to the abstract input. Ordinarily, the abstract 2049 | value is simply rounded and packed into the double-precision format, with 2050 | the inexact exception raised if the abstract input cannot be represented 2051 | exactly. 
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper double-precision floating-
| point value corresponding to the abstract input.  Ordinarily, the abstract
| value is simply rounded and packed into the double-precision format, with
| the inexact exception raised if the abstract input cannot be represented
| exactly.  However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal double-
| precision floating-point number.
|     The input significand `zSig' has its binary point between bits 62
| and 61, which is 10 bits to the left of the usual location.  This shifted
| significand must be normalized or smaller.  If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding.  In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|     The rounding mode, tininess-detection mode and flush-to-zero setting are
| read from `status'; exception flags are accumulated into `status'.  An
| unrecognized rounding mode calls abort().
*----------------------------------------------------------------------------*/

static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven;
    int roundIncrement, roundBits;
    flag isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * Pick the value added to zSig before its low 10 bits are dropped:
     * 0x200 is half of the discarded range, 0x3ff rounds any non-zero
     * discarded bits away from zero, 0 truncates.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        /* Bit 10 becomes the result LSB; only round up when it is even. */
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    /* The 10 bits that the final `>>10' discards. */
    roundBits = zSig & 0x3FF;
    /*
     * The uint16_t cast makes small negative exponents compare as large
     * values, so this single test selects both the overflow candidates
     * (zExp >= 0x7FD) and the underflow candidates (zExp < 0).
     */
    if ( 0x7FD <= (uint16_t) zExp ) {
        /*
         * Overflow: exponent too large, or exactly 0x7FD with the rounded
         * significand carrying into bit 63.
         */
        if (    ( 0x7FD < zExp )
             || (    ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /*
             * The third argument is 0 (infinity) or all ones.  Because
             * packFloat64 adds its fields, all ones subtracts 1 from the
             * 0x7FF exponent pattern, giving exponent 0x7FE with an
             * all-ones fraction.
             */
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            /*
             * The value counts as tiny when tininess is detected before
             * rounding, when the exponent is below -1, or when adding the
             * round increment does not carry into bit 63.
             */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < -1 )
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
            /*
             * Denormalize by -zExp bits.  NOTE(review): helper is defined
             * elsewhere; presumably shifted-out bits are kept as a sticky
             * LSB - confirm.
             */
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
            /* Underflow is raised only when the tiny result is inexact. */
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig = ( zSig + roundIncrement )>>10;
    /*
     * Round-to-nearest-even tie: when the discarded bits were exactly 0x200,
     * clear the LSB of the result.
     */
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
    /* A zero significand is packed with a zero exponent field. */
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );

}
2148 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 2149 | floating-point exponent. 2150 *----------------------------------------------------------------------------*/ 2151 2152 static float64 2153 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 2154 float_status *status) 2155 { 2156 int8_t shiftCount; 2157 2158 shiftCount = countLeadingZeros64( zSig ) - 1; 2159 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 2160 status); 2161 2162 } 2163 2164 /*---------------------------------------------------------------------------- 2165 | Returns the fraction bits of the extended double-precision floating-point 2166 | value `a'. 2167 *----------------------------------------------------------------------------*/ 2168 2169 static inline uint64_t extractFloatx80Frac( floatx80 a ) 2170 { 2171 2172 return a.low; 2173 2174 } 2175 2176 /*---------------------------------------------------------------------------- 2177 | Returns the exponent bits of the extended double-precision floating-point 2178 | value `a'. 2179 *----------------------------------------------------------------------------*/ 2180 2181 static inline int32_t extractFloatx80Exp( floatx80 a ) 2182 { 2183 2184 return a.high & 0x7FFF; 2185 2186 } 2187 2188 /*---------------------------------------------------------------------------- 2189 | Returns the sign bit of the extended double-precision floating-point value 2190 | `a'. 2191 *----------------------------------------------------------------------------*/ 2192 2193 static inline flag extractFloatx80Sign( floatx80 a ) 2194 { 2195 2196 return a.high>>15; 2197 2198 } 2199 2200 /*---------------------------------------------------------------------------- 2201 | Normalizes the subnormal extended double-precision floating-point value 2202 | represented by the denormalized significand `aSig'. 
The normalized exponent 2203 | and significand are stored at the locations pointed to by `zExpPtr' and 2204 | `zSigPtr', respectively. 2205 *----------------------------------------------------------------------------*/ 2206 2207 static void 2208 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr ) 2209 { 2210 int8_t shiftCount; 2211 2212 shiftCount = countLeadingZeros64( aSig ); 2213 *zSigPtr = aSig<<shiftCount; 2214 *zExpPtr = 1 - shiftCount; 2215 2216 } 2217 2218 /*---------------------------------------------------------------------------- 2219 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 2220 | extended double-precision floating-point value, returning the result. 2221 *----------------------------------------------------------------------------*/ 2222 2223 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig ) 2224 { 2225 floatx80 z; 2226 2227 z.low = zSig; 2228 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp; 2229 return z; 2230 2231 } 2232 2233 /*---------------------------------------------------------------------------- 2234 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2235 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 2236 | and returns the proper extended double-precision floating-point value 2237 | corresponding to the abstract input. Ordinarily, the abstract value is 2238 | rounded and packed into the extended double-precision format, with the 2239 | inexact exception raised if the abstract input cannot be represented 2240 | exactly. However, if the abstract value is too large, the overflow and 2241 | inexact exceptions are raised and an infinity or maximal finite value is 2242 | returned. 
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input.  Ordinarily, the abstract value is
| rounded and packed into the extended double-precision format, with the
| inexact exception raised if the abstract input cannot be represented
| exactly.  However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal extended
| double-precision floating-point number.
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
| number of bits as single or double precision, respectively.  Otherwise, the
| result is rounded to the full precision of the extended double-precision
| format.
|     The input significand must be normalized or smaller.  If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding.  The
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|     NOTE(review): neither rounding switch below has a case for
| `float_round_to_odd', so that mode reaches `default: abort()' here.
*----------------------------------------------------------------------------*/

static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
                                     int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                                     float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    if ( roundingPrecision == 80 ) goto precision80;
    /*
     * Reduced precision: roundMask covers the low bits of zSig0 that are
     * discarded (11 bits for 64, 40 bits for 32) and roundIncrement starts
     * as half of that range.
     */
    if ( roundingPrecision == 64 ) {
        roundIncrement = LIT64( 0x0000000000000400 );
        roundMask = LIT64( 0x00000000000007FF );
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = LIT64( 0x0000008000000000 );
        roundMask = LIT64( 0x000000FFFFFFFFFF );
    }
    else {
        /* Any other precision value is treated as full precision. */
        goto precision80;
    }
    /* Fold a non-zero zSig1 into the LSB of zSig0 as a sticky bit. */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /*
     * With the unsigned cast, zExp <= 0 wraps to a large value, so this
     * test is true both for zExp >= 0x7FFE and for zExp <= 0.
     */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        /*
         * Overflow: exponent too large, or exactly 0x7FFE with the rounding
         * addition wrapping around 64 bits.
         */
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            /* Shares the overflow code of the full-precision path below. */
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            /*
             * Tiny when detecting before rounding, when zExp is negative,
             * or when the rounding addition does not wrap.
             */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ( zSig0 <= zSig0 + roundIncrement );
            /*
             * Denormalize by 1 - zExp bits.  NOTE(review): helper defined
             * elsewhere; presumably keeps shifted-out bits as a sticky LSB.
             */
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                status->float_exception_flags |= float_flag_inexact;
            }
            zSig0 += roundIncrement;
            /* Rounding set bit 63: the result is packed with exponent 1. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            /*
             * roundMask + 1 is the LSB of the kept bits.  On an exact tie
             * under nearest-even that bit is added to the mask so it is
             * cleared together with the discarded bits.
             */
            roundIncrement = roundMask + 1;
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig0 += roundIncrement;
    /* The addition wrapped: bump the exponent and reset the significand. */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = LIT64( 0x8000000000000000 );
    }
    /* Same tie-to-even handling as in the subnormal branch above. */
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /*
     * Full precision: zSig1 holds the bits below the result.  `increment'
     * says whether zSig0 must be bumped by one.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Top bit of zSig1 set means at least half an LSB. */
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
                  && increment
                )
           ) {
            /* Full precision: ~roundMask below becomes all ones. */
            roundMask = 0;
 overflow:
            /*
             * Reached from both paths; the reduced-precision path arrives
             * with its own roundMask, so ~roundMask keeps only the bits of
             * that precision.
             */
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Modes that round toward zero for this sign: largest finite. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            /* Otherwise: exponent 0x7FFF with only bit 63 set. */
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
        }
        if ( zExp <= 0 ) {
            /*
             * NOTE(review): unlike the reduced-precision branch, this branch
             * does not test status->flush_to_zero - confirm this is intended.
             */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ! increment
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                status->float_exception_flags |= float_flag_inexact;
            }
            /* zSig1 changed in the shift, so recompute the increment. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /*
                 * Exact tie (zSig1 is only its top bit) under nearest-even:
                 * clear the LSB.
                 */
                zSig0 &=
                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        status->float_exception_flags |= float_flag_inexact;
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Significand wrapped: bump the exponent, set only bit 63. */
            ++zExp;
            zSig0 = LIT64( 0x8000000000000000 );
        }
        else {
            /* Tie-to-even, as in the subnormal branch. */
            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input.  This routine is just like
| `roundAndPackFloatx80' except that the input significand does not have to be
| normalized.
*----------------------------------------------------------------------------*/

static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
                                              flag zSign, int32_t zExp,
                                              uint64_t zSig0, uint64_t zSig1,
                                              float_status *status)
{
    int8_t shiftCount;

    /* Upper word empty: move the lower word up, a shift of 64 bits. */
    if ( zSig0 == 0 ) {
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    /* Shift the pair left by the leading-zero count of zSig0. */
    shiftCount = countLeadingZeros64( zSig0 );
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    zExp -= shiftCount;
    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
                                zSig0, zSig1, status);

}
2476 *----------------------------------------------------------------------------*/ 2477 2478 static inline uint64_t extractFloat128Frac1( float128 a ) 2479 { 2480 2481 return a.low; 2482 2483 } 2484 2485 /*---------------------------------------------------------------------------- 2486 | Returns the most-significant 48 fraction bits of the quadruple-precision 2487 | floating-point value `a'. 2488 *----------------------------------------------------------------------------*/ 2489 2490 static inline uint64_t extractFloat128Frac0( float128 a ) 2491 { 2492 2493 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 2494 2495 } 2496 2497 /*---------------------------------------------------------------------------- 2498 | Returns the exponent bits of the quadruple-precision floating-point value 2499 | `a'. 2500 *----------------------------------------------------------------------------*/ 2501 2502 static inline int32_t extractFloat128Exp( float128 a ) 2503 { 2504 2505 return ( a.high>>48 ) & 0x7FFF; 2506 2507 } 2508 2509 /*---------------------------------------------------------------------------- 2510 | Returns the sign bit of the quadruple-precision floating-point value `a'. 2511 *----------------------------------------------------------------------------*/ 2512 2513 static inline flag extractFloat128Sign( float128 a ) 2514 { 2515 2516 return a.high>>63; 2517 2518 } 2519 2520 /*---------------------------------------------------------------------------- 2521 | Normalizes the subnormal quadruple-precision floating-point value 2522 | represented by the denormalized significand formed by the concatenation of 2523 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 2524 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 2525 | significand are stored at the location pointed to by `zSig0Ptr', and the 2526 | least significant 64 bits of the normalized significand are stored at the 2527 | location pointed to by `zSig1Ptr'. 
/*----------------------------------------------------------------------------
| Normalizes the subnormal quadruple-precision value whose denormalized
| significand is the concatenation of `aSig0' and `aSig1'.  The normalized
| exponent is stored through `zExpPtr'.  The most significant 49 bits of the
| normalized significand are stored through `zSig0Ptr' and the least
| significant 64 bits through `zSig1Ptr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t normShift;

    if (aSig0 != 0) {
        /* Leading 1 is in the upper word: one short left shift suffices. */
        normShift = countLeadingZeros64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, normShift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - normShift;
        return;
    }

    /* Upper word is empty: the significand lives entirely in aSig1. */
    normShift = countLeadingZeros64(aSig1) - 15;
    if (normShift >= 0) {
        /* Everything fits in the upper result word. */
        *zSig0Ptr = aSig1 << normShift;
        *zSig1Ptr = 0;
    } else {
        /* Leading 1 is above bit 48: split aSig1 across both words. */
        *zSig0Ptr = aSig1 >> (-normShift);
        *zSig1Ptr = aSig1 << (normShift & 63);
    }
    *zExpPtr = -normShift - 63;
}
2572 *----------------------------------------------------------------------------*/ 2573 2574 static inline float128 2575 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 2576 { 2577 float128 z; 2578 2579 z.low = zSig1; 2580 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 2581 return z; 2582 2583 } 2584 2585 /*---------------------------------------------------------------------------- 2586 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2587 | and extended significand formed by the concatenation of `zSig0', `zSig1', 2588 | and `zSig2', and returns the proper quadruple-precision floating-point value 2589 | corresponding to the abstract input. Ordinarily, the abstract value is 2590 | simply rounded and packed into the quadruple-precision format, with the 2591 | inexact exception raised if the abstract input cannot be represented 2592 | exactly. However, if the abstract value is too large, the overflow and 2593 | inexact exceptions are raised and an infinity or maximal finite value is 2594 | returned. If the abstract value is too small, the input value is rounded to 2595 | a subnormal number, and the underflow and inexact exceptions are raised if 2596 | the abstract input cannot be represented exactly as a subnormal quadruple- 2597 | precision floating-point number. 2598 | The input significand must be normalized or smaller. If the input 2599 | significand is not normalized, `zExp' must be 0; in that case, the result 2600 | returned is a subnormal number, and it must not require rounding. In the 2601 | usual case that the input significand is normalized, `zExp' must be 1 less 2602 | than the ``true'' floating-point exponent. The handling of underflow and 2603 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and extended significand formed by the concatenation of `zSig0', `zSig1',
| and `zSig2', and returns the proper quadruple-precision floating-point value
| corresponding to the abstract input.  Ordinarily, the abstract value is
| simply rounded and packed into the quadruple-precision format, with the
| inexact exception raised if the abstract input cannot be represented
| exactly.  However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal quadruple-
| precision floating-point number.
|     The input significand must be normalized or smaller.  If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding.  In the
| usual case that the input significand is normalized, `zExp' must be 1 less
| than the ``true'' floating-point exponent.  The handling of underflow and
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|     The rounding mode, tininess-detection mode and flush-to-zero setting are
| read from `status'; exception flags are accumulated into `status'.  An
| unrecognized rounding mode calls abort().
*----------------------------------------------------------------------------*/

static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * zSig2 holds the bits below the result.  `increment' says whether the
     * 128-bit significand zSig0:zSig1 must be bumped by one.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Top bit of zSig2 set means at least half an LSB. */
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        /* Inexact and the current LSB is even: bump to make it odd. */
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    /*
     * With the unsigned cast, negative exponents compare as large values, so
     * this test is true both for zExp >= 0x7FFD and for zExp < 0.
     */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        /*
         * Overflow: exponent too large, or exactly 0x7FFD with an all-ones
         * significand that is about to be incremented.
         */
        if (    ( 0x7FFD < zExp )
             || (    ( zExp == 0x7FFD )
                  && eq128(
                         LIT64( 0x0001FFFFFFFFFFFF ),
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /*
             * Modes that round toward zero for this sign, and round-to-odd,
             * return exponent 0x7FFE with an all-ones fraction.
             */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        LIT64( 0x0000FFFFFFFFFFFF ),
                        LIT64( 0xFFFFFFFFFFFFFFFF )
                    );
            }
            /* Otherwise: exponent 0x7FFF with a zero fraction. */
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            /*
             * Tiny when detecting before rounding, when zExp is below -1,
             * when no increment happens, or when the significand is below
             * the all-ones pattern.
             */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < -1 )
                || ! increment
                || lt128(
                       zSig0,
                       zSig1,
                       LIT64( 0x0001FFFFFFFFFFFF ),
                       LIT64( 0xFFFFFFFFFFFFFFFF )
                   );
            /*
             * Denormalize by -zExp bits.  NOTE(review): helper defined
             * elsewhere; presumably shifted-out bits are collected in zSig2
             * with a sticky LSB - confirm.
             */
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* The significand changed in the shift: recompute increment. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        status->float_exception_flags |= float_flag_inexact;
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /*
         * Exact tie (zSig2 is only its top bit) under nearest-even: clear
         * the LSB of the result.
         */
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
    }
    else {
        /* A zero significand is packed with a zero exponent field. */
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}
2729 *----------------------------------------------------------------------------*/ 2730 2731 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 2732 uint64_t zSig0, uint64_t zSig1, 2733 float_status *status) 2734 { 2735 int8_t shiftCount; 2736 uint64_t zSig2; 2737 2738 if ( zSig0 == 0 ) { 2739 zSig0 = zSig1; 2740 zSig1 = 0; 2741 zExp -= 64; 2742 } 2743 shiftCount = countLeadingZeros64( zSig0 ) - 15; 2744 if ( 0 <= shiftCount ) { 2745 zSig2 = 0; 2746 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 2747 } 2748 else { 2749 shift128ExtraRightJamming( 2750 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 2751 } 2752 zExp -= shiftCount; 2753 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 2754 2755 } 2756 2757 2758 /*---------------------------------------------------------------------------- 2759 | Returns the result of converting the 32-bit two's complement integer `a' 2760 | to the extended double-precision floating-point format. The conversion 2761 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 2762 | Arithmetic. 2763 *----------------------------------------------------------------------------*/ 2764 2765 floatx80 int32_to_floatx80(int32_t a, float_status *status) 2766 { 2767 flag zSign; 2768 uint32_t absA; 2769 int8_t shiftCount; 2770 uint64_t zSig; 2771 2772 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 2773 zSign = ( a < 0 ); 2774 absA = zSign ? - a : a; 2775 shiftCount = countLeadingZeros32( absA ) + 32; 2776 zSig = absA; 2777 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 2778 2779 } 2780 2781 /*---------------------------------------------------------------------------- 2782 | Returns the result of converting the 32-bit two's complement integer `a' to 2783 | the quadruple-precision floating-point format. The conversion is performed 2784 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2785 *----------------------------------------------------------------------------*/ 2786 2787 float128 int32_to_float128(int32_t a, float_status *status) 2788 { 2789 flag zSign; 2790 uint32_t absA; 2791 int8_t shiftCount; 2792 uint64_t zSig0; 2793 2794 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 2795 zSign = ( a < 0 ); 2796 absA = zSign ? - a : a; 2797 shiftCount = countLeadingZeros32( absA ) + 17; 2798 zSig0 = absA; 2799 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 2800 2801 } 2802 2803 /*---------------------------------------------------------------------------- 2804 | Returns the result of converting the 64-bit two's complement integer `a' 2805 | to the extended double-precision floating-point format. The conversion 2806 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 2807 | Arithmetic. 2808 *----------------------------------------------------------------------------*/ 2809 2810 floatx80 int64_to_floatx80(int64_t a, float_status *status) 2811 { 2812 flag zSign; 2813 uint64_t absA; 2814 int8_t shiftCount; 2815 2816 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 2817 zSign = ( a < 0 ); 2818 absA = zSign ? - a : a; 2819 shiftCount = countLeadingZeros64( absA ); 2820 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 2821 2822 } 2823 2824 /*---------------------------------------------------------------------------- 2825 | Returns the result of converting the 64-bit two's complement integer `a' to 2826 | the quadruple-precision floating-point format. The conversion is performed 2827 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2828 *----------------------------------------------------------------------------*/ 2829 2830 float128 int64_to_float128(int64_t a, float_status *status) 2831 { 2832 flag zSign; 2833 uint64_t absA; 2834 int8_t shiftCount; 2835 int32_t zExp; 2836 uint64_t zSig0, zSig1; 2837 2838 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 2839 zSign = ( a < 0 ); 2840 absA = zSign ? - a : a; 2841 shiftCount = countLeadingZeros64( absA ) + 49; 2842 zExp = 0x406E - shiftCount; 2843 if ( 64 <= shiftCount ) { 2844 zSig1 = 0; 2845 zSig0 = absA; 2846 shiftCount -= 64; 2847 } 2848 else { 2849 zSig1 = absA; 2850 zSig0 = 0; 2851 } 2852 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 2853 return packFloat128( zSign, zExp, zSig0, zSig1 ); 2854 2855 } 2856 2857 /*---------------------------------------------------------------------------- 2858 | Returns the result of converting the 64-bit unsigned integer `a' 2859 | to the quadruple-precision floating-point format. The conversion is performed 2860 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2861 *----------------------------------------------------------------------------*/ 2862 2863 float128 uint64_to_float128(uint64_t a, float_status *status) 2864 { 2865 if (a == 0) { 2866 return float128_zero; 2867 } 2868 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status); 2869 } 2870 2871 2872 2873 2874 /*---------------------------------------------------------------------------- 2875 | Returns the result of converting the single-precision floating-point value 2876 | `a' to the double-precision floating-point format. The conversion is 2877 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2878 | Arithmetic. 
2879 *----------------------------------------------------------------------------*/ 2880 2881 float64 float32_to_float64(float32 a, float_status *status) 2882 { 2883 flag aSign; 2884 int aExp; 2885 uint32_t aSig; 2886 a = float32_squash_input_denormal(a, status); 2887 2888 aSig = extractFloat32Frac( a ); 2889 aExp = extractFloat32Exp( a ); 2890 aSign = extractFloat32Sign( a ); 2891 if ( aExp == 0xFF ) { 2892 if (aSig) { 2893 return commonNaNToFloat64(float32ToCommonNaN(a, status), status); 2894 } 2895 return packFloat64( aSign, 0x7FF, 0 ); 2896 } 2897 if ( aExp == 0 ) { 2898 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 2899 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2900 --aExp; 2901 } 2902 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 ); 2903 2904 } 2905 2906 /*---------------------------------------------------------------------------- 2907 | Returns the result of converting the single-precision floating-point value 2908 | `a' to the extended double-precision floating-point format. The conversion 2909 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 2910 | Arithmetic. 
2911 *----------------------------------------------------------------------------*/ 2912 2913 floatx80 float32_to_floatx80(float32 a, float_status *status) 2914 { 2915 flag aSign; 2916 int aExp; 2917 uint32_t aSig; 2918 2919 a = float32_squash_input_denormal(a, status); 2920 aSig = extractFloat32Frac( a ); 2921 aExp = extractFloat32Exp( a ); 2922 aSign = extractFloat32Sign( a ); 2923 if ( aExp == 0xFF ) { 2924 if (aSig) { 2925 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 2926 } 2927 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2928 } 2929 if ( aExp == 0 ) { 2930 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 2931 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2932 } 2933 aSig |= 0x00800000; 2934 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 2935 2936 } 2937 2938 /*---------------------------------------------------------------------------- 2939 | Returns the result of converting the single-precision floating-point value 2940 | `a' to the double-precision floating-point format. The conversion is 2941 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2942 | Arithmetic. 
2943 *----------------------------------------------------------------------------*/ 2944 2945 float128 float32_to_float128(float32 a, float_status *status) 2946 { 2947 flag aSign; 2948 int aExp; 2949 uint32_t aSig; 2950 2951 a = float32_squash_input_denormal(a, status); 2952 aSig = extractFloat32Frac( a ); 2953 aExp = extractFloat32Exp( a ); 2954 aSign = extractFloat32Sign( a ); 2955 if ( aExp == 0xFF ) { 2956 if (aSig) { 2957 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 2958 } 2959 return packFloat128( aSign, 0x7FFF, 0, 0 ); 2960 } 2961 if ( aExp == 0 ) { 2962 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 2963 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2964 --aExp; 2965 } 2966 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 2967 2968 } 2969 2970 /*---------------------------------------------------------------------------- 2971 | Returns the remainder of the single-precision floating-point value `a' 2972 | with respect to the corresponding value `b'. The operation is performed 2973 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2974 *----------------------------------------------------------------------------*/ 2975 2976 float32 float32_rem(float32 a, float32 b, float_status *status) 2977 { 2978 flag aSign, zSign; 2979 int aExp, bExp, expDiff; 2980 uint32_t aSig, bSig; 2981 uint32_t q; 2982 uint64_t aSig64, bSig64, q64; 2983 uint32_t alternateASig; 2984 int32_t sigMean; 2985 a = float32_squash_input_denormal(a, status); 2986 b = float32_squash_input_denormal(b, status); 2987 2988 aSig = extractFloat32Frac( a ); 2989 aExp = extractFloat32Exp( a ); 2990 aSign = extractFloat32Sign( a ); 2991 bSig = extractFloat32Frac( b ); 2992 bExp = extractFloat32Exp( b ); 2993 if ( aExp == 0xFF ) { 2994 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 2995 return propagateFloat32NaN(a, b, status); 2996 } 2997 float_raise(float_flag_invalid, status); 2998 return float32_default_nan(status); 2999 } 3000 if ( bExp == 0xFF ) { 3001 if (bSig) { 3002 return propagateFloat32NaN(a, b, status); 3003 } 3004 return a; 3005 } 3006 if ( bExp == 0 ) { 3007 if ( bSig == 0 ) { 3008 float_raise(float_flag_invalid, status); 3009 return float32_default_nan(status); 3010 } 3011 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 3012 } 3013 if ( aExp == 0 ) { 3014 if ( aSig == 0 ) return a; 3015 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3016 } 3017 expDiff = aExp - bExp; 3018 aSig |= 0x00800000; 3019 bSig |= 0x00800000; 3020 if ( expDiff < 32 ) { 3021 aSig <<= 8; 3022 bSig <<= 8; 3023 if ( expDiff < 0 ) { 3024 if ( expDiff < -1 ) return a; 3025 aSig >>= 1; 3026 } 3027 q = ( bSig <= aSig ); 3028 if ( q ) aSig -= bSig; 3029 if ( 0 < expDiff ) { 3030 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 3031 q >>= 32 - expDiff; 3032 bSig >>= 2; 3033 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3034 } 3035 else { 3036 aSig >>= 2; 3037 bSig >>= 2; 3038 } 3039 } 3040 else { 3041 if ( bSig <= aSig ) aSig -= bSig; 3042 aSig64 = ( (uint64_t) aSig )<<40; 3043 bSig64 = ( (uint64_t) bSig )<<40; 3044 expDiff -= 64; 3045 while ( 0 < expDiff ) { 
3046 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 3047 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 3048 aSig64 = - ( ( bSig * q64 )<<38 ); 3049 expDiff -= 62; 3050 } 3051 expDiff += 64; 3052 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 3053 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 3054 q = q64>>( 64 - expDiff ); 3055 bSig <<= 6; 3056 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 3057 } 3058 do { 3059 alternateASig = aSig; 3060 ++q; 3061 aSig -= bSig; 3062 } while ( 0 <= (int32_t) aSig ); 3063 sigMean = aSig + alternateASig; 3064 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3065 aSig = alternateASig; 3066 } 3067 zSign = ( (int32_t) aSig < 0 ); 3068 if ( zSign ) aSig = - aSig; 3069 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 3070 } 3071 3072 3073 /*---------------------------------------------------------------------------- 3074 | Returns the square root of the single-precision floating-point value `a'. 3075 | The operation is performed according to the IEC/IEEE Standard for Binary 3076 | Floating-Point Arithmetic. 3077 *----------------------------------------------------------------------------*/ 3078 3079 float32 float32_sqrt(float32 a, float_status *status) 3080 { 3081 flag aSign; 3082 int aExp, zExp; 3083 uint32_t aSig, zSig; 3084 uint64_t rem, term; 3085 a = float32_squash_input_denormal(a, status); 3086 3087 aSig = extractFloat32Frac( a ); 3088 aExp = extractFloat32Exp( a ); 3089 aSign = extractFloat32Sign( a ); 3090 if ( aExp == 0xFF ) { 3091 if (aSig) { 3092 return propagateFloat32NaN(a, float32_zero, status); 3093 } 3094 if ( ! 
aSign ) return a; 3095 float_raise(float_flag_invalid, status); 3096 return float32_default_nan(status); 3097 } 3098 if ( aSign ) { 3099 if ( ( aExp | aSig ) == 0 ) return a; 3100 float_raise(float_flag_invalid, status); 3101 return float32_default_nan(status); 3102 } 3103 if ( aExp == 0 ) { 3104 if ( aSig == 0 ) return float32_zero; 3105 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3106 } 3107 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 3108 aSig = ( aSig | 0x00800000 )<<8; 3109 zSig = estimateSqrt32( aExp, aSig ) + 2; 3110 if ( ( zSig & 0x7F ) <= 5 ) { 3111 if ( zSig < 2 ) { 3112 zSig = 0x7FFFFFFF; 3113 goto roundAndPack; 3114 } 3115 aSig >>= aExp & 1; 3116 term = ( (uint64_t) zSig ) * zSig; 3117 rem = ( ( (uint64_t) aSig )<<32 ) - term; 3118 while ( (int64_t) rem < 0 ) { 3119 --zSig; 3120 rem += ( ( (uint64_t) zSig )<<1 ) | 1; 3121 } 3122 zSig |= ( rem != 0 ); 3123 } 3124 shift32RightJamming( zSig, 1, &zSig ); 3125 roundAndPack: 3126 return roundAndPackFloat32(0, zExp, zSig, status); 3127 3128 } 3129 3130 /*---------------------------------------------------------------------------- 3131 | Returns the binary exponential of the single-precision floating-point value 3132 | `a'. The operation is performed according to the IEC/IEEE Standard for 3133 | Binary Floating-Point Arithmetic. 3134 | 3135 | Uses the following identities: 3136 | 3137 | 1. ------------------------------------------------------------------------- 3138 | x x*ln(2) 3139 | 2 = e 3140 | 3141 | 2. ------------------------------------------------------------------------- 3142 | 2 3 4 5 n 3143 | x x x x x x x 3144 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 3145 | 1! 2! 3! 4! 5! n! 
3146 *----------------------------------------------------------------------------*/ 3147 3148 static const float64 float32_exp2_coefficients[15] = 3149 { 3150 const_float64( 0x3ff0000000000000ll ), /* 1 */ 3151 const_float64( 0x3fe0000000000000ll ), /* 2 */ 3152 const_float64( 0x3fc5555555555555ll ), /* 3 */ 3153 const_float64( 0x3fa5555555555555ll ), /* 4 */ 3154 const_float64( 0x3f81111111111111ll ), /* 5 */ 3155 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 3156 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 3157 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 3158 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 3159 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 3160 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 3161 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 3162 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 3163 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 3164 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 3165 }; 3166 3167 float32 float32_exp2(float32 a, float_status *status) 3168 { 3169 flag aSign; 3170 int aExp; 3171 uint32_t aSig; 3172 float64 r, x, xn; 3173 int i; 3174 a = float32_squash_input_denormal(a, status); 3175 3176 aSig = extractFloat32Frac( a ); 3177 aExp = extractFloat32Exp( a ); 3178 aSign = extractFloat32Sign( a ); 3179 3180 if ( aExp == 0xFF) { 3181 if (aSig) { 3182 return propagateFloat32NaN(a, float32_zero, status); 3183 } 3184 return (aSign) ? 
float32_zero : a; 3185 } 3186 if (aExp == 0) { 3187 if (aSig == 0) return float32_one; 3188 } 3189 3190 float_raise(float_flag_inexact, status); 3191 3192 /* ******************************* */ 3193 /* using float64 for approximation */ 3194 /* ******************************* */ 3195 x = float32_to_float64(a, status); 3196 x = float64_mul(x, float64_ln2, status); 3197 3198 xn = x; 3199 r = float64_one; 3200 for (i = 0 ; i < 15 ; i++) { 3201 float64 f; 3202 3203 f = float64_mul(xn, float32_exp2_coefficients[i], status); 3204 r = float64_add(r, f, status); 3205 3206 xn = float64_mul(xn, x, status); 3207 } 3208 3209 return float64_to_float32(r, status); 3210 } 3211 3212 /*---------------------------------------------------------------------------- 3213 | Returns the binary log of the single-precision floating-point value `a'. 3214 | The operation is performed according to the IEC/IEEE Standard for Binary 3215 | Floating-Point Arithmetic. 3216 *----------------------------------------------------------------------------*/ 3217 float32 float32_log2(float32 a, float_status *status) 3218 { 3219 flag aSign, zSign; 3220 int aExp; 3221 uint32_t aSig, zSig, i; 3222 3223 a = float32_squash_input_denormal(a, status); 3224 aSig = extractFloat32Frac( a ); 3225 aExp = extractFloat32Exp( a ); 3226 aSign = extractFloat32Sign( a ); 3227 3228 if ( aExp == 0 ) { 3229 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 3230 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3231 } 3232 if ( aSign ) { 3233 float_raise(float_flag_invalid, status); 3234 return float32_default_nan(status); 3235 } 3236 if ( aExp == 0xFF ) { 3237 if (aSig) { 3238 return propagateFloat32NaN(a, float32_zero, status); 3239 } 3240 return a; 3241 } 3242 3243 aExp -= 0x7F; 3244 aSig |= 0x00800000; 3245 zSign = aExp < 0; 3246 zSig = aExp << 23; 3247 3248 for (i = 1 << 22; i > 0; i >>= 1) { 3249 aSig = ( (uint64_t)aSig * aSig ) >> 23; 3250 if ( aSig & 0x01000000 ) { 3251 aSig >>= 1; 3252 zSig |= i; 3253 } 3254 } 3255 
3256 if ( zSign ) 3257 zSig = -zSig; 3258 3259 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 3260 } 3261 3262 /*---------------------------------------------------------------------------- 3263 | Returns 1 if the single-precision floating-point value `a' is equal to 3264 | the corresponding value `b', and 0 otherwise. The invalid exception is 3265 | raised if either operand is a NaN. Otherwise, the comparison is performed 3266 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3267 *----------------------------------------------------------------------------*/ 3268 3269 int float32_eq(float32 a, float32 b, float_status *status) 3270 { 3271 uint32_t av, bv; 3272 a = float32_squash_input_denormal(a, status); 3273 b = float32_squash_input_denormal(b, status); 3274 3275 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3276 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3277 ) { 3278 float_raise(float_flag_invalid, status); 3279 return 0; 3280 } 3281 av = float32_val(a); 3282 bv = float32_val(b); 3283 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3284 } 3285 3286 /*---------------------------------------------------------------------------- 3287 | Returns 1 if the single-precision floating-point value `a' is less than 3288 | or equal to the corresponding value `b', and 0 otherwise. The invalid 3289 | exception is raised if either operand is a NaN. The comparison is performed 3290 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3291 *----------------------------------------------------------------------------*/ 3292 3293 int float32_le(float32 a, float32 b, float_status *status) 3294 { 3295 flag aSign, bSign; 3296 uint32_t av, bv; 3297 a = float32_squash_input_denormal(a, status); 3298 b = float32_squash_input_denormal(b, status); 3299 3300 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3301 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3302 ) { 3303 float_raise(float_flag_invalid, status); 3304 return 0; 3305 } 3306 aSign = extractFloat32Sign( a ); 3307 bSign = extractFloat32Sign( b ); 3308 av = float32_val(a); 3309 bv = float32_val(b); 3310 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3311 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3312 3313 } 3314 3315 /*---------------------------------------------------------------------------- 3316 | Returns 1 if the single-precision floating-point value `a' is less than 3317 | the corresponding value `b', and 0 otherwise. The invalid exception is 3318 | raised if either operand is a NaN. The comparison is performed according 3319 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3320 *----------------------------------------------------------------------------*/ 3321 3322 int float32_lt(float32 a, float32 b, float_status *status) 3323 { 3324 flag aSign, bSign; 3325 uint32_t av, bv; 3326 a = float32_squash_input_denormal(a, status); 3327 b = float32_squash_input_denormal(b, status); 3328 3329 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3330 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3331 ) { 3332 float_raise(float_flag_invalid, status); 3333 return 0; 3334 } 3335 aSign = extractFloat32Sign( a ); 3336 bSign = extractFloat32Sign( b ); 3337 av = float32_val(a); 3338 bv = float32_val(b); 3339 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3340 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3341 3342 } 3343 3344 /*---------------------------------------------------------------------------- 3345 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3346 | be compared, and 0 otherwise. The invalid exception is raised if either 3347 | operand is a NaN. The comparison is performed according to the IEC/IEEE 3348 | Standard for Binary Floating-Point Arithmetic. 3349 *----------------------------------------------------------------------------*/ 3350 3351 int float32_unordered(float32 a, float32 b, float_status *status) 3352 { 3353 a = float32_squash_input_denormal(a, status); 3354 b = float32_squash_input_denormal(b, status); 3355 3356 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3357 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3358 ) { 3359 float_raise(float_flag_invalid, status); 3360 return 1; 3361 } 3362 return 0; 3363 } 3364 3365 /*---------------------------------------------------------------------------- 3366 | Returns 1 if the single-precision floating-point value `a' is equal to 3367 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3368 | exception. 
The comparison is performed according to the IEC/IEEE Standard 3369 | for Binary Floating-Point Arithmetic. 3370 *----------------------------------------------------------------------------*/ 3371 3372 int float32_eq_quiet(float32 a, float32 b, float_status *status) 3373 { 3374 a = float32_squash_input_denormal(a, status); 3375 b = float32_squash_input_denormal(b, status); 3376 3377 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3378 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3379 ) { 3380 if (float32_is_signaling_nan(a, status) 3381 || float32_is_signaling_nan(b, status)) { 3382 float_raise(float_flag_invalid, status); 3383 } 3384 return 0; 3385 } 3386 return ( float32_val(a) == float32_val(b) ) || 3387 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 3388 } 3389 3390 /*---------------------------------------------------------------------------- 3391 | Returns 1 if the single-precision floating-point value `a' is less than or 3392 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3393 | cause an exception. Otherwise, the comparison is performed according to the 3394 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3395 *----------------------------------------------------------------------------*/ 3396 3397 int float32_le_quiet(float32 a, float32 b, float_status *status) 3398 { 3399 flag aSign, bSign; 3400 uint32_t av, bv; 3401 a = float32_squash_input_denormal(a, status); 3402 b = float32_squash_input_denormal(b, status); 3403 3404 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3405 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3406 ) { 3407 if (float32_is_signaling_nan(a, status) 3408 || float32_is_signaling_nan(b, status)) { 3409 float_raise(float_flag_invalid, status); 3410 } 3411 return 0; 3412 } 3413 aSign = extractFloat32Sign( a ); 3414 bSign = extractFloat32Sign( b ); 3415 av = float32_val(a); 3416 bv = float32_val(b); 3417 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3418 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3419 3420 } 3421 3422 /*---------------------------------------------------------------------------- 3423 | Returns 1 if the single-precision floating-point value `a' is less than 3424 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3425 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 3426 | Standard for Binary Floating-Point Arithmetic. 
3427 *----------------------------------------------------------------------------*/ 3428 3429 int float32_lt_quiet(float32 a, float32 b, float_status *status) 3430 { 3431 flag aSign, bSign; 3432 uint32_t av, bv; 3433 a = float32_squash_input_denormal(a, status); 3434 b = float32_squash_input_denormal(b, status); 3435 3436 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3437 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3438 ) { 3439 if (float32_is_signaling_nan(a, status) 3440 || float32_is_signaling_nan(b, status)) { 3441 float_raise(float_flag_invalid, status); 3442 } 3443 return 0; 3444 } 3445 aSign = extractFloat32Sign( a ); 3446 bSign = extractFloat32Sign( b ); 3447 av = float32_val(a); 3448 bv = float32_val(b); 3449 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3450 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3451 3452 } 3453 3454 /*---------------------------------------------------------------------------- 3455 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3456 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 3457 | comparison is performed according to the IEC/IEEE Standard for Binary 3458 | Floating-Point Arithmetic. 
3459 *----------------------------------------------------------------------------*/ 3460 3461 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 3462 { 3463 a = float32_squash_input_denormal(a, status); 3464 b = float32_squash_input_denormal(b, status); 3465 3466 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3467 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3468 ) { 3469 if (float32_is_signaling_nan(a, status) 3470 || float32_is_signaling_nan(b, status)) { 3471 float_raise(float_flag_invalid, status); 3472 } 3473 return 1; 3474 } 3475 return 0; 3476 } 3477 3478 3479 /*---------------------------------------------------------------------------- 3480 | Returns the result of converting the double-precision floating-point value 3481 | `a' to the single-precision floating-point format. The conversion is 3482 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3483 | Arithmetic. 3484 *----------------------------------------------------------------------------*/ 3485 3486 float32 float64_to_float32(float64 a, float_status *status) 3487 { 3488 flag aSign; 3489 int aExp; 3490 uint64_t aSig; 3491 uint32_t zSig; 3492 a = float64_squash_input_denormal(a, status); 3493 3494 aSig = extractFloat64Frac( a ); 3495 aExp = extractFloat64Exp( a ); 3496 aSign = extractFloat64Sign( a ); 3497 if ( aExp == 0x7FF ) { 3498 if (aSig) { 3499 return commonNaNToFloat32(float64ToCommonNaN(a, status), status); 3500 } 3501 return packFloat32( aSign, 0xFF, 0 ); 3502 } 3503 shift64RightJamming( aSig, 22, &aSig ); 3504 zSig = aSig; 3505 if ( aExp || zSig ) { 3506 zSig |= 0x40000000; 3507 aExp -= 0x381; 3508 } 3509 return roundAndPackFloat32(aSign, aExp, zSig, status); 3510 3511 } 3512 3513 3514 /*---------------------------------------------------------------------------- 3515 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3516 | half-precision floating-point value, returning the 
result. After being 3517 | shifted into the proper positions, the three fields are simply added 3518 | together to form the result. This means that any integer portion of `zSig' 3519 | will be added into the exponent. Since a properly normalized significand 3520 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3521 | than the desired result exponent whenever `zSig' is a complete, normalized 3522 | significand. 3523 *----------------------------------------------------------------------------*/ 3524 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig) 3525 { 3526 return make_float16( 3527 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig); 3528 } 3529 3530 /*---------------------------------------------------------------------------- 3531 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3532 | and significand `zSig', and returns the proper half-precision floating- 3533 | point value corresponding to the abstract input. Ordinarily, the abstract 3534 | value is simply rounded and packed into the half-precision format, with 3535 | the inexact exception raised if the abstract input cannot be represented 3536 | exactly. However, if the abstract value is too large, the overflow and 3537 | inexact exceptions are raised and an infinity or maximal finite value is 3538 | returned. If the abstract value is too small, the input value is rounded to 3539 | a subnormal number, and the underflow and inexact exceptions are raised if 3540 | the abstract input cannot be represented exactly as a subnormal half- 3541 | precision floating-point number. 3542 | The `ieee' flag indicates whether to use IEEE standard half precision, or 3543 | ARM-style "alternative representation", which omits the NaN and Inf 3544 | encodings in order to raise the maximum representable exponent by one. 
3545 | The input significand `zSig' has its binary point between bits 22 3546 | and 23, which is 13 bits to the left of the usual location. This shifted 3547 | significand must be normalized or smaller. If `zSig' is not normalized, 3548 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3549 | and it must not require rounding. In the usual case that `zSig' is 3550 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3551 | Note the slightly odd position of the binary point in zSig compared with the 3552 | other roundAndPackFloat functions. This should probably be fixed if we 3553 | need to implement more float16 routines than just conversion. 3554 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3555 | Binary Floating-Point Arithmetic. 3556 *----------------------------------------------------------------------------*/ 3557 3558 static float16 roundAndPackFloat16(flag zSign, int zExp, 3559 uint32_t zSig, flag ieee, 3560 float_status *status) 3561 { 3562 int maxexp = ieee ? 29 : 30; 3563 uint32_t mask; 3564 uint32_t increment; 3565 bool rounding_bumps_exp; 3566 bool is_tiny = false; 3567 3568 /* Calculate the mask of bits of the mantissa which are not 3569 * representable in half-precision and will be lost. 3570 */ 3571 if (zExp < 1) { 3572 /* Will be denormal in halfprec */ 3573 mask = 0x00ffffff; 3574 if (zExp >= -11) { 3575 mask >>= 11 + zExp; 3576 } 3577 } else { 3578 /* Normal number in halfprec */ 3579 mask = 0x00001fff; 3580 } 3581 3582 switch (status->float_rounding_mode) { 3583 case float_round_nearest_even: 3584 increment = (mask + 1) >> 1; 3585 if ((zSig & mask) == increment) { 3586 increment = zSig & (increment << 1); 3587 } 3588 break; 3589 case float_round_ties_away: 3590 increment = (mask + 1) >> 1; 3591 break; 3592 case float_round_up: 3593 increment = zSign ? 0 : mask; 3594 break; 3595 case float_round_down: 3596 increment = zSign ? 
mask : 0; 3597 break; 3598 default: /* round_to_zero */ 3599 increment = 0; 3600 break; 3601 } 3602 3603 rounding_bumps_exp = (zSig + increment >= 0x01000000); 3604 3605 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) { 3606 if (ieee) { 3607 float_raise(float_flag_overflow | float_flag_inexact, status); 3608 return packFloat16(zSign, 0x1f, 0); 3609 } else { 3610 float_raise(float_flag_invalid, status); 3611 return packFloat16(zSign, 0x1f, 0x3ff); 3612 } 3613 } 3614 3615 if (zExp < 0) { 3616 /* Note that flush-to-zero does not affect half-precision results */ 3617 is_tiny = 3618 (status->float_detect_tininess == float_tininess_before_rounding) 3619 || (zExp < -1) 3620 || (!rounding_bumps_exp); 3621 } 3622 if (zSig & mask) { 3623 float_raise(float_flag_inexact, status); 3624 if (is_tiny) { 3625 float_raise(float_flag_underflow, status); 3626 } 3627 } 3628 3629 zSig += increment; 3630 if (rounding_bumps_exp) { 3631 zSig >>= 1; 3632 zExp++; 3633 } 3634 3635 if (zExp < -10) { 3636 return packFloat16(zSign, 0, 0); 3637 } 3638 if (zExp < 0) { 3639 zSig >>= -zExp; 3640 zExp = 0; 3641 } 3642 return packFloat16(zSign, zExp, zSig >> 13); 3643 } 3644 3645 /*---------------------------------------------------------------------------- 3646 | If `a' is denormal and we are in flush-to-zero mode then set the 3647 | input-denormal exception and return zero. Otherwise just return the value. 
3648 *----------------------------------------------------------------------------*/ 3649 float16 float16_squash_input_denormal(float16 a, float_status *status) 3650 { 3651 if (status->flush_inputs_to_zero) { 3652 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) { 3653 float_raise(float_flag_input_denormal, status); 3654 return make_float16(float16_val(a) & 0x8000); 3655 } 3656 } 3657 return a; 3658 } 3659 3660 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr, 3661 uint32_t *zSigPtr) 3662 { 3663 int8_t shiftCount = countLeadingZeros32(aSig) - 21; 3664 *zSigPtr = aSig << shiftCount; 3665 *zExpPtr = 1 - shiftCount; 3666 } 3667 3668 /* Half precision floats come in two formats: standard IEEE and "ARM" format. 3669 The latter gains extra exponent range by omitting the NaN/Inf encodings. */ 3670 3671 float32 float16_to_float32(float16 a, flag ieee, float_status *status) 3672 { 3673 flag aSign; 3674 int aExp; 3675 uint32_t aSig; 3676 3677 aSign = extractFloat16Sign(a); 3678 aExp = extractFloat16Exp(a); 3679 aSig = extractFloat16Frac(a); 3680 3681 if (aExp == 0x1f && ieee) { 3682 if (aSig) { 3683 return commonNaNToFloat32(float16ToCommonNaN(a, status), status); 3684 } 3685 return packFloat32(aSign, 0xff, 0); 3686 } 3687 if (aExp == 0) { 3688 if (aSig == 0) { 3689 return packFloat32(aSign, 0, 0); 3690 } 3691 3692 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3693 aExp--; 3694 } 3695 return packFloat32( aSign, aExp + 0x70, aSig << 13); 3696 } 3697 3698 float16 float32_to_float16(float32 a, flag ieee, float_status *status) 3699 { 3700 flag aSign; 3701 int aExp; 3702 uint32_t aSig; 3703 3704 a = float32_squash_input_denormal(a, status); 3705 3706 aSig = extractFloat32Frac( a ); 3707 aExp = extractFloat32Exp( a ); 3708 aSign = extractFloat32Sign( a ); 3709 if ( aExp == 0xFF ) { 3710 if (aSig) { 3711 /* Input is a NaN */ 3712 if (!ieee) { 3713 float_raise(float_flag_invalid, status); 3714 return packFloat16(aSign, 0, 0); 3715 } 3716 return 
commonNaNToFloat16( 3717 float32ToCommonNaN(a, status), status); 3718 } 3719 /* Infinity */ 3720 if (!ieee) { 3721 float_raise(float_flag_invalid, status); 3722 return packFloat16(aSign, 0x1f, 0x3ff); 3723 } 3724 return packFloat16(aSign, 0x1f, 0); 3725 } 3726 if (aExp == 0 && aSig == 0) { 3727 return packFloat16(aSign, 0, 0); 3728 } 3729 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 3730 * even if the input is denormal; however this is harmless because 3731 * the largest possible single-precision denormal is still smaller 3732 * than the smallest representable half-precision denormal, and so we 3733 * will end up ignoring aSig and returning via the "always return zero" 3734 * codepath. 3735 */ 3736 aSig |= 0x00800000; 3737 aExp -= 0x71; 3738 3739 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status); 3740 } 3741 3742 float64 float16_to_float64(float16 a, flag ieee, float_status *status) 3743 { 3744 flag aSign; 3745 int aExp; 3746 uint32_t aSig; 3747 3748 aSign = extractFloat16Sign(a); 3749 aExp = extractFloat16Exp(a); 3750 aSig = extractFloat16Frac(a); 3751 3752 if (aExp == 0x1f && ieee) { 3753 if (aSig) { 3754 return commonNaNToFloat64( 3755 float16ToCommonNaN(a, status), status); 3756 } 3757 return packFloat64(aSign, 0x7ff, 0); 3758 } 3759 if (aExp == 0) { 3760 if (aSig == 0) { 3761 return packFloat64(aSign, 0, 0); 3762 } 3763 3764 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3765 aExp--; 3766 } 3767 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42); 3768 } 3769 3770 float16 float64_to_float16(float64 a, flag ieee, float_status *status) 3771 { 3772 flag aSign; 3773 int aExp; 3774 uint64_t aSig; 3775 uint32_t zSig; 3776 3777 a = float64_squash_input_denormal(a, status); 3778 3779 aSig = extractFloat64Frac(a); 3780 aExp = extractFloat64Exp(a); 3781 aSign = extractFloat64Sign(a); 3782 if (aExp == 0x7FF) { 3783 if (aSig) { 3784 /* Input is a NaN */ 3785 if (!ieee) { 3786 float_raise(float_flag_invalid, status); 3787 return 
packFloat16(aSign, 0, 0); 3788 } 3789 return commonNaNToFloat16( 3790 float64ToCommonNaN(a, status), status); 3791 } 3792 /* Infinity */ 3793 if (!ieee) { 3794 float_raise(float_flag_invalid, status); 3795 return packFloat16(aSign, 0x1f, 0x3ff); 3796 } 3797 return packFloat16(aSign, 0x1f, 0); 3798 } 3799 shift64RightJamming(aSig, 29, &aSig); 3800 zSig = aSig; 3801 if (aExp == 0 && zSig == 0) { 3802 return packFloat16(aSign, 0, 0); 3803 } 3804 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 3805 * even if the input is denormal; however this is harmless because 3806 * the largest possible single-precision denormal is still smaller 3807 * than the smallest representable half-precision denormal, and so we 3808 * will end up ignoring aSig and returning via the "always return zero" 3809 * codepath. 3810 */ 3811 zSig |= 0x00800000; 3812 aExp -= 0x3F1; 3813 3814 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status); 3815 } 3816 3817 /*---------------------------------------------------------------------------- 3818 | Returns the result of converting the double-precision floating-point value 3819 | `a' to the extended double-precision floating-point format. The conversion 3820 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3821 | Arithmetic. 
3822 *----------------------------------------------------------------------------*/ 3823 3824 floatx80 float64_to_floatx80(float64 a, float_status *status) 3825 { 3826 flag aSign; 3827 int aExp; 3828 uint64_t aSig; 3829 3830 a = float64_squash_input_denormal(a, status); 3831 aSig = extractFloat64Frac( a ); 3832 aExp = extractFloat64Exp( a ); 3833 aSign = extractFloat64Sign( a ); 3834 if ( aExp == 0x7FF ) { 3835 if (aSig) { 3836 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 3837 } 3838 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3839 } 3840 if ( aExp == 0 ) { 3841 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 3842 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3843 } 3844 return 3845 packFloatx80( 3846 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 3847 3848 } 3849 3850 /*---------------------------------------------------------------------------- 3851 | Returns the result of converting the double-precision floating-point value 3852 | `a' to the quadruple-precision floating-point format. The conversion is 3853 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3854 | Arithmetic. 
3855 *----------------------------------------------------------------------------*/ 3856 3857 float128 float64_to_float128(float64 a, float_status *status) 3858 { 3859 flag aSign; 3860 int aExp; 3861 uint64_t aSig, zSig0, zSig1; 3862 3863 a = float64_squash_input_denormal(a, status); 3864 aSig = extractFloat64Frac( a ); 3865 aExp = extractFloat64Exp( a ); 3866 aSign = extractFloat64Sign( a ); 3867 if ( aExp == 0x7FF ) { 3868 if (aSig) { 3869 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 3870 } 3871 return packFloat128( aSign, 0x7FFF, 0, 0 ); 3872 } 3873 if ( aExp == 0 ) { 3874 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 3875 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3876 --aExp; 3877 } 3878 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 3879 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 3880 3881 } 3882 3883 3884 /*---------------------------------------------------------------------------- 3885 | Returns the remainder of the double-precision floating-point value `a' 3886 | with respect to the corresponding value `b'. The operation is performed 3887 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3888 *----------------------------------------------------------------------------*/ 3889 3890 float64 float64_rem(float64 a, float64 b, float_status *status) 3891 { 3892 flag aSign, zSign; 3893 int aExp, bExp, expDiff; 3894 uint64_t aSig, bSig; 3895 uint64_t q, alternateASig; 3896 int64_t sigMean; 3897 3898 a = float64_squash_input_denormal(a, status); 3899 b = float64_squash_input_denormal(b, status); 3900 aSig = extractFloat64Frac( a ); 3901 aExp = extractFloat64Exp( a ); 3902 aSign = extractFloat64Sign( a ); 3903 bSig = extractFloat64Frac( b ); 3904 bExp = extractFloat64Exp( b ); 3905 if ( aExp == 0x7FF ) { 3906 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 3907 return propagateFloat64NaN(a, b, status); 3908 } 3909 float_raise(float_flag_invalid, status); 3910 return float64_default_nan(status); 3911 } 3912 if ( bExp == 0x7FF ) { 3913 if (bSig) { 3914 return propagateFloat64NaN(a, b, status); 3915 } 3916 return a; 3917 } 3918 if ( bExp == 0 ) { 3919 if ( bSig == 0 ) { 3920 float_raise(float_flag_invalid, status); 3921 return float64_default_nan(status); 3922 } 3923 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3924 } 3925 if ( aExp == 0 ) { 3926 if ( aSig == 0 ) return a; 3927 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3928 } 3929 expDiff = aExp - bExp; 3930 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 3931 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3932 if ( expDiff < 0 ) { 3933 if ( expDiff < -1 ) return a; 3934 aSig >>= 1; 3935 } 3936 q = ( bSig <= aSig ); 3937 if ( q ) aSig -= bSig; 3938 expDiff -= 64; 3939 while ( 0 < expDiff ) { 3940 q = estimateDiv128To64( aSig, 0, bSig ); 3941 q = ( 2 < q ) ? q - 2 : 0; 3942 aSig = - ( ( bSig>>2 ) * q ); 3943 expDiff -= 62; 3944 } 3945 expDiff += 64; 3946 if ( 0 < expDiff ) { 3947 q = estimateDiv128To64( aSig, 0, bSig ); 3948 q = ( 2 < q ) ? 
q - 2 : 0; 3949 q >>= 64 - expDiff; 3950 bSig >>= 2; 3951 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3952 } 3953 else { 3954 aSig >>= 2; 3955 bSig >>= 2; 3956 } 3957 do { 3958 alternateASig = aSig; 3959 ++q; 3960 aSig -= bSig; 3961 } while ( 0 <= (int64_t) aSig ); 3962 sigMean = aSig + alternateASig; 3963 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3964 aSig = alternateASig; 3965 } 3966 zSign = ( (int64_t) aSig < 0 ); 3967 if ( zSign ) aSig = - aSig; 3968 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 3969 3970 } 3971 3972 3973 /*---------------------------------------------------------------------------- 3974 | Returns the square root of the double-precision floating-point value `a'. 3975 | The operation is performed according to the IEC/IEEE Standard for Binary 3976 | Floating-Point Arithmetic. 3977 *----------------------------------------------------------------------------*/ 3978 3979 float64 float64_sqrt(float64 a, float_status *status) 3980 { 3981 flag aSign; 3982 int aExp, zExp; 3983 uint64_t aSig, zSig, doubleZSig; 3984 uint64_t rem0, rem1, term0, term1; 3985 a = float64_squash_input_denormal(a, status); 3986 3987 aSig = extractFloat64Frac( a ); 3988 aExp = extractFloat64Exp( a ); 3989 aSign = extractFloat64Sign( a ); 3990 if ( aExp == 0x7FF ) { 3991 if (aSig) { 3992 return propagateFloat64NaN(a, a, status); 3993 } 3994 if ( ! 
aSign ) return a; 3995 float_raise(float_flag_invalid, status); 3996 return float64_default_nan(status); 3997 } 3998 if ( aSign ) { 3999 if ( ( aExp | aSig ) == 0 ) return a; 4000 float_raise(float_flag_invalid, status); 4001 return float64_default_nan(status); 4002 } 4003 if ( aExp == 0 ) { 4004 if ( aSig == 0 ) return float64_zero; 4005 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4006 } 4007 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 4008 aSig |= LIT64( 0x0010000000000000 ); 4009 zSig = estimateSqrt32( aExp, aSig>>21 ); 4010 aSig <<= 9 - ( aExp & 1 ); 4011 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 4012 if ( ( zSig & 0x1FF ) <= 5 ) { 4013 doubleZSig = zSig<<1; 4014 mul64To128( zSig, zSig, &term0, &term1 ); 4015 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 4016 while ( (int64_t) rem0 < 0 ) { 4017 --zSig; 4018 doubleZSig -= 2; 4019 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 4020 } 4021 zSig |= ( ( rem0 | rem1 ) != 0 ); 4022 } 4023 return roundAndPackFloat64(0, zExp, zSig, status); 4024 4025 } 4026 4027 /*---------------------------------------------------------------------------- 4028 | Returns the binary log of the double-precision floating-point value `a'. 4029 | The operation is performed according to the IEC/IEEE Standard for Binary 4030 | Floating-Point Arithmetic. 
4031 *----------------------------------------------------------------------------*/ 4032 float64 float64_log2(float64 a, float_status *status) 4033 { 4034 flag aSign, zSign; 4035 int aExp; 4036 uint64_t aSig, aSig0, aSig1, zSig, i; 4037 a = float64_squash_input_denormal(a, status); 4038 4039 aSig = extractFloat64Frac( a ); 4040 aExp = extractFloat64Exp( a ); 4041 aSign = extractFloat64Sign( a ); 4042 4043 if ( aExp == 0 ) { 4044 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4045 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4046 } 4047 if ( aSign ) { 4048 float_raise(float_flag_invalid, status); 4049 return float64_default_nan(status); 4050 } 4051 if ( aExp == 0x7FF ) { 4052 if (aSig) { 4053 return propagateFloat64NaN(a, float64_zero, status); 4054 } 4055 return a; 4056 } 4057 4058 aExp -= 0x3FF; 4059 aSig |= LIT64( 0x0010000000000000 ); 4060 zSign = aExp < 0; 4061 zSig = (uint64_t)aExp << 52; 4062 for (i = 1LL << 51; i > 0; i >>= 1) { 4063 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4064 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4065 if ( aSig & LIT64( 0x0020000000000000 ) ) { 4066 aSig >>= 1; 4067 zSig |= i; 4068 } 4069 } 4070 4071 if ( zSign ) 4072 zSig = -zSig; 4073 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 4074 } 4075 4076 /*---------------------------------------------------------------------------- 4077 | Returns 1 if the double-precision floating-point value `a' is equal to the 4078 | corresponding value `b', and 0 otherwise. The invalid exception is raised 4079 | if either operand is a NaN. Otherwise, the comparison is performed 4080 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4081 *----------------------------------------------------------------------------*/ 4082 4083 int float64_eq(float64 a, float64 b, float_status *status) 4084 { 4085 uint64_t av, bv; 4086 a = float64_squash_input_denormal(a, status); 4087 b = float64_squash_input_denormal(b, status); 4088 4089 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4090 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4091 ) { 4092 float_raise(float_flag_invalid, status); 4093 return 0; 4094 } 4095 av = float64_val(a); 4096 bv = float64_val(b); 4097 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4098 4099 } 4100 4101 /*---------------------------------------------------------------------------- 4102 | Returns 1 if the double-precision floating-point value `a' is less than or 4103 | equal to the corresponding value `b', and 0 otherwise. The invalid 4104 | exception is raised if either operand is a NaN. The comparison is performed 4105 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4106 *----------------------------------------------------------------------------*/ 4107 4108 int float64_le(float64 a, float64 b, float_status *status) 4109 { 4110 flag aSign, bSign; 4111 uint64_t av, bv; 4112 a = float64_squash_input_denormal(a, status); 4113 b = float64_squash_input_denormal(b, status); 4114 4115 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4116 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4117 ) { 4118 float_raise(float_flag_invalid, status); 4119 return 0; 4120 } 4121 aSign = extractFloat64Sign( a ); 4122 bSign = extractFloat64Sign( b ); 4123 av = float64_val(a); 4124 bv = float64_val(b); 4125 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4126 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4127 4128 } 4129 4130 /*---------------------------------------------------------------------------- 4131 | Returns 1 if the double-precision floating-point value `a' is less than 4132 | the corresponding value `b', and 0 otherwise. The invalid exception is 4133 | raised if either operand is a NaN. The comparison is performed according 4134 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4135 *----------------------------------------------------------------------------*/ 4136 4137 int float64_lt(float64 a, float64 b, float_status *status) 4138 { 4139 flag aSign, bSign; 4140 uint64_t av, bv; 4141 4142 a = float64_squash_input_denormal(a, status); 4143 b = float64_squash_input_denormal(b, status); 4144 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4145 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4146 ) { 4147 float_raise(float_flag_invalid, status); 4148 return 0; 4149 } 4150 aSign = extractFloat64Sign( a ); 4151 bSign = extractFloat64Sign( b ); 4152 av = float64_val(a); 4153 bv = float64_val(b); 4154 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4155 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4156 4157 } 4158 4159 /*---------------------------------------------------------------------------- 4160 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4161 | be compared, and 0 otherwise. The invalid exception is raised if either 4162 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4163 | Standard for Binary Floating-Point Arithmetic. 4164 *----------------------------------------------------------------------------*/ 4165 4166 int float64_unordered(float64 a, float64 b, float_status *status) 4167 { 4168 a = float64_squash_input_denormal(a, status); 4169 b = float64_squash_input_denormal(b, status); 4170 4171 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4172 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4173 ) { 4174 float_raise(float_flag_invalid, status); 4175 return 1; 4176 } 4177 return 0; 4178 } 4179 4180 /*---------------------------------------------------------------------------- 4181 | Returns 1 if the double-precision floating-point value `a' is equal to the 4182 | corresponding value `b', and 0 otherwise. 
Quiet NaNs do not cause an 4183 | exception.The comparison is performed according to the IEC/IEEE Standard 4184 | for Binary Floating-Point Arithmetic. 4185 *----------------------------------------------------------------------------*/ 4186 4187 int float64_eq_quiet(float64 a, float64 b, float_status *status) 4188 { 4189 uint64_t av, bv; 4190 a = float64_squash_input_denormal(a, status); 4191 b = float64_squash_input_denormal(b, status); 4192 4193 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4194 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4195 ) { 4196 if (float64_is_signaling_nan(a, status) 4197 || float64_is_signaling_nan(b, status)) { 4198 float_raise(float_flag_invalid, status); 4199 } 4200 return 0; 4201 } 4202 av = float64_val(a); 4203 bv = float64_val(b); 4204 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4205 4206 } 4207 4208 /*---------------------------------------------------------------------------- 4209 | Returns 1 if the double-precision floating-point value `a' is less than or 4210 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4211 | cause an exception. Otherwise, the comparison is performed according to the 4212 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4213 *----------------------------------------------------------------------------*/ 4214 4215 int float64_le_quiet(float64 a, float64 b, float_status *status) 4216 { 4217 flag aSign, bSign; 4218 uint64_t av, bv; 4219 a = float64_squash_input_denormal(a, status); 4220 b = float64_squash_input_denormal(b, status); 4221 4222 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4223 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4224 ) { 4225 if (float64_is_signaling_nan(a, status) 4226 || float64_is_signaling_nan(b, status)) { 4227 float_raise(float_flag_invalid, status); 4228 } 4229 return 0; 4230 } 4231 aSign = extractFloat64Sign( a ); 4232 bSign = extractFloat64Sign( b ); 4233 av = float64_val(a); 4234 bv = float64_val(b); 4235 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4236 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4237 4238 } 4239 4240 /*---------------------------------------------------------------------------- 4241 | Returns 1 if the double-precision floating-point value `a' is less than 4242 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4243 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4244 | Standard for Binary Floating-Point Arithmetic. 
4245 *----------------------------------------------------------------------------*/ 4246 4247 int float64_lt_quiet(float64 a, float64 b, float_status *status) 4248 { 4249 flag aSign, bSign; 4250 uint64_t av, bv; 4251 a = float64_squash_input_denormal(a, status); 4252 b = float64_squash_input_denormal(b, status); 4253 4254 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4255 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4256 ) { 4257 if (float64_is_signaling_nan(a, status) 4258 || float64_is_signaling_nan(b, status)) { 4259 float_raise(float_flag_invalid, status); 4260 } 4261 return 0; 4262 } 4263 aSign = extractFloat64Sign( a ); 4264 bSign = extractFloat64Sign( b ); 4265 av = float64_val(a); 4266 bv = float64_val(b); 4267 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4268 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4269 4270 } 4271 4272 /*---------------------------------------------------------------------------- 4273 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4274 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4275 | comparison is performed according to the IEC/IEEE Standard for Binary 4276 | Floating-Point Arithmetic. 
4277 *----------------------------------------------------------------------------*/ 4278 4279 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 4280 { 4281 a = float64_squash_input_denormal(a, status); 4282 b = float64_squash_input_denormal(b, status); 4283 4284 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4285 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4286 ) { 4287 if (float64_is_signaling_nan(a, status) 4288 || float64_is_signaling_nan(b, status)) { 4289 float_raise(float_flag_invalid, status); 4290 } 4291 return 1; 4292 } 4293 return 0; 4294 } 4295 4296 /*---------------------------------------------------------------------------- 4297 | Returns the result of converting the extended double-precision floating- 4298 | point value `a' to the 32-bit two's complement integer format. The 4299 | conversion is performed according to the IEC/IEEE Standard for Binary 4300 | Floating-Point Arithmetic---which means in particular that the conversion 4301 | is rounded according to the current rounding mode. If `a' is a NaN, the 4302 | largest positive integer is returned. Otherwise, if the conversion 4303 | overflows, the largest integer with the same sign as `a' is returned. 
4304 *----------------------------------------------------------------------------*/ 4305 4306 int32_t floatx80_to_int32(floatx80 a, float_status *status) 4307 { 4308 flag aSign; 4309 int32_t aExp, shiftCount; 4310 uint64_t aSig; 4311 4312 if (floatx80_invalid_encoding(a)) { 4313 float_raise(float_flag_invalid, status); 4314 return 1 << 31; 4315 } 4316 aSig = extractFloatx80Frac( a ); 4317 aExp = extractFloatx80Exp( a ); 4318 aSign = extractFloatx80Sign( a ); 4319 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4320 shiftCount = 0x4037 - aExp; 4321 if ( shiftCount <= 0 ) shiftCount = 1; 4322 shift64RightJamming( aSig, shiftCount, &aSig ); 4323 return roundAndPackInt32(aSign, aSig, status); 4324 4325 } 4326 4327 /*---------------------------------------------------------------------------- 4328 | Returns the result of converting the extended double-precision floating- 4329 | point value `a' to the 32-bit two's complement integer format. The 4330 | conversion is performed according to the IEC/IEEE Standard for Binary 4331 | Floating-Point Arithmetic, except that the conversion is always rounded 4332 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4333 | Otherwise, if the conversion overflows, the largest integer with the same 4334 | sign as `a' is returned. 
4335 *----------------------------------------------------------------------------*/ 4336 4337 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 4338 { 4339 flag aSign; 4340 int32_t aExp, shiftCount; 4341 uint64_t aSig, savedASig; 4342 int32_t z; 4343 4344 if (floatx80_invalid_encoding(a)) { 4345 float_raise(float_flag_invalid, status); 4346 return 1 << 31; 4347 } 4348 aSig = extractFloatx80Frac( a ); 4349 aExp = extractFloatx80Exp( a ); 4350 aSign = extractFloatx80Sign( a ); 4351 if ( 0x401E < aExp ) { 4352 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4353 goto invalid; 4354 } 4355 else if ( aExp < 0x3FFF ) { 4356 if (aExp || aSig) { 4357 status->float_exception_flags |= float_flag_inexact; 4358 } 4359 return 0; 4360 } 4361 shiftCount = 0x403E - aExp; 4362 savedASig = aSig; 4363 aSig >>= shiftCount; 4364 z = aSig; 4365 if ( aSign ) z = - z; 4366 if ( ( z < 0 ) ^ aSign ) { 4367 invalid: 4368 float_raise(float_flag_invalid, status); 4369 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 4370 } 4371 if ( ( aSig<<shiftCount ) != savedASig ) { 4372 status->float_exception_flags |= float_flag_inexact; 4373 } 4374 return z; 4375 4376 } 4377 4378 /*---------------------------------------------------------------------------- 4379 | Returns the result of converting the extended double-precision floating- 4380 | point value `a' to the 64-bit two's complement integer format. The 4381 | conversion is performed according to the IEC/IEEE Standard for Binary 4382 | Floating-Point Arithmetic---which means in particular that the conversion 4383 | is rounded according to the current rounding mode. If `a' is a NaN, 4384 | the largest positive integer is returned. Otherwise, if the conversion 4385 | overflows, the largest integer with the same sign as `a' is returned. 
4386 *----------------------------------------------------------------------------*/ 4387 4388 int64_t floatx80_to_int64(floatx80 a, float_status *status) 4389 { 4390 flag aSign; 4391 int32_t aExp, shiftCount; 4392 uint64_t aSig, aSigExtra; 4393 4394 if (floatx80_invalid_encoding(a)) { 4395 float_raise(float_flag_invalid, status); 4396 return 1ULL << 63; 4397 } 4398 aSig = extractFloatx80Frac( a ); 4399 aExp = extractFloatx80Exp( a ); 4400 aSign = extractFloatx80Sign( a ); 4401 shiftCount = 0x403E - aExp; 4402 if ( shiftCount <= 0 ) { 4403 if ( shiftCount ) { 4404 float_raise(float_flag_invalid, status); 4405 if ( ! aSign 4406 || ( ( aExp == 0x7FFF ) 4407 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 4408 ) { 4409 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4410 } 4411 return (int64_t) LIT64( 0x8000000000000000 ); 4412 } 4413 aSigExtra = 0; 4414 } 4415 else { 4416 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 4417 } 4418 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 4419 4420 } 4421 4422 /*---------------------------------------------------------------------------- 4423 | Returns the result of converting the extended double-precision floating- 4424 | point value `a' to the 64-bit two's complement integer format. The 4425 | conversion is performed according to the IEC/IEEE Standard for Binary 4426 | Floating-Point Arithmetic, except that the conversion is always rounded 4427 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4428 | Otherwise, if the conversion overflows, the largest integer with the same 4429 | sign as `a' is returned. 
4430 *----------------------------------------------------------------------------*/ 4431 4432 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 4433 { 4434 flag aSign; 4435 int32_t aExp, shiftCount; 4436 uint64_t aSig; 4437 int64_t z; 4438 4439 if (floatx80_invalid_encoding(a)) { 4440 float_raise(float_flag_invalid, status); 4441 return 1ULL << 63; 4442 } 4443 aSig = extractFloatx80Frac( a ); 4444 aExp = extractFloatx80Exp( a ); 4445 aSign = extractFloatx80Sign( a ); 4446 shiftCount = aExp - 0x403E; 4447 if ( 0 <= shiftCount ) { 4448 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 4449 if ( ( a.high != 0xC03E ) || aSig ) { 4450 float_raise(float_flag_invalid, status); 4451 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 4452 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4453 } 4454 } 4455 return (int64_t) LIT64( 0x8000000000000000 ); 4456 } 4457 else if ( aExp < 0x3FFF ) { 4458 if (aExp | aSig) { 4459 status->float_exception_flags |= float_flag_inexact; 4460 } 4461 return 0; 4462 } 4463 z = aSig>>( - shiftCount ); 4464 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 4465 status->float_exception_flags |= float_flag_inexact; 4466 } 4467 if ( aSign ) z = - z; 4468 return z; 4469 4470 } 4471 4472 /*---------------------------------------------------------------------------- 4473 | Returns the result of converting the extended double-precision floating- 4474 | point value `a' to the single-precision floating-point format. The 4475 | conversion is performed according to the IEC/IEEE Standard for Binary 4476 | Floating-Point Arithmetic. 
4477 *----------------------------------------------------------------------------*/ 4478 4479 float32 floatx80_to_float32(floatx80 a, float_status *status) 4480 { 4481 flag aSign; 4482 int32_t aExp; 4483 uint64_t aSig; 4484 4485 if (floatx80_invalid_encoding(a)) { 4486 float_raise(float_flag_invalid, status); 4487 return float32_default_nan(status); 4488 } 4489 aSig = extractFloatx80Frac( a ); 4490 aExp = extractFloatx80Exp( a ); 4491 aSign = extractFloatx80Sign( a ); 4492 if ( aExp == 0x7FFF ) { 4493 if ( (uint64_t) ( aSig<<1 ) ) { 4494 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 4495 } 4496 return packFloat32( aSign, 0xFF, 0 ); 4497 } 4498 shift64RightJamming( aSig, 33, &aSig ); 4499 if ( aExp || aSig ) aExp -= 0x3F81; 4500 return roundAndPackFloat32(aSign, aExp, aSig, status); 4501 4502 } 4503 4504 /*---------------------------------------------------------------------------- 4505 | Returns the result of converting the extended double-precision floating- 4506 | point value `a' to the double-precision floating-point format. The 4507 | conversion is performed according to the IEC/IEEE Standard for Binary 4508 | Floating-Point Arithmetic. 
4509 *----------------------------------------------------------------------------*/ 4510 4511 float64 floatx80_to_float64(floatx80 a, float_status *status) 4512 { 4513 flag aSign; 4514 int32_t aExp; 4515 uint64_t aSig, zSig; 4516 4517 if (floatx80_invalid_encoding(a)) { 4518 float_raise(float_flag_invalid, status); 4519 return float64_default_nan(status); 4520 } 4521 aSig = extractFloatx80Frac( a ); 4522 aExp = extractFloatx80Exp( a ); 4523 aSign = extractFloatx80Sign( a ); 4524 if ( aExp == 0x7FFF ) { 4525 if ( (uint64_t) ( aSig<<1 ) ) { 4526 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 4527 } 4528 return packFloat64( aSign, 0x7FF, 0 ); 4529 } 4530 shift64RightJamming( aSig, 1, &zSig ); 4531 if ( aExp || aSig ) aExp -= 0x3C01; 4532 return roundAndPackFloat64(aSign, aExp, zSig, status); 4533 4534 } 4535 4536 /*---------------------------------------------------------------------------- 4537 | Returns the result of converting the extended double-precision floating- 4538 | point value `a' to the quadruple-precision floating-point format. The 4539 | conversion is performed according to the IEC/IEEE Standard for Binary 4540 | Floating-Point Arithmetic. 
4541 *----------------------------------------------------------------------------*/ 4542 4543 float128 floatx80_to_float128(floatx80 a, float_status *status) 4544 { 4545 flag aSign; 4546 int aExp; 4547 uint64_t aSig, zSig0, zSig1; 4548 4549 if (floatx80_invalid_encoding(a)) { 4550 float_raise(float_flag_invalid, status); 4551 return float128_default_nan(status); 4552 } 4553 aSig = extractFloatx80Frac( a ); 4554 aExp = extractFloatx80Exp( a ); 4555 aSign = extractFloatx80Sign( a ); 4556 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 4557 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 4558 } 4559 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 4560 return packFloat128( aSign, aExp, zSig0, zSig1 ); 4561 4562 } 4563 4564 /*---------------------------------------------------------------------------- 4565 | Rounds the extended double-precision floating-point value `a' 4566 | to the precision provided by floatx80_rounding_precision and returns the 4567 | result as an extended double-precision floating-point value. 4568 | The operation is performed according to the IEC/IEEE Standard for Binary 4569 | Floating-Point Arithmetic. 4570 *----------------------------------------------------------------------------*/ 4571 4572 floatx80 floatx80_round(floatx80 a, float_status *status) 4573 { 4574 return roundAndPackFloatx80(status->floatx80_rounding_precision, 4575 extractFloatx80Sign(a), 4576 extractFloatx80Exp(a), 4577 extractFloatx80Frac(a), 0, status); 4578 } 4579 4580 /*---------------------------------------------------------------------------- 4581 | Rounds the extended double-precision floating-point value `a' to an integer, 4582 | and returns the result as an extended quadruple-precision floating-point 4583 | value. The operation is performed according to the IEC/IEEE Standard for 4584 | Binary Floating-Point Arithmetic. 
4585 *----------------------------------------------------------------------------*/ 4586 4587 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 4588 { 4589 flag aSign; 4590 int32_t aExp; 4591 uint64_t lastBitMask, roundBitsMask; 4592 floatx80 z; 4593 4594 if (floatx80_invalid_encoding(a)) { 4595 float_raise(float_flag_invalid, status); 4596 return floatx80_default_nan(status); 4597 } 4598 aExp = extractFloatx80Exp( a ); 4599 if ( 0x403E <= aExp ) { 4600 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 4601 return propagateFloatx80NaN(a, a, status); 4602 } 4603 return a; 4604 } 4605 if ( aExp < 0x3FFF ) { 4606 if ( ( aExp == 0 ) 4607 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 4608 return a; 4609 } 4610 status->float_exception_flags |= float_flag_inexact; 4611 aSign = extractFloatx80Sign( a ); 4612 switch (status->float_rounding_mode) { 4613 case float_round_nearest_even: 4614 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 4615 ) { 4616 return 4617 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4618 } 4619 break; 4620 case float_round_ties_away: 4621 if (aExp == 0x3FFE) { 4622 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 4623 } 4624 break; 4625 case float_round_down: 4626 return 4627 aSign ? 4628 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 4629 : packFloatx80( 0, 0, 0 ); 4630 case float_round_up: 4631 return 4632 aSign ? 
packFloatx80( 1, 0, 0 ) 4633 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4634 } 4635 return packFloatx80( aSign, 0, 0 ); 4636 } 4637 lastBitMask = 1; 4638 lastBitMask <<= 0x403E - aExp; 4639 roundBitsMask = lastBitMask - 1; 4640 z = a; 4641 switch (status->float_rounding_mode) { 4642 case float_round_nearest_even: 4643 z.low += lastBitMask>>1; 4644 if ((z.low & roundBitsMask) == 0) { 4645 z.low &= ~lastBitMask; 4646 } 4647 break; 4648 case float_round_ties_away: 4649 z.low += lastBitMask >> 1; 4650 break; 4651 case float_round_to_zero: 4652 break; 4653 case float_round_up: 4654 if (!extractFloatx80Sign(z)) { 4655 z.low += roundBitsMask; 4656 } 4657 break; 4658 case float_round_down: 4659 if (extractFloatx80Sign(z)) { 4660 z.low += roundBitsMask; 4661 } 4662 break; 4663 default: 4664 abort(); 4665 } 4666 z.low &= ~ roundBitsMask; 4667 if ( z.low == 0 ) { 4668 ++z.high; 4669 z.low = LIT64( 0x8000000000000000 ); 4670 } 4671 if (z.low != a.low) { 4672 status->float_exception_flags |= float_flag_inexact; 4673 } 4674 return z; 4675 4676 } 4677 4678 /*---------------------------------------------------------------------------- 4679 | Returns the result of adding the absolute values of the extended double- 4680 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 4681 | negated before being returned. `zSign' is ignored if the result is a NaN. 4682 | The addition is performed according to the IEC/IEEE Standard for Binary 4683 | Floating-Point Arithmetic. 
4684 *----------------------------------------------------------------------------*/ 4685 4686 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 4687 float_status *status) 4688 { 4689 int32_t aExp, bExp, zExp; 4690 uint64_t aSig, bSig, zSig0, zSig1; 4691 int32_t expDiff; 4692 4693 aSig = extractFloatx80Frac( a ); 4694 aExp = extractFloatx80Exp( a ); 4695 bSig = extractFloatx80Frac( b ); 4696 bExp = extractFloatx80Exp( b ); 4697 expDiff = aExp - bExp; 4698 if ( 0 < expDiff ) { 4699 if ( aExp == 0x7FFF ) { 4700 if ((uint64_t)(aSig << 1)) { 4701 return propagateFloatx80NaN(a, b, status); 4702 } 4703 return a; 4704 } 4705 if ( bExp == 0 ) --expDiff; 4706 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 4707 zExp = aExp; 4708 } 4709 else if ( expDiff < 0 ) { 4710 if ( bExp == 0x7FFF ) { 4711 if ((uint64_t)(bSig << 1)) { 4712 return propagateFloatx80NaN(a, b, status); 4713 } 4714 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4715 } 4716 if ( aExp == 0 ) ++expDiff; 4717 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 4718 zExp = bExp; 4719 } 4720 else { 4721 if ( aExp == 0x7FFF ) { 4722 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 4723 return propagateFloatx80NaN(a, b, status); 4724 } 4725 return a; 4726 } 4727 zSig1 = 0; 4728 zSig0 = aSig + bSig; 4729 if ( aExp == 0 ) { 4730 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 4731 goto roundAndPack; 4732 } 4733 zExp = aExp; 4734 goto shiftRight1; 4735 } 4736 zSig0 = aSig + bSig; 4737 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 4738 shiftRight1: 4739 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 4740 zSig0 |= LIT64( 0x8000000000000000 ); 4741 ++zExp; 4742 roundAndPack: 4743 return roundAndPackFloatx80(status->floatx80_rounding_precision, 4744 zSign, zExp, zSig0, zSig1, status); 4745 } 4746 4747 /*---------------------------------------------------------------------------- 4748 | Returns the result of subtracting the absolute values of the 
extended 4749 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 4750 | difference is negated before being returned. `zSign' is ignored if the 4751 | result is a NaN. The subtraction is performed according to the IEC/IEEE 4752 | Standard for Binary Floating-Point Arithmetic. 4753 *----------------------------------------------------------------------------*/ 4754 4755 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 4756 float_status *status) 4757 { 4758 int32_t aExp, bExp, zExp; 4759 uint64_t aSig, bSig, zSig0, zSig1; 4760 int32_t expDiff; 4761 4762 aSig = extractFloatx80Frac( a ); 4763 aExp = extractFloatx80Exp( a ); 4764 bSig = extractFloatx80Frac( b ); 4765 bExp = extractFloatx80Exp( b ); 4766 expDiff = aExp - bExp; 4767 if ( 0 < expDiff ) goto aExpBigger; 4768 if ( expDiff < 0 ) goto bExpBigger; 4769 if ( aExp == 0x7FFF ) { 4770 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 4771 return propagateFloatx80NaN(a, b, status); 4772 } 4773 float_raise(float_flag_invalid, status); 4774 return floatx80_default_nan(status); 4775 } 4776 if ( aExp == 0 ) { 4777 aExp = 1; 4778 bExp = 1; 4779 } 4780 zSig1 = 0; 4781 if ( bSig < aSig ) goto aBigger; 4782 if ( aSig < bSig ) goto bBigger; 4783 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 4784 bExpBigger: 4785 if ( bExp == 0x7FFF ) { 4786 if ((uint64_t)(bSig << 1)) { 4787 return propagateFloatx80NaN(a, b, status); 4788 } 4789 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4790 } 4791 if ( aExp == 0 ) ++expDiff; 4792 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 4793 bBigger: 4794 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 4795 zExp = bExp; 4796 zSign ^= 1; 4797 goto normalizeRoundAndPack; 4798 aExpBigger: 4799 if ( aExp == 0x7FFF ) { 4800 if ((uint64_t)(aSig << 1)) { 4801 return propagateFloatx80NaN(a, b, status); 4802 } 4803 return a; 4804 } 4805 if ( bExp == 0 ) --expDiff; 4806 shift128RightJamming( bSig, 0, 
expDiff, &bSig, &zSig1 ); 4807 aBigger: 4808 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 4809 zExp = aExp; 4810 normalizeRoundAndPack: 4811 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 4812 zSign, zExp, zSig0, zSig1, status); 4813 } 4814 4815 /*---------------------------------------------------------------------------- 4816 | Returns the result of adding the extended double-precision floating-point 4817 | values `a' and `b'. The operation is performed according to the IEC/IEEE 4818 | Standard for Binary Floating-Point Arithmetic. 4819 *----------------------------------------------------------------------------*/ 4820 4821 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 4822 { 4823 flag aSign, bSign; 4824 4825 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 4826 float_raise(float_flag_invalid, status); 4827 return floatx80_default_nan(status); 4828 } 4829 aSign = extractFloatx80Sign( a ); 4830 bSign = extractFloatx80Sign( b ); 4831 if ( aSign == bSign ) { 4832 return addFloatx80Sigs(a, b, aSign, status); 4833 } 4834 else { 4835 return subFloatx80Sigs(a, b, aSign, status); 4836 } 4837 4838 } 4839 4840 /*---------------------------------------------------------------------------- 4841 | Returns the result of subtracting the extended double-precision floating- 4842 | point values `a' and `b'. The operation is performed according to the 4843 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4844 *----------------------------------------------------------------------------*/ 4845 4846 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 4847 { 4848 flag aSign, bSign; 4849 4850 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 4851 float_raise(float_flag_invalid, status); 4852 return floatx80_default_nan(status); 4853 } 4854 aSign = extractFloatx80Sign( a ); 4855 bSign = extractFloatx80Sign( b ); 4856 if ( aSign == bSign ) { 4857 return subFloatx80Sigs(a, b, aSign, status); 4858 } 4859 else { 4860 return addFloatx80Sigs(a, b, aSign, status); 4861 } 4862 4863 } 4864 4865 /*---------------------------------------------------------------------------- 4866 | Returns the result of multiplying the extended double-precision floating- 4867 | point values `a' and `b'. The operation is performed according to the 4868 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4869 *----------------------------------------------------------------------------*/ 4870 4871 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 4872 { 4873 flag aSign, bSign, zSign; 4874 int32_t aExp, bExp, zExp; 4875 uint64_t aSig, bSig, zSig0, zSig1; 4876 4877 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 4878 float_raise(float_flag_invalid, status); 4879 return floatx80_default_nan(status); 4880 } 4881 aSig = extractFloatx80Frac( a ); 4882 aExp = extractFloatx80Exp( a ); 4883 aSign = extractFloatx80Sign( a ); 4884 bSig = extractFloatx80Frac( b ); 4885 bExp = extractFloatx80Exp( b ); 4886 bSign = extractFloatx80Sign( b ); 4887 zSign = aSign ^ bSign; 4888 if ( aExp == 0x7FFF ) { 4889 if ( (uint64_t) ( aSig<<1 ) 4890 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 4891 return propagateFloatx80NaN(a, b, status); 4892 } 4893 if ( ( bExp | bSig ) == 0 ) goto invalid; 4894 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4895 } 4896 if ( bExp == 0x7FFF ) { 4897 if ((uint64_t)(bSig << 1)) { 4898 
return propagateFloatx80NaN(a, b, status); 4899 } 4900 if ( ( aExp | aSig ) == 0 ) { 4901 invalid: 4902 float_raise(float_flag_invalid, status); 4903 return floatx80_default_nan(status); 4904 } 4905 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4906 } 4907 if ( aExp == 0 ) { 4908 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 4909 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 4910 } 4911 if ( bExp == 0 ) { 4912 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 4913 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 4914 } 4915 zExp = aExp + bExp - 0x3FFE; 4916 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 4917 if ( 0 < (int64_t) zSig0 ) { 4918 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 4919 --zExp; 4920 } 4921 return roundAndPackFloatx80(status->floatx80_rounding_precision, 4922 zSign, zExp, zSig0, zSig1, status); 4923 } 4924 4925 /*---------------------------------------------------------------------------- 4926 | Returns the result of dividing the extended double-precision floating-point 4927 | value `a' by the corresponding value `b'. The operation is performed 4928 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4929 *----------------------------------------------------------------------------*/ 4930 4931 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 4932 { 4933 flag aSign, bSign, zSign; 4934 int32_t aExp, bExp, zExp; 4935 uint64_t aSig, bSig, zSig0, zSig1; 4936 uint64_t rem0, rem1, rem2, term0, term1, term2; 4937 4938 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 4939 float_raise(float_flag_invalid, status); 4940 return floatx80_default_nan(status); 4941 } 4942 aSig = extractFloatx80Frac( a ); 4943 aExp = extractFloatx80Exp( a ); 4944 aSign = extractFloatx80Sign( a ); 4945 bSig = extractFloatx80Frac( b ); 4946 bExp = extractFloatx80Exp( b ); 4947 bSign = extractFloatx80Sign( b ); 4948 zSign = aSign ^ bSign; 4949 if ( aExp == 0x7FFF ) { 4950 if ((uint64_t)(aSig << 1)) { 4951 return propagateFloatx80NaN(a, b, status); 4952 } 4953 if ( bExp == 0x7FFF ) { 4954 if ((uint64_t)(bSig << 1)) { 4955 return propagateFloatx80NaN(a, b, status); 4956 } 4957 goto invalid; 4958 } 4959 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4960 } 4961 if ( bExp == 0x7FFF ) { 4962 if ((uint64_t)(bSig << 1)) { 4963 return propagateFloatx80NaN(a, b, status); 4964 } 4965 return packFloatx80( zSign, 0, 0 ); 4966 } 4967 if ( bExp == 0 ) { 4968 if ( bSig == 0 ) { 4969 if ( ( aExp | aSig ) == 0 ) { 4970 invalid: 4971 float_raise(float_flag_invalid, status); 4972 return floatx80_default_nan(status); 4973 } 4974 float_raise(float_flag_divbyzero, status); 4975 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4976 } 4977 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 4978 } 4979 if ( aExp == 0 ) { 4980 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 4981 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 4982 } 4983 zExp = aExp - bExp + 0x3FFE; 4984 rem1 = 0; 4985 if ( bSig <= aSig ) { 4986 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 4987 ++zExp; 4988 } 4989 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 4990 mul64To128( 
bSig, zSig0, &term0, &term1 ); 4991 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 4992 while ( (int64_t) rem0 < 0 ) { 4993 --zSig0; 4994 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 4995 } 4996 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 4997 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 4998 mul64To128( bSig, zSig1, &term1, &term2 ); 4999 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5000 while ( (int64_t) rem1 < 0 ) { 5001 --zSig1; 5002 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5003 } 5004 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5005 } 5006 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5007 zSign, zExp, zSig0, zSig1, status); 5008 } 5009 5010 /*---------------------------------------------------------------------------- 5011 | Returns the remainder of the extended double-precision floating-point value 5012 | `a' with respect to the corresponding value `b'. The operation is performed 5013 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5014 *----------------------------------------------------------------------------*/ 5015 5016 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 5017 { 5018 flag aSign, zSign; 5019 int32_t aExp, bExp, expDiff; 5020 uint64_t aSig0, aSig1, bSig; 5021 uint64_t q, term0, term1, alternateASig0, alternateASig1; 5022 5023 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5024 float_raise(float_flag_invalid, status); 5025 return floatx80_default_nan(status); 5026 } 5027 aSig0 = extractFloatx80Frac( a ); 5028 aExp = extractFloatx80Exp( a ); 5029 aSign = extractFloatx80Sign( a ); 5030 bSig = extractFloatx80Frac( b ); 5031 bExp = extractFloatx80Exp( b ); 5032 if ( aExp == 0x7FFF ) { 5033 if ( (uint64_t) ( aSig0<<1 ) 5034 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5035 return propagateFloatx80NaN(a, b, status); 5036 } 5037 goto invalid; 5038 } 5039 if ( bExp == 0x7FFF ) { 5040 if ((uint64_t)(bSig << 1)) { 5041 return propagateFloatx80NaN(a, b, status); 
5042 } 5043 return a; 5044 } 5045 if ( bExp == 0 ) { 5046 if ( bSig == 0 ) { 5047 invalid: 5048 float_raise(float_flag_invalid, status); 5049 return floatx80_default_nan(status); 5050 } 5051 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5052 } 5053 if ( aExp == 0 ) { 5054 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5055 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5056 } 5057 bSig |= LIT64( 0x8000000000000000 ); 5058 zSign = aSign; 5059 expDiff = aExp - bExp; 5060 aSig1 = 0; 5061 if ( expDiff < 0 ) { 5062 if ( expDiff < -1 ) return a; 5063 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 5064 expDiff = 0; 5065 } 5066 q = ( bSig <= aSig0 ); 5067 if ( q ) aSig0 -= bSig; 5068 expDiff -= 64; 5069 while ( 0 < expDiff ) { 5070 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5071 q = ( 2 < q ) ? q - 2 : 0; 5072 mul64To128( bSig, q, &term0, &term1 ); 5073 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5074 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 5075 expDiff -= 62; 5076 } 5077 expDiff += 64; 5078 if ( 0 < expDiff ) { 5079 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5080 q = ( 2 < q ) ? q - 2 : 0; 5081 q >>= 64 - expDiff; 5082 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 5083 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5084 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 5085 while ( le128( term0, term1, aSig0, aSig1 ) ) { 5086 ++q; 5087 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5088 } 5089 } 5090 else { 5091 term1 = 0; 5092 term0 = bSig; 5093 } 5094 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 5095 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5096 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5097 && ( q & 1 ) ) 5098 ) { 5099 aSig0 = alternateASig0; 5100 aSig1 = alternateASig1; 5101 zSign = ! 
zSign; 5102 } 5103 return 5104 normalizeRoundAndPackFloatx80( 5105 80, zSign, bExp + expDiff, aSig0, aSig1, status); 5106 5107 } 5108 5109 /*---------------------------------------------------------------------------- 5110 | Returns the square root of the extended double-precision floating-point 5111 | value `a'. The operation is performed according to the IEC/IEEE Standard 5112 | for Binary Floating-Point Arithmetic. 5113 *----------------------------------------------------------------------------*/ 5114 5115 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 5116 { 5117 flag aSign; 5118 int32_t aExp, zExp; 5119 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 5120 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5121 5122 if (floatx80_invalid_encoding(a)) { 5123 float_raise(float_flag_invalid, status); 5124 return floatx80_default_nan(status); 5125 } 5126 aSig0 = extractFloatx80Frac( a ); 5127 aExp = extractFloatx80Exp( a ); 5128 aSign = extractFloatx80Sign( a ); 5129 if ( aExp == 0x7FFF ) { 5130 if ((uint64_t)(aSig0 << 1)) { 5131 return propagateFloatx80NaN(a, a, status); 5132 } 5133 if ( ! 
aSign ) return a; 5134 goto invalid; 5135 } 5136 if ( aSign ) { 5137 if ( ( aExp | aSig0 ) == 0 ) return a; 5138 invalid: 5139 float_raise(float_flag_invalid, status); 5140 return floatx80_default_nan(status); 5141 } 5142 if ( aExp == 0 ) { 5143 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 5144 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5145 } 5146 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 5147 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 5148 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 5149 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5150 doubleZSig0 = zSig0<<1; 5151 mul64To128( zSig0, zSig0, &term0, &term1 ); 5152 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5153 while ( (int64_t) rem0 < 0 ) { 5154 --zSig0; 5155 doubleZSig0 -= 2; 5156 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5157 } 5158 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5159 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 5160 if ( zSig1 == 0 ) zSig1 = 1; 5161 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5162 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5163 mul64To128( zSig1, zSig1, &term2, &term3 ); 5164 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5165 while ( (int64_t) rem1 < 0 ) { 5166 --zSig1; 5167 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5168 term3 |= 1; 5169 term2 |= doubleZSig0; 5170 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5171 } 5172 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5173 } 5174 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 5175 zSig0 |= doubleZSig0; 5176 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5177 0, zExp, zSig0, zSig1, status); 5178 } 5179 5180 /*---------------------------------------------------------------------------- 5181 | Returns 1 if the extended double-precision floating-point value `a' is equal 5182 | to the corresponding value `b', and 0 otherwise. 
The invalid exception is
| raised if either operand is a NaN. Otherwise, the comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

/*
 * An operand is unordered here when floatx80_invalid_encoding() rejects it
 * or when its exponent is 0x7FFF and its significand is non-zero once the
 * top bit is shifted out.  Any unordered operand raises the invalid flag
 * and makes the result 0.
 */
int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
{
    int unordered;

    unordered = floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b);
    if (!unordered && extractFloatx80Exp(a) == 0x7FFF) {
        unordered = ((uint64_t)(extractFloatx80Frac(a) << 1) != 0);
    }
    if (!unordered && extractFloatx80Exp(b) == 0x7FFF) {
        unordered = ((uint64_t)(extractFloatx80Frac(b) << 1) != 0);
    }
    if (unordered) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    if (a.low != b.low) {
        return 0;
    }
    if (a.high == b.high) {
        return 1;
    }
    /* Only the signs may differ, and only when both values are zero. */
    return (a.low == 0) && ((uint16_t)((a.high | b.high) << 1) == 0);
}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is
| less than or equal to the corresponding value `b', and 0 otherwise. The
| invalid exception is raised if either operand is a NaN. The comparison is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
5214 *----------------------------------------------------------------------------*/ 5215 5216 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 5217 { 5218 flag aSign, bSign; 5219 5220 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5221 || (extractFloatx80Exp(a) == 0x7FFF 5222 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5223 || (extractFloatx80Exp(b) == 0x7FFF 5224 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5225 ) { 5226 float_raise(float_flag_invalid, status); 5227 return 0; 5228 } 5229 aSign = extractFloatx80Sign( a ); 5230 bSign = extractFloatx80Sign( b ); 5231 if ( aSign != bSign ) { 5232 return 5233 aSign 5234 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5235 == 0 ); 5236 } 5237 return 5238 aSign ? le128( b.high, b.low, a.high, a.low ) 5239 : le128( a.high, a.low, b.high, b.low ); 5240 5241 } 5242 5243 /*---------------------------------------------------------------------------- 5244 | Returns 1 if the extended double-precision floating-point value `a' is 5245 | less than the corresponding value `b', and 0 otherwise. The invalid 5246 | exception is raised if either operand is a NaN. The comparison is performed 5247 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5248 *----------------------------------------------------------------------------*/ 5249 5250 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 5251 { 5252 flag aSign, bSign; 5253 5254 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5255 || (extractFloatx80Exp(a) == 0x7FFF 5256 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5257 || (extractFloatx80Exp(b) == 0x7FFF 5258 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5259 ) { 5260 float_raise(float_flag_invalid, status); 5261 return 0; 5262 } 5263 aSign = extractFloatx80Sign( a ); 5264 bSign = extractFloatx80Sign( b ); 5265 if ( aSign != bSign ) { 5266 return 5267 aSign 5268 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5269 != 0 ); 5270 } 5271 return 5272 aSign ? lt128( b.high, b.low, a.high, a.low ) 5273 : lt128( a.high, a.low, b.high, b.low ); 5274 5275 } 5276 5277 /*---------------------------------------------------------------------------- 5278 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5279 | cannot be compared, and 0 otherwise. The invalid exception is raised if 5280 | either operand is a NaN. The comparison is performed according to the 5281 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5282 *----------------------------------------------------------------------------*/ 5283 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 5284 { 5285 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5286 || (extractFloatx80Exp(a) == 0x7FFF 5287 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5288 || (extractFloatx80Exp(b) == 0x7FFF 5289 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5290 ) { 5291 float_raise(float_flag_invalid, status); 5292 return 1; 5293 } 5294 return 0; 5295 } 5296 5297 /*---------------------------------------------------------------------------- 5298 | Returns 1 if the extended double-precision floating-point value `a' is 5299 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5300 | cause an exception. The comparison is performed according to the IEC/IEEE 5301 | Standard for Binary Floating-Point Arithmetic. 5302 *----------------------------------------------------------------------------*/ 5303 5304 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 5305 { 5306 5307 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5308 float_raise(float_flag_invalid, status); 5309 return 0; 5310 } 5311 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5312 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5313 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5314 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5315 ) { 5316 if (floatx80_is_signaling_nan(a, status) 5317 || floatx80_is_signaling_nan(b, status)) { 5318 float_raise(float_flag_invalid, status); 5319 } 5320 return 0; 5321 } 5322 return 5323 ( a.low == b.low ) 5324 && ( ( a.high == b.high ) 5325 || ( ( a.low == 0 ) 5326 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5327 ); 5328 5329 } 5330 5331 /*---------------------------------------------------------------------------- 5332 | Returns 1 if the extended double-precision floating-point value `a' is less 5333 | than or equal to the corresponding value `b', and 0 
otherwise. Quiet NaNs
| do not cause an exception. Otherwise, the comparison is performed according
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

/*
 * An invalid encoding raises the invalid flag and returns 0.  A NaN operand
 * returns 0 and raises the invalid flag only when
 * floatx80_is_signaling_nan() reports one of the operands as signaling.
 * The ordering logic itself matches floatx80_le().
 */
int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
{
    flag aSign, bSign;
    int isNaN;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    isNaN = (extractFloatx80Exp(a) == 0x7FFF)
            && ((uint64_t)(extractFloatx80Frac(a) << 1) != 0);
    if (!isNaN) {
        isNaN = (extractFloatx80Exp(b) == 0x7FFF)
                && ((uint64_t)(extractFloatx80Frac(b) << 1) != 0);
    }
    if (isNaN) {
        if (floatx80_is_signaling_nan(a, status)
            || floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return 0;
    }

    aSign = extractFloatx80Sign(a);
    bSign = extractFloatx80Sign(b);
    if (aSign != bSign) {
        if (aSign) {
            return 1;
        }
        /* +x <= -y holds only when both are zero. */
        return (((uint16_t)((a.high | b.high) << 1)) | a.low | b.low) == 0;
    }
    if (aSign) {
        return le128(b.high, b.low, a.high, a.low);
    }
    return le128(a.high, a.low, b.high, b.low);
}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is less
| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
| an exception. Otherwise, the comparison is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5376 *----------------------------------------------------------------------------*/ 5377 5378 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 5379 { 5380 flag aSign, bSign; 5381 5382 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5383 float_raise(float_flag_invalid, status); 5384 return 0; 5385 } 5386 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5387 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5388 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5389 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5390 ) { 5391 if (floatx80_is_signaling_nan(a, status) 5392 || floatx80_is_signaling_nan(b, status)) { 5393 float_raise(float_flag_invalid, status); 5394 } 5395 return 0; 5396 } 5397 aSign = extractFloatx80Sign( a ); 5398 bSign = extractFloatx80Sign( b ); 5399 if ( aSign != bSign ) { 5400 return 5401 aSign 5402 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5403 != 0 ); 5404 } 5405 return 5406 aSign ? lt128( b.high, b.low, a.high, a.low ) 5407 : lt128( a.high, a.low, b.high, b.low ); 5408 5409 } 5410 5411 /*---------------------------------------------------------------------------- 5412 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5413 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 5414 | The comparison is performed according to the IEC/IEEE Standard for Binary 5415 | Floating-Point Arithmetic. 
5416 *----------------------------------------------------------------------------*/ 5417 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 5418 { 5419 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5420 float_raise(float_flag_invalid, status); 5421 return 1; 5422 } 5423 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5424 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5425 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5426 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5427 ) { 5428 if (floatx80_is_signaling_nan(a, status) 5429 || floatx80_is_signaling_nan(b, status)) { 5430 float_raise(float_flag_invalid, status); 5431 } 5432 return 1; 5433 } 5434 return 0; 5435 } 5436 5437 /*---------------------------------------------------------------------------- 5438 | Returns the result of converting the quadruple-precision floating-point 5439 | value `a' to the 32-bit two's complement integer format. The conversion 5440 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5441 | Arithmetic---which means in particular that the conversion is rounded 5442 | according to the current rounding mode. If `a' is a NaN, the largest 5443 | positive integer is returned. Otherwise, if the conversion overflows, the 5444 | largest integer with the same sign as `a' is returned. 
5445 *----------------------------------------------------------------------------*/ 5446 5447 int32_t float128_to_int32(float128 a, float_status *status) 5448 { 5449 flag aSign; 5450 int32_t aExp, shiftCount; 5451 uint64_t aSig0, aSig1; 5452 5453 aSig1 = extractFloat128Frac1( a ); 5454 aSig0 = extractFloat128Frac0( a ); 5455 aExp = extractFloat128Exp( a ); 5456 aSign = extractFloat128Sign( a ); 5457 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 5458 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5459 aSig0 |= ( aSig1 != 0 ); 5460 shiftCount = 0x4028 - aExp; 5461 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 5462 return roundAndPackInt32(aSign, aSig0, status); 5463 5464 } 5465 5466 /*---------------------------------------------------------------------------- 5467 | Returns the result of converting the quadruple-precision floating-point 5468 | value `a' to the 32-bit two's complement integer format. The conversion 5469 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5470 | Arithmetic, except that the conversion is always rounded toward zero. If 5471 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 5472 | conversion overflows, the largest integer with the same sign as `a' is 5473 | returned. 
5474 *----------------------------------------------------------------------------*/ 5475 5476 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 5477 { 5478 flag aSign; 5479 int32_t aExp, shiftCount; 5480 uint64_t aSig0, aSig1, savedASig; 5481 int32_t z; 5482 5483 aSig1 = extractFloat128Frac1( a ); 5484 aSig0 = extractFloat128Frac0( a ); 5485 aExp = extractFloat128Exp( a ); 5486 aSign = extractFloat128Sign( a ); 5487 aSig0 |= ( aSig1 != 0 ); 5488 if ( 0x401E < aExp ) { 5489 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 5490 goto invalid; 5491 } 5492 else if ( aExp < 0x3FFF ) { 5493 if (aExp || aSig0) { 5494 status->float_exception_flags |= float_flag_inexact; 5495 } 5496 return 0; 5497 } 5498 aSig0 |= LIT64( 0x0001000000000000 ); 5499 shiftCount = 0x402F - aExp; 5500 savedASig = aSig0; 5501 aSig0 >>= shiftCount; 5502 z = aSig0; 5503 if ( aSign ) z = - z; 5504 if ( ( z < 0 ) ^ aSign ) { 5505 invalid: 5506 float_raise(float_flag_invalid, status); 5507 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5508 } 5509 if ( ( aSig0<<shiftCount ) != savedASig ) { 5510 status->float_exception_flags |= float_flag_inexact; 5511 } 5512 return z; 5513 5514 } 5515 5516 /*---------------------------------------------------------------------------- 5517 | Returns the result of converting the quadruple-precision floating-point 5518 | value `a' to the 64-bit two's complement integer format. The conversion 5519 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5520 | Arithmetic---which means in particular that the conversion is rounded 5521 | according to the current rounding mode. If `a' is a NaN, the largest 5522 | positive integer is returned. Otherwise, if the conversion overflows, the 5523 | largest integer with the same sign as `a' is returned. 
5524 *----------------------------------------------------------------------------*/ 5525 5526 int64_t float128_to_int64(float128 a, float_status *status) 5527 { 5528 flag aSign; 5529 int32_t aExp, shiftCount; 5530 uint64_t aSig0, aSig1; 5531 5532 aSig1 = extractFloat128Frac1( a ); 5533 aSig0 = extractFloat128Frac0( a ); 5534 aExp = extractFloat128Exp( a ); 5535 aSign = extractFloat128Sign( a ); 5536 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5537 shiftCount = 0x402F - aExp; 5538 if ( shiftCount <= 0 ) { 5539 if ( 0x403E < aExp ) { 5540 float_raise(float_flag_invalid, status); 5541 if ( ! aSign 5542 || ( ( aExp == 0x7FFF ) 5543 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 5544 ) 5545 ) { 5546 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5547 } 5548 return (int64_t) LIT64( 0x8000000000000000 ); 5549 } 5550 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 5551 } 5552 else { 5553 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 5554 } 5555 return roundAndPackInt64(aSign, aSig0, aSig1, status); 5556 5557 } 5558 5559 /*---------------------------------------------------------------------------- 5560 | Returns the result of converting the quadruple-precision floating-point 5561 | value `a' to the 64-bit two's complement integer format. The conversion 5562 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5563 | Arithmetic, except that the conversion is always rounded toward zero. 5564 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 5565 | the conversion overflows, the largest integer with the same sign as `a' is 5566 | returned. 
5567 *----------------------------------------------------------------------------*/ 5568 5569 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 5570 { 5571 flag aSign; 5572 int32_t aExp, shiftCount; 5573 uint64_t aSig0, aSig1; 5574 int64_t z; 5575 5576 aSig1 = extractFloat128Frac1( a ); 5577 aSig0 = extractFloat128Frac0( a ); 5578 aExp = extractFloat128Exp( a ); 5579 aSign = extractFloat128Sign( a ); 5580 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5581 shiftCount = aExp - 0x402F; 5582 if ( 0 < shiftCount ) { 5583 if ( 0x403E <= aExp ) { 5584 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 5585 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 5586 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 5587 if (aSig1) { 5588 status->float_exception_flags |= float_flag_inexact; 5589 } 5590 } 5591 else { 5592 float_raise(float_flag_invalid, status); 5593 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 5594 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5595 } 5596 } 5597 return (int64_t) LIT64( 0x8000000000000000 ); 5598 } 5599 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 5600 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 5601 status->float_exception_flags |= float_flag_inexact; 5602 } 5603 } 5604 else { 5605 if ( aExp < 0x3FFF ) { 5606 if ( aExp | aSig0 | aSig1 ) { 5607 status->float_exception_flags |= float_flag_inexact; 5608 } 5609 return 0; 5610 } 5611 z = aSig0>>( - shiftCount ); 5612 if ( aSig1 5613 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 5614 status->float_exception_flags |= float_flag_inexact; 5615 } 5616 } 5617 if ( aSign ) z = - z; 5618 return z; 5619 5620 } 5621 5622 /*---------------------------------------------------------------------------- 5623 | Returns the result of converting the quadruple-precision floating-point value 5624 | `a' to the 64-bit unsigned integer format. 
The conversion is 5625 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5626 | Arithmetic---which means in particular that the conversion is rounded 5627 | according to the current rounding mode. If `a' is a NaN, the largest 5628 | positive integer is returned. If the conversion overflows, the 5629 | largest unsigned integer is returned. If 'a' is negative, the value is 5630 | rounded and zero is returned; negative values that do not round to zero 5631 | will raise the inexact exception. 5632 *----------------------------------------------------------------------------*/ 5633 5634 uint64_t float128_to_uint64(float128 a, float_status *status) 5635 { 5636 flag aSign; 5637 int aExp; 5638 int shiftCount; 5639 uint64_t aSig0, aSig1; 5640 5641 aSig0 = extractFloat128Frac0(a); 5642 aSig1 = extractFloat128Frac1(a); 5643 aExp = extractFloat128Exp(a); 5644 aSign = extractFloat128Sign(a); 5645 if (aSign && (aExp > 0x3FFE)) { 5646 float_raise(float_flag_invalid, status); 5647 if (float128_is_any_nan(a)) { 5648 return LIT64(0xFFFFFFFFFFFFFFFF); 5649 } else { 5650 return 0; 5651 } 5652 } 5653 if (aExp) { 5654 aSig0 |= LIT64(0x0001000000000000); 5655 } 5656 shiftCount = 0x402F - aExp; 5657 if (shiftCount <= 0) { 5658 if (0x403E < aExp) { 5659 float_raise(float_flag_invalid, status); 5660 return LIT64(0xFFFFFFFFFFFFFFFF); 5661 } 5662 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 5663 } else { 5664 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 5665 } 5666 return roundAndPackUint64(aSign, aSig0, aSig1, status); 5667 } 5668 5669 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 5670 { 5671 uint64_t v; 5672 signed char current_rounding_mode = status->float_rounding_mode; 5673 5674 set_float_rounding_mode(float_round_to_zero, status); 5675 v = float128_to_uint64(a, status); 5676 set_float_rounding_mode(current_rounding_mode, status); 5677 5678 return v; 5679 } 5680 5681 
/*---------------------------------------------------------------------------- 5682 | Returns the result of converting the quadruple-precision floating-point 5683 | value `a' to the 32-bit unsigned integer format. The conversion 5684 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5685 | Arithmetic except that the conversion is always rounded toward zero. 5686 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 5687 | if the conversion overflows, the largest unsigned integer is returned. 5688 | If 'a' is negative, the value is rounded and zero is returned; negative 5689 | values that do not round to zero will raise the inexact exception. 5690 *----------------------------------------------------------------------------*/ 5691 5692 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 5693 { 5694 uint64_t v; 5695 uint32_t res; 5696 int old_exc_flags = get_float_exception_flags(status); 5697 5698 v = float128_to_uint64_round_to_zero(a, status); 5699 if (v > 0xffffffff) { 5700 res = 0xffffffff; 5701 } else { 5702 return v; 5703 } 5704 set_float_exception_flags(old_exc_flags, status); 5705 float_raise(float_flag_invalid, status); 5706 return res; 5707 } 5708 5709 /*---------------------------------------------------------------------------- 5710 | Returns the result of converting the quadruple-precision floating-point 5711 | value `a' to the single-precision floating-point format. The conversion 5712 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5713 | Arithmetic. 
5714 *----------------------------------------------------------------------------*/ 5715 5716 float32 float128_to_float32(float128 a, float_status *status) 5717 { 5718 flag aSign; 5719 int32_t aExp; 5720 uint64_t aSig0, aSig1; 5721 uint32_t zSig; 5722 5723 aSig1 = extractFloat128Frac1( a ); 5724 aSig0 = extractFloat128Frac0( a ); 5725 aExp = extractFloat128Exp( a ); 5726 aSign = extractFloat128Sign( a ); 5727 if ( aExp == 0x7FFF ) { 5728 if ( aSig0 | aSig1 ) { 5729 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 5730 } 5731 return packFloat32( aSign, 0xFF, 0 ); 5732 } 5733 aSig0 |= ( aSig1 != 0 ); 5734 shift64RightJamming( aSig0, 18, &aSig0 ); 5735 zSig = aSig0; 5736 if ( aExp || zSig ) { 5737 zSig |= 0x40000000; 5738 aExp -= 0x3F81; 5739 } 5740 return roundAndPackFloat32(aSign, aExp, zSig, status); 5741 5742 } 5743 5744 /*---------------------------------------------------------------------------- 5745 | Returns the result of converting the quadruple-precision floating-point 5746 | value `a' to the double-precision floating-point format. The conversion 5747 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5748 | Arithmetic. 
5749 *----------------------------------------------------------------------------*/ 5750 5751 float64 float128_to_float64(float128 a, float_status *status) 5752 { 5753 flag aSign; 5754 int32_t aExp; 5755 uint64_t aSig0, aSig1; 5756 5757 aSig1 = extractFloat128Frac1( a ); 5758 aSig0 = extractFloat128Frac0( a ); 5759 aExp = extractFloat128Exp( a ); 5760 aSign = extractFloat128Sign( a ); 5761 if ( aExp == 0x7FFF ) { 5762 if ( aSig0 | aSig1 ) { 5763 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 5764 } 5765 return packFloat64( aSign, 0x7FF, 0 ); 5766 } 5767 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 5768 aSig0 |= ( aSig1 != 0 ); 5769 if ( aExp || aSig0 ) { 5770 aSig0 |= LIT64( 0x4000000000000000 ); 5771 aExp -= 0x3C01; 5772 } 5773 return roundAndPackFloat64(aSign, aExp, aSig0, status); 5774 5775 } 5776 5777 /*---------------------------------------------------------------------------- 5778 | Returns the result of converting the quadruple-precision floating-point 5779 | value `a' to the extended double-precision floating-point format. The 5780 | conversion is performed according to the IEC/IEEE Standard for Binary 5781 | Floating-Point Arithmetic. 
5782 *----------------------------------------------------------------------------*/ 5783 5784 floatx80 float128_to_floatx80(float128 a, float_status *status) 5785 { 5786 flag aSign; 5787 int32_t aExp; 5788 uint64_t aSig0, aSig1; 5789 5790 aSig1 = extractFloat128Frac1( a ); 5791 aSig0 = extractFloat128Frac0( a ); 5792 aExp = extractFloat128Exp( a ); 5793 aSign = extractFloat128Sign( a ); 5794 if ( aExp == 0x7FFF ) { 5795 if ( aSig0 | aSig1 ) { 5796 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 5797 } 5798 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5799 } 5800 if ( aExp == 0 ) { 5801 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 5802 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5803 } 5804 else { 5805 aSig0 |= LIT64( 0x0001000000000000 ); 5806 } 5807 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 5808 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 5809 5810 } 5811 5812 /*---------------------------------------------------------------------------- 5813 | Rounds the quadruple-precision floating-point value `a' to an integer, and 5814 | returns the result as a quadruple-precision floating-point value. The 5815 | operation is performed according to the IEC/IEEE Standard for Binary 5816 | Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

/*
 * The function has three regimes, selected by the biased exponent:
 *   - aExp >= 0x402F: the rounding position is in the low 64-bit word
 *     (or, for aExp == 0x402F, at bit 0 of the high word);
 *   - aExp <  0x3FFF: |a| < 1, so the result is +/-0 or +/-1;
 *   - otherwise: the rounding position is inside the high word and the low
 *     word becomes zero.
 * Rounding modes not listed in the two mask-based switches call abort().
 */
float128 float128_round_to_int(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        if ( 0x406F <= aExp ) {
            /* No fraction bits below the units position: return `a'
             * unchanged, unless it is a NaN that must be propagated. */
            if (    ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            return a;
        }
        /* Shift in two steps so that aExp == 0x402F (total shift of 64)
         * yields lastBitMask == 0 instead of an out-of-range shift; the
         * zero value is tested for below. */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                /* Add half a unit, then clear the unit bit on an exact tie. */
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /* Unit bit is bit 0 of z.high; the half bit is the top bit
                 * of z.low. */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            /* Only positive values move away from zero. */
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            /* Only negative values move away from zero. */
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        /* Discard everything below the units position. */
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Zeros of either sign are returned as they are. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            status->float_exception_flags |= float_flag_inexact;
            aSign = extractFloat128Sign( a );
            switch (status->float_rounding_mode) {
            case float_round_nearest_even:
                /* Exponent 0x3FFE with a non-zero fraction rounds to 1;
                 * a zero fraction falls through to the signed zero. */
                if (    ( aExp == 0x3FFE )
                     && (   extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
            case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
            case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );
            }
            /* Remaining cases (including modes with no case above) yield a
             * zero carrying the sign of `a'. */
            return packFloat128( aSign, 0, 0, 0 );
        }
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            /* Exact tie only if the low word was zero as well. */
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                /* Fold a non-zero low word into bit 0 so that it takes
                 * part in the round-away addition. */
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        z.high &= ~ roundBitsMask;
    }
    /* Any change of the bit pattern means the input was not an integer. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the quadruple-precision
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
| before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* `a' has the larger exponent: align `b' to it. */
        if ( aExp == 0x7FFF ) {
            if (aSig0 | aSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) {
            /* No implicit bit is added for `b'; shift one place less. */
            --expDiff;
        }
        else {
            bSig0 |= LIT64( 0x0001000000000000 );
        }
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* `b' has the larger exponent: align `a' to it. */
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig0 |= LIT64( 0x0001000000000000 );
        }
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponents. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) {
            /* Both exponents are zero: the raw fraction sum is packed
             * directly, unless flush_to_zero is set, in which case a
             * signed zero is returned (output_denormal is raised when
             * the sum was non-zero). */
            if (status->flush_to_zero) {
                if (zSig0 | zSig1) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat128(zSign, 0, 0, 0);
            }
            return packFloat128( zSign, 0, zSig0, zSig1 );
        }
        zSig2 = 0;
        /* Sum of the two implicit integer bits (neither was ORed in). */
        zSig0 |= LIT64( 0x0002000000000000 );
        zExp = aExp;
        goto shiftRight1;
    }
    /* Unequal exponents: only the operand that was shifted may have had its
     * implicit bit set above; set it for `a' here and add. */
    aSig0 |= LIT64( 0x0001000000000000 );
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    --zExp;
    /* No carry past the integer bit: round with the decremented zExp. */
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the quadruple-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    /* Both fractions are pre-shifted left by 14; the matching "- 14" is
     * applied to the exponent at normalizeRoundAndPack. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Infinity minus infinity. */
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        aExp = 1;
        bExp = 1;
    }
    /* Compare magnitudes word by word to decide the subtraction order. */
    if ( bSig0 < aSig0 ) goto aBigger;
    if ( aSig0 < bSig0 ) goto bBigger;
    if ( bSig1 < aSig1 ) goto aBigger;
    if ( aSig1 < bSig1 ) goto bBigger;
    /* Equal magnitudes: zero result, negative only when rounding down. */
    return packFloat128(status->float_rounding_mode == float_round_down,
                        0, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        ++expDiff;
    }
    else {
        /* Implicit integer bit, at its position after the 14-bit shift. */
        aSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
    bSig0 |= LIT64( 0x4000000000000000 );
 bBigger:
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zExp = bExp;
    /* |b| > |a|: the difference takes the opposite sign. */
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
    aSig0 |= LIT64( 0x4000000000000000 );
 aBigger:
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
                                         status);

}

/*----------------------------------------------------------------------------
| Returns the result of adding the quadruple-precision floating-point values
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
6133 *----------------------------------------------------------------------------*/ 6134 6135 float128 float128_add(float128 a, float128 b, float_status *status) 6136 { 6137 flag aSign, bSign; 6138 6139 aSign = extractFloat128Sign( a ); 6140 bSign = extractFloat128Sign( b ); 6141 if ( aSign == bSign ) { 6142 return addFloat128Sigs(a, b, aSign, status); 6143 } 6144 else { 6145 return subFloat128Sigs(a, b, aSign, status); 6146 } 6147 6148 } 6149 6150 /*---------------------------------------------------------------------------- 6151 | Returns the result of subtracting the quadruple-precision floating-point 6152 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6153 | Standard for Binary Floating-Point Arithmetic. 6154 *----------------------------------------------------------------------------*/ 6155 6156 float128 float128_sub(float128 a, float128 b, float_status *status) 6157 { 6158 flag aSign, bSign; 6159 6160 aSign = extractFloat128Sign( a ); 6161 bSign = extractFloat128Sign( b ); 6162 if ( aSign == bSign ) { 6163 return subFloat128Sigs(a, b, aSign, status); 6164 } 6165 else { 6166 return addFloat128Sigs(a, b, aSign, status); 6167 } 6168 6169 } 6170 6171 /*---------------------------------------------------------------------------- 6172 | Returns the result of multiplying the quadruple-precision floating-point 6173 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6174 | Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float128 float128_mul(float128 a, float128 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* `a' is a NaN or an infinity. */
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Infinity times zero is invalid. */
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
            /* Shared exit for both zero-times-infinity cases. */
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    zExp = aExp + bExp - 0x4000;
    aSig0 |= LIT64( 0x0001000000000000 );
    /* Note that the integer bit of `b' is never ORed into bSig0 here. */
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
    /* NOTE(review): this addition presumably supplies the a * 1 term for
     * the integer bit of `b' that was left out of the multiplication. */
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
    /* Keep only "non-zero or not" information from the lowest word. */
    zSig2 |= ( zSig3 != 0 );
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
        shift128ExtraRightJamming(
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
        ++zExp;
    }
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'.  The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_div(float128 a, float128 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* Infinity divided by infinity. */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Finite divided by infinity gives a signed zero. */
        return packFloat128( zSign, 0, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
                /* Zero divided by zero; also the target of the goto above. */
 invalid:
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* Set the implicit integer bits and left-justify both significands. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    /* Halve the dividend when it is not smaller than the divisor. */
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* Upper 64 quotient bits: estimate, then step down while the remainder
     * is negative. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Lower 64 quotient bits.  The estimate is corrected only when its low
     * 14 bits are <= 4; in that case a non-zero remainder is also recorded
     * in bit 0 of zSig1. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_rem(float128 a, float128 b, float_status *status)
{
    flag aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Remainder of an infinity is invalid. */
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Finite `a' with infinite `b': `a' is returned unchanged. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* Remainder with respect to zero; also the goto target above. */
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* Exponent of `a' more than one below that of `b': return `a' as is. */
    if ( expDiff < -1 ) return a;
    /* `a' is shifted one place less when expDiff == -1. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    /* First quotient bit. */
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /* Reduce 61 exponent steps per iteration.  Each quotient estimate is
     * lowered by 4 (clamped at 0) before it is used. */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /* Final partial step: keep only the quotient bits still needed. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Subtract `b' until the remainder goes negative, remembering the last
     * non-negative value as the alternate candidate. */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* The sign of (negative candidate + alternate) tells which of the two
     * is smaller in magnitude; on an exact tie the parity of q decides. */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if (    ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    /* A negative remainder is negated and flips the result sign. */
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}

/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_sqrt(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        /* +infinity is returned as is; -infinity is invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* Negative zero is returned unchanged; any other negative input
         * is invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Halve the unbiased exponent. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= LIT64( 0x0001000000000000 );
    /* 32-bit seed estimate, refined to 64 bits with one division step. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    /* The shift depends on the exponent parity (one place less when odd). */
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Remainder a - zSig0^2; step zSig0 down while it is negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low 64 result bits.  The estimate is corrected only when its low 13
     * bits are <= 5; in that case a non-zero remainder is also recorded in
     * bit 0 of zSig1. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    /* The result sign is always passed as 0 (positive). */
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
| the corresponding value `b', and 0 otherwise.  The invalid exception is
| raised if either operand is a NaN.  Otherwise, the comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int float128_eq(float128 a, float128 b, float_status *status)
{

    /* Any NaN operand (maximum exponent, non-zero fraction) raises invalid
     * and compares unequal. */
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
       ) {
        float_raise(float_flag_invalid, status);
        return 0;
    }
    /* Equal when the bit patterns match, or when both operands are zeros
     * (the shift by one discards the sign bits, so +0 == -0). */
    return
           ( a.low == b.low )
        && (    ( a.high == b.high )
             || (    ( a.low == 0 )
                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
           );

}

/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is less than
| or equal to the corresponding value `b', and 0 otherwise.  The invalid
| exception is raised if either operand is a NaN.  The comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6528 *----------------------------------------------------------------------------*/ 6529 6530 int float128_le(float128 a, float128 b, float_status *status) 6531 { 6532 flag aSign, bSign; 6533 6534 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6535 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6536 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6537 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6538 ) { 6539 float_raise(float_flag_invalid, status); 6540 return 0; 6541 } 6542 aSign = extractFloat128Sign( a ); 6543 bSign = extractFloat128Sign( b ); 6544 if ( aSign != bSign ) { 6545 return 6546 aSign 6547 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6548 == 0 ); 6549 } 6550 return 6551 aSign ? le128( b.high, b.low, a.high, a.low ) 6552 : le128( a.high, a.low, b.high, b.low ); 6553 6554 } 6555 6556 /*---------------------------------------------------------------------------- 6557 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6558 | the corresponding value `b', and 0 otherwise. The invalid exception is 6559 | raised if either operand is a NaN. The comparison is performed according 6560 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6561 *----------------------------------------------------------------------------*/ 6562 6563 int float128_lt(float128 a, float128 b, float_status *status) 6564 { 6565 flag aSign, bSign; 6566 6567 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6568 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6569 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6570 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6571 ) { 6572 float_raise(float_flag_invalid, status); 6573 return 0; 6574 } 6575 aSign = extractFloat128Sign( a ); 6576 bSign = extractFloat128Sign( b ); 6577 if ( aSign != bSign ) { 6578 return 6579 aSign 6580 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6581 != 0 ); 6582 } 6583 return 6584 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 6585 : lt128( a.high, a.low, b.high, b.low ); 6586 6587 } 6588 6589 /*---------------------------------------------------------------------------- 6590 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6591 | be compared, and 0 otherwise. The invalid exception is raised if either 6592 | operand is a NaN. The comparison is performed according to the IEC/IEEE 6593 | Standard for Binary Floating-Point Arithmetic. 6594 *----------------------------------------------------------------------------*/ 6595 6596 int float128_unordered(float128 a, float128 b, float_status *status) 6597 { 6598 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6599 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6600 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6601 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6602 ) { 6603 float_raise(float_flag_invalid, status); 6604 return 1; 6605 } 6606 return 0; 6607 } 6608 6609 /*---------------------------------------------------------------------------- 6610 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6611 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6612 | exception. The comparison is performed according to the IEC/IEEE Standard 6613 | for Binary Floating-Point Arithmetic. 
6614 *----------------------------------------------------------------------------*/ 6615 6616 int float128_eq_quiet(float128 a, float128 b, float_status *status) 6617 { 6618 6619 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6620 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6621 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6622 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6623 ) { 6624 if (float128_is_signaling_nan(a, status) 6625 || float128_is_signaling_nan(b, status)) { 6626 float_raise(float_flag_invalid, status); 6627 } 6628 return 0; 6629 } 6630 return 6631 ( a.low == b.low ) 6632 && ( ( a.high == b.high ) 6633 || ( ( a.low == 0 ) 6634 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6635 ); 6636 6637 } 6638 6639 /*---------------------------------------------------------------------------- 6640 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6641 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 6642 | cause an exception. Otherwise, the comparison is performed according to the 6643 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6644 *----------------------------------------------------------------------------*/ 6645 6646 int float128_le_quiet(float128 a, float128 b, float_status *status) 6647 { 6648 flag aSign, bSign; 6649 6650 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6651 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6652 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6653 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6654 ) { 6655 if (float128_is_signaling_nan(a, status) 6656 || float128_is_signaling_nan(b, status)) { 6657 float_raise(float_flag_invalid, status); 6658 } 6659 return 0; 6660 } 6661 aSign = extractFloat128Sign( a ); 6662 bSign = extractFloat128Sign( b ); 6663 if ( aSign != bSign ) { 6664 return 6665 aSign 6666 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6667 == 0 ); 6668 } 6669 return 6670 aSign ? le128( b.high, b.low, a.high, a.low ) 6671 : le128( a.high, a.low, b.high, b.low ); 6672 6673 } 6674 6675 /*---------------------------------------------------------------------------- 6676 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6677 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6678 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 6679 | Standard for Binary Floating-Point Arithmetic. 
6680 *----------------------------------------------------------------------------*/ 6681 6682 int float128_lt_quiet(float128 a, float128 b, float_status *status) 6683 { 6684 flag aSign, bSign; 6685 6686 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6687 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6688 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6689 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6690 ) { 6691 if (float128_is_signaling_nan(a, status) 6692 || float128_is_signaling_nan(b, status)) { 6693 float_raise(float_flag_invalid, status); 6694 } 6695 return 0; 6696 } 6697 aSign = extractFloat128Sign( a ); 6698 bSign = extractFloat128Sign( b ); 6699 if ( aSign != bSign ) { 6700 return 6701 aSign 6702 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6703 != 0 ); 6704 } 6705 return 6706 aSign ? lt128( b.high, b.low, a.high, a.low ) 6707 : lt128( a.high, a.low, b.high, b.low ); 6708 6709 } 6710 6711 /*---------------------------------------------------------------------------- 6712 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6713 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 6714 | comparison is performed according to the IEC/IEEE Standard for Binary 6715 | Floating-Point Arithmetic. 
6716 *----------------------------------------------------------------------------*/ 6717 6718 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 6719 { 6720 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6721 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6722 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6723 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6724 ) { 6725 if (float128_is_signaling_nan(a, status) 6726 || float128_is_signaling_nan(b, status)) { 6727 float_raise(float_flag_invalid, status); 6728 } 6729 return 1; 6730 } 6731 return 0; 6732 } 6733 6734 #define COMPARE(s, nan_exp) \ 6735 static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\ 6736 int is_quiet, float_status *status) \ 6737 { \ 6738 flag aSign, bSign; \ 6739 uint ## s ## _t av, bv; \ 6740 a = float ## s ## _squash_input_denormal(a, status); \ 6741 b = float ## s ## _squash_input_denormal(b, status); \ 6742 \ 6743 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \ 6744 extractFloat ## s ## Frac( a ) ) || \ 6745 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \ 6746 extractFloat ## s ## Frac( b ) )) { \ 6747 if (!is_quiet || \ 6748 float ## s ## _is_signaling_nan(a, status) || \ 6749 float ## s ## _is_signaling_nan(b, status)) { \ 6750 float_raise(float_flag_invalid, status); \ 6751 } \ 6752 return float_relation_unordered; \ 6753 } \ 6754 aSign = extractFloat ## s ## Sign( a ); \ 6755 bSign = extractFloat ## s ## Sign( b ); \ 6756 av = float ## s ## _val(a); \ 6757 bv = float ## s ## _val(b); \ 6758 if ( aSign != bSign ) { \ 6759 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \ 6760 /* zero case */ \ 6761 return float_relation_equal; \ 6762 } else { \ 6763 return 1 - (2 * aSign); \ 6764 } \ 6765 } else { \ 6766 if (av == bv) { \ 6767 return float_relation_equal; \ 6768 } else { \ 6769 return 1 - 2 * (aSign ^ ( av < bv )); \ 6770 } \ 6771 } \ 6772 } \ 6773 \ 6774 int float ## s ## _compare(float ## s a, 
float ## s b, float_status *status) \ 6775 { \ 6776 return float ## s ## _compare_internal(a, b, 0, status); \ 6777 } \ 6778 \ 6779 int float ## s ## _compare_quiet(float ## s a, float ## s b, \ 6780 float_status *status) \ 6781 { \ 6782 return float ## s ## _compare_internal(a, b, 1, status); \ 6783 } 6784 6785 COMPARE(32, 0xff) 6786 COMPARE(64, 0x7ff) 6787 6788 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 6789 int is_quiet, float_status *status) 6790 { 6791 flag aSign, bSign; 6792 6793 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6794 float_raise(float_flag_invalid, status); 6795 return float_relation_unordered; 6796 } 6797 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 6798 ( extractFloatx80Frac( a )<<1 ) ) || 6799 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 6800 ( extractFloatx80Frac( b )<<1 ) )) { 6801 if (!is_quiet || 6802 floatx80_is_signaling_nan(a, status) || 6803 floatx80_is_signaling_nan(b, status)) { 6804 float_raise(float_flag_invalid, status); 6805 } 6806 return float_relation_unordered; 6807 } 6808 aSign = extractFloatx80Sign( a ); 6809 bSign = extractFloatx80Sign( b ); 6810 if ( aSign != bSign ) { 6811 6812 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 6813 ( ( a.low | b.low ) == 0 ) ) { 6814 /* zero case */ 6815 return float_relation_equal; 6816 } else { 6817 return 1 - (2 * aSign); 6818 } 6819 } else { 6820 if (a.low == b.low && a.high == b.high) { 6821 return float_relation_equal; 6822 } else { 6823 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 6824 } 6825 } 6826 } 6827 6828 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 6829 { 6830 return floatx80_compare_internal(a, b, 0, status); 6831 } 6832 6833 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 6834 { 6835 return floatx80_compare_internal(a, b, 1, status); 6836 } 6837 6838 static inline int float128_compare_internal(float128 a, float128 b, 6839 int is_quiet, float_status 
*status) 6840 { 6841 flag aSign, bSign; 6842 6843 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 6844 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 6845 ( ( extractFloat128Exp( b ) == 0x7fff ) && 6846 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 6847 if (!is_quiet || 6848 float128_is_signaling_nan(a, status) || 6849 float128_is_signaling_nan(b, status)) { 6850 float_raise(float_flag_invalid, status); 6851 } 6852 return float_relation_unordered; 6853 } 6854 aSign = extractFloat128Sign( a ); 6855 bSign = extractFloat128Sign( b ); 6856 if ( aSign != bSign ) { 6857 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 6858 /* zero case */ 6859 return float_relation_equal; 6860 } else { 6861 return 1 - (2 * aSign); 6862 } 6863 } else { 6864 if (a.low == b.low && a.high == b.high) { 6865 return float_relation_equal; 6866 } else { 6867 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 6868 } 6869 } 6870 } 6871 6872 int float128_compare(float128 a, float128 b, float_status *status) 6873 { 6874 return float128_compare_internal(a, b, 0, status); 6875 } 6876 6877 int float128_compare_quiet(float128 a, float128 b, float_status *status) 6878 { 6879 return float128_compare_internal(a, b, 1, status); 6880 } 6881 6882 /* min() and max() functions. These can't be implemented as 6883 * 'compare and pick one input' because that would mishandle 6884 * NaNs and +0 vs -0. 6885 * 6886 * minnum() and maxnum() functions. These are similar to the min() 6887 * and max() functions but if one of the arguments is a QNaN and 6888 * the other is numerical then the numerical argument is returned. 6889 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 6890 * and maxNum() operations. min() and max() are the typical min/max 6891 * semantics provided by many CPUs which predate that specification. 6892 * 6893 * minnummag() and maxnummag() functions correspond to minNumMag() 6894 * and minNumMag() from the IEEE-754 2008. 
6895 */ 6896 #define MINMAX(s) \ 6897 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \ 6898 int ismin, int isieee, \ 6899 int ismag, \ 6900 float_status *status) \ 6901 { \ 6902 flag aSign, bSign; \ 6903 uint ## s ## _t av, bv, aav, abv; \ 6904 a = float ## s ## _squash_input_denormal(a, status); \ 6905 b = float ## s ## _squash_input_denormal(b, status); \ 6906 if (float ## s ## _is_any_nan(a) || \ 6907 float ## s ## _is_any_nan(b)) { \ 6908 if (isieee) { \ 6909 if (float ## s ## _is_quiet_nan(a, status) && \ 6910 !float ## s ##_is_any_nan(b)) { \ 6911 return b; \ 6912 } else if (float ## s ## _is_quiet_nan(b, status) && \ 6913 !float ## s ## _is_any_nan(a)) { \ 6914 return a; \ 6915 } \ 6916 } \ 6917 return propagateFloat ## s ## NaN(a, b, status); \ 6918 } \ 6919 aSign = extractFloat ## s ## Sign(a); \ 6920 bSign = extractFloat ## s ## Sign(b); \ 6921 av = float ## s ## _val(a); \ 6922 bv = float ## s ## _val(b); \ 6923 if (ismag) { \ 6924 aav = float ## s ## _abs(av); \ 6925 abv = float ## s ## _abs(bv); \ 6926 if (aav != abv) { \ 6927 if (ismin) { \ 6928 return (aav < abv) ? a : b; \ 6929 } else { \ 6930 return (aav < abv) ? b : a; \ 6931 } \ 6932 } \ 6933 } \ 6934 if (aSign != bSign) { \ 6935 if (ismin) { \ 6936 return aSign ? a : b; \ 6937 } else { \ 6938 return aSign ? b : a; \ 6939 } \ 6940 } else { \ 6941 if (ismin) { \ 6942 return (aSign ^ (av < bv)) ? a : b; \ 6943 } else { \ 6944 return (aSign ^ (av < bv)) ? 
b : a; \ 6945 } \ 6946 } \ 6947 } \ 6948 \ 6949 float ## s float ## s ## _min(float ## s a, float ## s b, \ 6950 float_status *status) \ 6951 { \ 6952 return float ## s ## _minmax(a, b, 1, 0, 0, status); \ 6953 } \ 6954 \ 6955 float ## s float ## s ## _max(float ## s a, float ## s b, \ 6956 float_status *status) \ 6957 { \ 6958 return float ## s ## _minmax(a, b, 0, 0, 0, status); \ 6959 } \ 6960 \ 6961 float ## s float ## s ## _minnum(float ## s a, float ## s b, \ 6962 float_status *status) \ 6963 { \ 6964 return float ## s ## _minmax(a, b, 1, 1, 0, status); \ 6965 } \ 6966 \ 6967 float ## s float ## s ## _maxnum(float ## s a, float ## s b, \ 6968 float_status *status) \ 6969 { \ 6970 return float ## s ## _minmax(a, b, 0, 1, 0, status); \ 6971 } \ 6972 \ 6973 float ## s float ## s ## _minnummag(float ## s a, float ## s b, \ 6974 float_status *status) \ 6975 { \ 6976 return float ## s ## _minmax(a, b, 1, 1, 1, status); \ 6977 } \ 6978 \ 6979 float ## s float ## s ## _maxnummag(float ## s a, float ## s b, \ 6980 float_status *status) \ 6981 { \ 6982 return float ## s ## _minmax(a, b, 0, 1, 1, status); \ 6983 } 6984 6985 MINMAX(32) 6986 MINMAX(64) 6987 6988 6989 /* Multiply A by 2 raised to the power N. 
*/ 6990 float32 float32_scalbn(float32 a, int n, float_status *status) 6991 { 6992 flag aSign; 6993 int16_t aExp; 6994 uint32_t aSig; 6995 6996 a = float32_squash_input_denormal(a, status); 6997 aSig = extractFloat32Frac( a ); 6998 aExp = extractFloat32Exp( a ); 6999 aSign = extractFloat32Sign( a ); 7000 7001 if ( aExp == 0xFF ) { 7002 if ( aSig ) { 7003 return propagateFloat32NaN(a, a, status); 7004 } 7005 return a; 7006 } 7007 if (aExp != 0) { 7008 aSig |= 0x00800000; 7009 } else if (aSig == 0) { 7010 return a; 7011 } else { 7012 aExp++; 7013 } 7014 7015 if (n > 0x200) { 7016 n = 0x200; 7017 } else if (n < -0x200) { 7018 n = -0x200; 7019 } 7020 7021 aExp += n - 1; 7022 aSig <<= 7; 7023 return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status); 7024 } 7025 7026 float64 float64_scalbn(float64 a, int n, float_status *status) 7027 { 7028 flag aSign; 7029 int16_t aExp; 7030 uint64_t aSig; 7031 7032 a = float64_squash_input_denormal(a, status); 7033 aSig = extractFloat64Frac( a ); 7034 aExp = extractFloat64Exp( a ); 7035 aSign = extractFloat64Sign( a ); 7036 7037 if ( aExp == 0x7FF ) { 7038 if ( aSig ) { 7039 return propagateFloat64NaN(a, a, status); 7040 } 7041 return a; 7042 } 7043 if (aExp != 0) { 7044 aSig |= LIT64( 0x0010000000000000 ); 7045 } else if (aSig == 0) { 7046 return a; 7047 } else { 7048 aExp++; 7049 } 7050 7051 if (n > 0x1000) { 7052 n = 0x1000; 7053 } else if (n < -0x1000) { 7054 n = -0x1000; 7055 } 7056 7057 aExp += n - 1; 7058 aSig <<= 10; 7059 return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status); 7060 } 7061 7062 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7063 { 7064 flag aSign; 7065 int32_t aExp; 7066 uint64_t aSig; 7067 7068 if (floatx80_invalid_encoding(a)) { 7069 float_raise(float_flag_invalid, status); 7070 return floatx80_default_nan(status); 7071 } 7072 aSig = extractFloatx80Frac( a ); 7073 aExp = extractFloatx80Exp( a ); 7074 aSign = extractFloatx80Sign( a ); 7075 7076 if ( aExp == 0x7FFF ) { 7077 if ( 
aSig<<1 ) { 7078 return propagateFloatx80NaN(a, a, status); 7079 } 7080 return a; 7081 } 7082 7083 if (aExp == 0) { 7084 if (aSig == 0) { 7085 return a; 7086 } 7087 aExp++; 7088 } 7089 7090 if (n > 0x10000) { 7091 n = 0x10000; 7092 } else if (n < -0x10000) { 7093 n = -0x10000; 7094 } 7095 7096 aExp += n; 7097 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7098 aSign, aExp, aSig, 0, status); 7099 } 7100 7101 float128 float128_scalbn(float128 a, int n, float_status *status) 7102 { 7103 flag aSign; 7104 int32_t aExp; 7105 uint64_t aSig0, aSig1; 7106 7107 aSig1 = extractFloat128Frac1( a ); 7108 aSig0 = extractFloat128Frac0( a ); 7109 aExp = extractFloat128Exp( a ); 7110 aSign = extractFloat128Sign( a ); 7111 if ( aExp == 0x7FFF ) { 7112 if ( aSig0 | aSig1 ) { 7113 return propagateFloat128NaN(a, a, status); 7114 } 7115 return a; 7116 } 7117 if (aExp != 0) { 7118 aSig0 |= LIT64( 0x0001000000000000 ); 7119 } else if (aSig0 == 0 && aSig1 == 0) { 7120 return a; 7121 } else { 7122 aExp++; 7123 } 7124 7125 if (n > 0x10000) { 7126 n = 0x10000; 7127 } else if (n < -0x10000) { 7128 n = -0x10000; 7129 } 7130 7131 aExp += n - 1; 7132 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7133 , status); 7134 7135 } 7136