1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include "qemu/bitops.h" 87 #include "fpu/softfloat.h" 88 89 /* We only need stdlib for abort() */ 90 91 /*---------------------------------------------------------------------------- 92 | Primitive arithmetic functions, including multi-word arithmetic, and 93 | division and square root approximations. (Can be specialized to target if 94 | desired.) 95 *----------------------------------------------------------------------------*/ 96 #include "softfloat-macros.h" 97 98 /*---------------------------------------------------------------------------- 99 | Functions and definitions to determine: (1) whether tininess for underflow 100 | is detected before or after rounding by default, (2) what (if anything) 101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 103 | are propagated from function inputs to output. These details are target- 104 | specific. 
105 *----------------------------------------------------------------------------*/ 106 #include "softfloat-specialize.h" 107 108 /*---------------------------------------------------------------------------- 109 | Returns the fraction bits of the half-precision floating-point value `a'. 110 *----------------------------------------------------------------------------*/ 111 112 static inline uint32_t extractFloat16Frac(float16 a) 113 { 114 return float16_val(a) & 0x3ff; 115 } 116 117 /*---------------------------------------------------------------------------- 118 | Returns the exponent bits of the half-precision floating-point value `a'. 119 *----------------------------------------------------------------------------*/ 120 121 static inline int extractFloat16Exp(float16 a) 122 { 123 return (float16_val(a) >> 10) & 0x1f; 124 } 125 126 /*---------------------------------------------------------------------------- 127 | Returns the sign bit of the single-precision floating-point value `a'. 128 *----------------------------------------------------------------------------*/ 129 130 static inline flag extractFloat16Sign(float16 a) 131 { 132 return float16_val(a)>>15; 133 } 134 135 /*---------------------------------------------------------------------------- 136 | Returns the fraction bits of the single-precision floating-point value `a'. 137 *----------------------------------------------------------------------------*/ 138 139 static inline uint32_t extractFloat32Frac(float32 a) 140 { 141 return float32_val(a) & 0x007FFFFF; 142 } 143 144 /*---------------------------------------------------------------------------- 145 | Returns the exponent bits of the single-precision floating-point value `a'. 
146 *----------------------------------------------------------------------------*/ 147 148 static inline int extractFloat32Exp(float32 a) 149 { 150 return (float32_val(a) >> 23) & 0xFF; 151 } 152 153 /*---------------------------------------------------------------------------- 154 | Returns the sign bit of the single-precision floating-point value `a'. 155 *----------------------------------------------------------------------------*/ 156 157 static inline flag extractFloat32Sign(float32 a) 158 { 159 return float32_val(a) >> 31; 160 } 161 162 /*---------------------------------------------------------------------------- 163 | Returns the fraction bits of the double-precision floating-point value `a'. 164 *----------------------------------------------------------------------------*/ 165 166 static inline uint64_t extractFloat64Frac(float64 a) 167 { 168 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF); 169 } 170 171 /*---------------------------------------------------------------------------- 172 | Returns the exponent bits of the double-precision floating-point value `a'. 173 *----------------------------------------------------------------------------*/ 174 175 static inline int extractFloat64Exp(float64 a) 176 { 177 return (float64_val(a) >> 52) & 0x7FF; 178 } 179 180 /*---------------------------------------------------------------------------- 181 | Returns the sign bit of the double-precision floating-point value `a'. 182 *----------------------------------------------------------------------------*/ 183 184 static inline flag extractFloat64Sign(float64 a) 185 { 186 return float64_val(a) >> 63; 187 } 188 189 /* 190 * Classify a floating point number. Everything above float_class_qnan 191 * is a NaN so cls >= float_class_qnan is any NaN. 
192 */ 193 194 typedef enum __attribute__ ((__packed__)) { 195 float_class_unclassified, 196 float_class_zero, 197 float_class_normal, 198 float_class_inf, 199 float_class_qnan, /* all NaNs from here */ 200 float_class_snan, 201 float_class_dnan, 202 float_class_msnan, /* maybe silenced */ 203 } FloatClass; 204 205 /* 206 * Structure holding all of the decomposed parts of a float. The 207 * exponent is unbiased and the fraction is normalized. All 208 * calculations are done with a 64 bit fraction and then rounded as 209 * appropriate for the final format. 210 * 211 * Thanks to the packed FloatClass a decent compiler should be able to 212 * fit the whole structure into registers and avoid using the stack 213 * for parameter passing. 214 */ 215 216 typedef struct { 217 uint64_t frac; 218 int32_t exp; 219 FloatClass cls; 220 bool sign; 221 } FloatParts; 222 223 #define DECOMPOSED_BINARY_POINT (64 - 2) 224 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) 225 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1) 226 227 /* Structure holding all of the relevant parameters for a format. 
228 * exp_size: the size of the exponent field 229 * exp_bias: the offset applied to the exponent field 230 * exp_max: the maximum normalised exponent 231 * frac_size: the size of the fraction field 232 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 233 * The following are computed based the size of fraction 234 * frac_lsb: least significant bit of fraction 235 * fram_lsbm1: the bit bellow the least significant bit (for rounding) 236 * round_mask/roundeven_mask: masks used for rounding 237 */ 238 typedef struct { 239 int exp_size; 240 int exp_bias; 241 int exp_max; 242 int frac_size; 243 int frac_shift; 244 uint64_t frac_lsb; 245 uint64_t frac_lsbm1; 246 uint64_t round_mask; 247 uint64_t roundeven_mask; 248 } FloatFmt; 249 250 /* Expand fields based on the size of exponent and fraction */ 251 #define FLOAT_PARAMS(E, F) \ 252 .exp_size = E, \ 253 .exp_bias = ((1 << E) - 1) >> 1, \ 254 .exp_max = (1 << E) - 1, \ 255 .frac_size = F, \ 256 .frac_shift = DECOMPOSED_BINARY_POINT - F, \ 257 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \ 258 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \ 259 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \ 260 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1 261 262 static const FloatFmt float16_params = { 263 FLOAT_PARAMS(5, 10) 264 }; 265 266 static const FloatFmt float32_params = { 267 FLOAT_PARAMS(8, 23) 268 }; 269 270 static const FloatFmt float64_params = { 271 FLOAT_PARAMS(11, 52) 272 }; 273 274 /* Unpack a float to parts, but do not canonicalize. 
*/ 275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) 276 { 277 const int sign_pos = fmt.frac_size + fmt.exp_size; 278 279 return (FloatParts) { 280 .cls = float_class_unclassified, 281 .sign = extract64(raw, sign_pos, 1), 282 .exp = extract64(raw, fmt.frac_size, fmt.exp_size), 283 .frac = extract64(raw, 0, fmt.frac_size), 284 }; 285 } 286 287 static inline FloatParts float16_unpack_raw(float16 f) 288 { 289 return unpack_raw(float16_params, f); 290 } 291 292 static inline FloatParts float32_unpack_raw(float32 f) 293 { 294 return unpack_raw(float32_params, f); 295 } 296 297 static inline FloatParts float64_unpack_raw(float64 f) 298 { 299 return unpack_raw(float64_params, f); 300 } 301 302 /* Pack a float from parts, but do not canonicalize. */ 303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p) 304 { 305 const int sign_pos = fmt.frac_size + fmt.exp_size; 306 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp); 307 return deposit64(ret, sign_pos, 1, p.sign); 308 } 309 310 static inline float16 float16_pack_raw(FloatParts p) 311 { 312 return make_float16(pack_raw(float16_params, p)); 313 } 314 315 static inline float32 float32_pack_raw(FloatParts p) 316 { 317 return make_float32(pack_raw(float32_params, p)); 318 } 319 320 static inline float64 float64_pack_raw(FloatParts p) 321 { 322 return make_float64(pack_raw(float64_params, p)); 323 } 324 325 /* Canonicalize EXP and FRAC, setting CLS. 
*/ 326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm, 327 float_status *status) 328 { 329 if (part.exp == parm->exp_max) { 330 if (part.frac == 0) { 331 part.cls = float_class_inf; 332 } else { 333 #ifdef NO_SIGNALING_NANS 334 part.cls = float_class_qnan; 335 #else 336 int64_t msb = part.frac << (parm->frac_shift + 2); 337 if ((msb < 0) == status->snan_bit_is_one) { 338 part.cls = float_class_snan; 339 } else { 340 part.cls = float_class_qnan; 341 } 342 #endif 343 } 344 } else if (part.exp == 0) { 345 if (likely(part.frac == 0)) { 346 part.cls = float_class_zero; 347 } else if (status->flush_inputs_to_zero) { 348 float_raise(float_flag_input_denormal, status); 349 part.cls = float_class_zero; 350 part.frac = 0; 351 } else { 352 int shift = clz64(part.frac) - 1; 353 part.cls = float_class_normal; 354 part.exp = parm->frac_shift - parm->exp_bias - shift + 1; 355 part.frac <<= shift; 356 } 357 } else { 358 part.cls = float_class_normal; 359 part.exp -= parm->exp_bias; 360 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift); 361 } 362 return part; 363 } 364 365 /* Round and uncanonicalize a floating-point number by parts. There 366 * are FRAC_SHIFT bits that may require rounding at the bottom of the 367 * fraction; these bits will be removed. The exponent will be biased 368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0]. 
369 */ 370 371 static FloatParts round_canonical(FloatParts p, float_status *s, 372 const FloatFmt *parm) 373 { 374 const uint64_t frac_lsbm1 = parm->frac_lsbm1; 375 const uint64_t round_mask = parm->round_mask; 376 const uint64_t roundeven_mask = parm->roundeven_mask; 377 const int exp_max = parm->exp_max; 378 const int frac_shift = parm->frac_shift; 379 uint64_t frac, inc; 380 int exp, flags = 0; 381 bool overflow_norm; 382 383 frac = p.frac; 384 exp = p.exp; 385 386 switch (p.cls) { 387 case float_class_normal: 388 switch (s->float_rounding_mode) { 389 case float_round_nearest_even: 390 overflow_norm = false; 391 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 392 break; 393 case float_round_ties_away: 394 overflow_norm = false; 395 inc = frac_lsbm1; 396 break; 397 case float_round_to_zero: 398 overflow_norm = true; 399 inc = 0; 400 break; 401 case float_round_up: 402 inc = p.sign ? 0 : round_mask; 403 overflow_norm = p.sign; 404 break; 405 case float_round_down: 406 inc = p.sign ? round_mask : 0; 407 overflow_norm = !p.sign; 408 break; 409 default: 410 g_assert_not_reached(); 411 } 412 413 exp += parm->exp_bias; 414 if (likely(exp > 0)) { 415 if (frac & round_mask) { 416 flags |= float_flag_inexact; 417 frac += inc; 418 if (frac & DECOMPOSED_OVERFLOW_BIT) { 419 frac >>= 1; 420 exp++; 421 } 422 } 423 frac >>= frac_shift; 424 425 if (unlikely(exp >= exp_max)) { 426 flags |= float_flag_overflow | float_flag_inexact; 427 if (overflow_norm) { 428 exp = exp_max - 1; 429 frac = -1; 430 } else { 431 p.cls = float_class_inf; 432 goto do_inf; 433 } 434 } 435 } else if (s->flush_to_zero) { 436 flags |= float_flag_output_denormal; 437 p.cls = float_class_zero; 438 goto do_zero; 439 } else { 440 bool is_tiny = (s->float_detect_tininess 441 == float_tininess_before_rounding) 442 || (exp < 0) 443 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT); 444 445 shift64RightJamming(frac, 1 - exp, &frac); 446 if (frac & round_mask) { 447 /* Need to recompute round-to-even. 
*/ 448 if (s->float_rounding_mode == float_round_nearest_even) { 449 inc = ((frac & roundeven_mask) != frac_lsbm1 450 ? frac_lsbm1 : 0); 451 } 452 flags |= float_flag_inexact; 453 frac += inc; 454 } 455 456 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0); 457 frac >>= frac_shift; 458 459 if (is_tiny && (flags & float_flag_inexact)) { 460 flags |= float_flag_underflow; 461 } 462 if (exp == 0 && frac == 0) { 463 p.cls = float_class_zero; 464 } 465 } 466 break; 467 468 case float_class_zero: 469 do_zero: 470 exp = 0; 471 frac = 0; 472 break; 473 474 case float_class_inf: 475 do_inf: 476 exp = exp_max; 477 frac = 0; 478 break; 479 480 case float_class_qnan: 481 case float_class_snan: 482 exp = exp_max; 483 break; 484 485 default: 486 g_assert_not_reached(); 487 } 488 489 float_raise(flags, s); 490 p.exp = exp; 491 p.frac = frac; 492 return p; 493 } 494 495 static FloatParts float16_unpack_canonical(float16 f, float_status *s) 496 { 497 return canonicalize(float16_unpack_raw(f), &float16_params, s); 498 } 499 500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s) 501 { 502 switch (p.cls) { 503 case float_class_dnan: 504 return float16_default_nan(s); 505 case float_class_msnan: 506 return float16_maybe_silence_nan(float16_pack_raw(p), s); 507 default: 508 p = round_canonical(p, s, &float16_params); 509 return float16_pack_raw(p); 510 } 511 } 512 513 static FloatParts float32_unpack_canonical(float32 f, float_status *s) 514 { 515 return canonicalize(float32_unpack_raw(f), &float32_params, s); 516 } 517 518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s) 519 { 520 switch (p.cls) { 521 case float_class_dnan: 522 return float32_default_nan(s); 523 case float_class_msnan: 524 return float32_maybe_silence_nan(float32_pack_raw(p), s); 525 default: 526 p = round_canonical(p, s, &float32_params); 527 return float32_pack_raw(p); 528 } 529 } 530 531 static FloatParts float64_unpack_canonical(float64 f, float_status *s) 532 { 533 
return canonicalize(float64_unpack_raw(f), &float64_params, s); 534 } 535 536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s) 537 { 538 switch (p.cls) { 539 case float_class_dnan: 540 return float64_default_nan(s); 541 case float_class_msnan: 542 return float64_maybe_silence_nan(float64_pack_raw(p), s); 543 default: 544 p = round_canonical(p, s, &float64_params); 545 return float64_pack_raw(p); 546 } 547 } 548 549 /* Simple helpers for checking if what NaN we have */ 550 static bool is_nan(FloatClass c) 551 { 552 return unlikely(c >= float_class_qnan); 553 } 554 static bool is_snan(FloatClass c) 555 { 556 return c == float_class_snan; 557 } 558 static bool is_qnan(FloatClass c) 559 { 560 return c == float_class_qnan; 561 } 562 563 static FloatParts return_nan(FloatParts a, float_status *s) 564 { 565 switch (a.cls) { 566 case float_class_snan: 567 s->float_exception_flags |= float_flag_invalid; 568 a.cls = float_class_msnan; 569 /* fall through */ 570 case float_class_qnan: 571 if (s->default_nan_mode) { 572 a.cls = float_class_dnan; 573 } 574 break; 575 576 default: 577 g_assert_not_reached(); 578 } 579 return a; 580 } 581 582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s) 583 { 584 if (is_snan(a.cls) || is_snan(b.cls)) { 585 s->float_exception_flags |= float_flag_invalid; 586 } 587 588 if (s->default_nan_mode) { 589 a.cls = float_class_dnan; 590 } else { 591 if (pickNaN(is_qnan(a.cls), is_snan(a.cls), 592 is_qnan(b.cls), is_snan(b.cls), 593 a.frac > b.frac || 594 (a.frac == b.frac && a.sign < b.sign))) { 595 a = b; 596 } 597 a.cls = float_class_msnan; 598 } 599 return a; 600 } 601 602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c, 603 bool inf_zero, float_status *s) 604 { 605 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) { 606 s->float_exception_flags |= float_flag_invalid; 607 } 608 609 if (s->default_nan_mode) { 610 a.cls = float_class_dnan; 611 } else { 612 switch 
(pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls), 613 is_qnan(b.cls), is_snan(b.cls), 614 is_qnan(c.cls), is_snan(c.cls), 615 inf_zero, s)) { 616 case 0: 617 break; 618 case 1: 619 a = b; 620 break; 621 case 2: 622 a = c; 623 break; 624 case 3: 625 a.cls = float_class_dnan; 626 return a; 627 default: 628 g_assert_not_reached(); 629 } 630 631 a.cls = float_class_msnan; 632 } 633 return a; 634 } 635 636 /* 637 * Returns the result of adding or subtracting the values of the 638 * floating-point values `a' and `b'. The operation is performed 639 * according to the IEC/IEEE Standard for Binary Floating-Point 640 * Arithmetic. 641 */ 642 643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract, 644 float_status *s) 645 { 646 bool a_sign = a.sign; 647 bool b_sign = b.sign ^ subtract; 648 649 if (a_sign != b_sign) { 650 /* Subtraction */ 651 652 if (a.cls == float_class_normal && b.cls == float_class_normal) { 653 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) { 654 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 655 a.frac = a.frac - b.frac; 656 } else { 657 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 658 a.frac = b.frac - a.frac; 659 a.exp = b.exp; 660 a_sign ^= 1; 661 } 662 663 if (a.frac == 0) { 664 a.cls = float_class_zero; 665 a.sign = s->float_rounding_mode == float_round_down; 666 } else { 667 int shift = clz64(a.frac) - 1; 668 a.frac = a.frac << shift; 669 a.exp = a.exp - shift; 670 a.sign = a_sign; 671 } 672 return a; 673 } 674 if (is_nan(a.cls) || is_nan(b.cls)) { 675 return pick_nan(a, b, s); 676 } 677 if (a.cls == float_class_inf) { 678 if (b.cls == float_class_inf) { 679 float_raise(float_flag_invalid, s); 680 a.cls = float_class_dnan; 681 } 682 return a; 683 } 684 if (a.cls == float_class_zero && b.cls == float_class_zero) { 685 a.sign = s->float_rounding_mode == float_round_down; 686 return a; 687 } 688 if (a.cls == float_class_zero || b.cls == float_class_inf) { 689 b.sign = a_sign ^ 1; 690 return b; 691 } 692 if 
(b.cls == float_class_zero) { 693 return a; 694 } 695 } else { 696 /* Addition */ 697 if (a.cls == float_class_normal && b.cls == float_class_normal) { 698 if (a.exp > b.exp) { 699 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 700 } else if (a.exp < b.exp) { 701 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 702 a.exp = b.exp; 703 } 704 a.frac += b.frac; 705 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 706 a.frac >>= 1; 707 a.exp += 1; 708 } 709 return a; 710 } 711 if (is_nan(a.cls) || is_nan(b.cls)) { 712 return pick_nan(a, b, s); 713 } 714 if (a.cls == float_class_inf || b.cls == float_class_zero) { 715 return a; 716 } 717 if (b.cls == float_class_inf || a.cls == float_class_zero) { 718 b.sign = b_sign; 719 return b; 720 } 721 } 722 g_assert_not_reached(); 723 } 724 725 /* 726 * Returns the result of adding or subtracting the floating-point 727 * values `a' and `b'. The operation is performed according to the 728 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 729 */ 730 731 float16 __attribute__((flatten)) float16_add(float16 a, float16 b, 732 float_status *status) 733 { 734 FloatParts pa = float16_unpack_canonical(a, status); 735 FloatParts pb = float16_unpack_canonical(b, status); 736 FloatParts pr = addsub_floats(pa, pb, false, status); 737 738 return float16_round_pack_canonical(pr, status); 739 } 740 741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b, 742 float_status *status) 743 { 744 FloatParts pa = float32_unpack_canonical(a, status); 745 FloatParts pb = float32_unpack_canonical(b, status); 746 FloatParts pr = addsub_floats(pa, pb, false, status); 747 748 return float32_round_pack_canonical(pr, status); 749 } 750 751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b, 752 float_status *status) 753 { 754 FloatParts pa = float64_unpack_canonical(a, status); 755 FloatParts pb = float64_unpack_canonical(b, status); 756 FloatParts pr = addsub_floats(pa, pb, false, status); 757 758 return 
float64_round_pack_canonical(pr, status); 759 } 760 761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b, 762 float_status *status) 763 { 764 FloatParts pa = float16_unpack_canonical(a, status); 765 FloatParts pb = float16_unpack_canonical(b, status); 766 FloatParts pr = addsub_floats(pa, pb, true, status); 767 768 return float16_round_pack_canonical(pr, status); 769 } 770 771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b, 772 float_status *status) 773 { 774 FloatParts pa = float32_unpack_canonical(a, status); 775 FloatParts pb = float32_unpack_canonical(b, status); 776 FloatParts pr = addsub_floats(pa, pb, true, status); 777 778 return float32_round_pack_canonical(pr, status); 779 } 780 781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b, 782 float_status *status) 783 { 784 FloatParts pa = float64_unpack_canonical(a, status); 785 FloatParts pb = float64_unpack_canonical(b, status); 786 FloatParts pr = addsub_floats(pa, pb, true, status); 787 788 return float64_round_pack_canonical(pr, status); 789 } 790 791 /* 792 * Returns the result of multiplying the floating-point values `a' and 793 * `b'. The operation is performed according to the IEC/IEEE Standard 794 * for Binary Floating-Point Arithmetic. 
795 */ 796 797 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s) 798 { 799 bool sign = a.sign ^ b.sign; 800 801 if (a.cls == float_class_normal && b.cls == float_class_normal) { 802 uint64_t hi, lo; 803 int exp = a.exp + b.exp; 804 805 mul64To128(a.frac, b.frac, &hi, &lo); 806 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 807 if (lo & DECOMPOSED_OVERFLOW_BIT) { 808 shift64RightJamming(lo, 1, &lo); 809 exp += 1; 810 } 811 812 /* Re-use a */ 813 a.exp = exp; 814 a.sign = sign; 815 a.frac = lo; 816 return a; 817 } 818 /* handle all the NaN cases */ 819 if (is_nan(a.cls) || is_nan(b.cls)) { 820 return pick_nan(a, b, s); 821 } 822 /* Inf * Zero == NaN */ 823 if ((a.cls == float_class_inf && b.cls == float_class_zero) || 824 (a.cls == float_class_zero && b.cls == float_class_inf)) { 825 s->float_exception_flags |= float_flag_invalid; 826 a.cls = float_class_dnan; 827 a.sign = sign; 828 return a; 829 } 830 /* Multiply by 0 or Inf */ 831 if (a.cls == float_class_inf || a.cls == float_class_zero) { 832 a.sign = sign; 833 return a; 834 } 835 if (b.cls == float_class_inf || b.cls == float_class_zero) { 836 b.sign = sign; 837 return b; 838 } 839 g_assert_not_reached(); 840 } 841 842 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b, 843 float_status *status) 844 { 845 FloatParts pa = float16_unpack_canonical(a, status); 846 FloatParts pb = float16_unpack_canonical(b, status); 847 FloatParts pr = mul_floats(pa, pb, status); 848 849 return float16_round_pack_canonical(pr, status); 850 } 851 852 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b, 853 float_status *status) 854 { 855 FloatParts pa = float32_unpack_canonical(a, status); 856 FloatParts pb = float32_unpack_canonical(b, status); 857 FloatParts pr = mul_floats(pa, pb, status); 858 859 return float32_round_pack_canonical(pr, status); 860 } 861 862 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b, 863 float_status *status) 864 { 865 
FloatParts pa = float64_unpack_canonical(a, status); 866 FloatParts pb = float64_unpack_canonical(b, status); 867 FloatParts pr = mul_floats(pa, pb, status); 868 869 return float64_round_pack_canonical(pr, status); 870 } 871 872 /* 873 * Returns the result of multiplying the floating-point values `a' and 874 * `b' then adding 'c', with no intermediate rounding step after the 875 * multiplication. The operation is performed according to the 876 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008. 877 * The flags argument allows the caller to select negation of the 878 * addend, the intermediate product, or the final result. (The 879 * difference between this and having the caller do a separate 880 * negation is that negating externally will flip the sign bit on 881 * NaNs.) 882 */ 883 884 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c, 885 int flags, float_status *s) 886 { 887 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) == 888 ((1 << float_class_inf) | (1 << float_class_zero)); 889 bool p_sign; 890 bool sign_flip = flags & float_muladd_negate_result; 891 FloatClass p_class; 892 uint64_t hi, lo; 893 int p_exp; 894 895 /* It is implementation-defined whether the cases of (0,inf,qnan) 896 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 897 * they return if they do), so we have to hand this information 898 * off to the target-specific pick-a-NaN routine. 
899 */ 900 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) { 901 return pick_nan_muladd(a, b, c, inf_zero, s); 902 } 903 904 if (inf_zero) { 905 s->float_exception_flags |= float_flag_invalid; 906 a.cls = float_class_dnan; 907 return a; 908 } 909 910 if (flags & float_muladd_negate_c) { 911 c.sign ^= 1; 912 } 913 914 p_sign = a.sign ^ b.sign; 915 916 if (flags & float_muladd_negate_product) { 917 p_sign ^= 1; 918 } 919 920 if (a.cls == float_class_inf || b.cls == float_class_inf) { 921 p_class = float_class_inf; 922 } else if (a.cls == float_class_zero || b.cls == float_class_zero) { 923 p_class = float_class_zero; 924 } else { 925 p_class = float_class_normal; 926 } 927 928 if (c.cls == float_class_inf) { 929 if (p_class == float_class_inf && p_sign != c.sign) { 930 s->float_exception_flags |= float_flag_invalid; 931 a.cls = float_class_dnan; 932 } else { 933 a.cls = float_class_inf; 934 a.sign = c.sign ^ sign_flip; 935 } 936 return a; 937 } 938 939 if (p_class == float_class_inf) { 940 a.cls = float_class_inf; 941 a.sign = p_sign ^ sign_flip; 942 return a; 943 } 944 945 if (p_class == float_class_zero) { 946 if (c.cls == float_class_zero) { 947 if (p_sign != c.sign) { 948 p_sign = s->float_rounding_mode == float_round_down; 949 } 950 c.sign = p_sign; 951 } else if (flags & float_muladd_halve_result) { 952 c.exp -= 1; 953 } 954 c.sign ^= sign_flip; 955 return c; 956 } 957 958 /* a & b should be normals now... */ 959 assert(a.cls == float_class_normal && 960 b.cls == float_class_normal); 961 962 p_exp = a.exp + b.exp; 963 964 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit 965 * result. 
966 */ 967 mul64To128(a.frac, b.frac, &hi, &lo); 968 /* binary point now at bit 124 */ 969 970 /* check for overflow */ 971 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) { 972 shift128RightJamming(hi, lo, 1, &hi, &lo); 973 p_exp += 1; 974 } 975 976 /* + add/sub */ 977 if (c.cls == float_class_zero) { 978 /* move binary point back to 62 */ 979 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 980 } else { 981 int exp_diff = p_exp - c.exp; 982 if (p_sign == c.sign) { 983 /* Addition */ 984 if (exp_diff <= 0) { 985 shift128RightJamming(hi, lo, 986 DECOMPOSED_BINARY_POINT - exp_diff, 987 &hi, &lo); 988 lo += c.frac; 989 p_exp = c.exp; 990 } else { 991 uint64_t c_hi, c_lo; 992 /* shift c to the same binary point as the product (124) */ 993 c_hi = c.frac >> 2; 994 c_lo = 0; 995 shift128RightJamming(c_hi, c_lo, 996 exp_diff, 997 &c_hi, &c_lo); 998 add128(hi, lo, c_hi, c_lo, &hi, &lo); 999 /* move binary point back to 62 */ 1000 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1001 } 1002 1003 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1004 shift64RightJamming(lo, 1, &lo); 1005 p_exp += 1; 1006 } 1007 1008 } else { 1009 /* Subtraction */ 1010 uint64_t c_hi, c_lo; 1011 /* make C binary point match product at bit 124 */ 1012 c_hi = c.frac >> 2; 1013 c_lo = 0; 1014 1015 if (exp_diff <= 0) { 1016 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo); 1017 if (exp_diff == 0 1018 && 1019 (hi > c_hi || (hi == c_hi && lo >= c_lo))) { 1020 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1021 } else { 1022 sub128(c_hi, c_lo, hi, lo, &hi, &lo); 1023 p_sign ^= 1; 1024 p_exp = c.exp; 1025 } 1026 } else { 1027 shift128RightJamming(c_hi, c_lo, 1028 exp_diff, 1029 &c_hi, &c_lo); 1030 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1031 } 1032 1033 if (hi == 0 && lo == 0) { 1034 a.cls = float_class_zero; 1035 a.sign = s->float_rounding_mode == float_round_down; 1036 a.sign ^= sign_flip; 1037 return a; 1038 } else { 1039 int shift; 1040 if (hi != 0) { 1041 shift = clz64(hi); 
1042 } else { 1043 shift = clz64(lo) + 64; 1044 } 1045 /* Normalizing to a binary point of 124 is the 1046 correct adjust for the exponent. However since we're 1047 shifting, we might as well put the binary point back 1048 at 62 where we really want it. Therefore shift as 1049 if we're leaving 1 bit at the top of the word, but 1050 adjust the exponent as if we're leaving 3 bits. */ 1051 shift -= 1; 1052 if (shift >= 64) { 1053 lo = lo << (shift - 64); 1054 } else { 1055 hi = (hi << shift) | (lo >> (64 - shift)); 1056 lo = hi | ((lo << shift) != 0); 1057 } 1058 p_exp -= shift - 2; 1059 } 1060 } 1061 } 1062 1063 if (flags & float_muladd_halve_result) { 1064 p_exp -= 1; 1065 } 1066 1067 /* finally prepare our result */ 1068 a.cls = float_class_normal; 1069 a.sign = p_sign ^ sign_flip; 1070 a.exp = p_exp; 1071 a.frac = lo; 1072 1073 return a; 1074 } 1075 1076 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c, 1077 int flags, float_status *status) 1078 { 1079 FloatParts pa = float16_unpack_canonical(a, status); 1080 FloatParts pb = float16_unpack_canonical(b, status); 1081 FloatParts pc = float16_unpack_canonical(c, status); 1082 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1083 1084 return float16_round_pack_canonical(pr, status); 1085 } 1086 1087 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c, 1088 int flags, float_status *status) 1089 { 1090 FloatParts pa = float32_unpack_canonical(a, status); 1091 FloatParts pb = float32_unpack_canonical(b, status); 1092 FloatParts pc = float32_unpack_canonical(c, status); 1093 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1094 1095 return float32_round_pack_canonical(pr, status); 1096 } 1097 1098 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c, 1099 int flags, float_status *status) 1100 { 1101 FloatParts pa = float64_unpack_canonical(a, status); 1102 FloatParts pb = float64_unpack_canonical(b, status); 1103 
FloatParts pc = float64_unpack_canonical(c, status); 1104 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1105 1106 return float64_round_pack_canonical(pr, status); 1107 } 1108 1109 /* 1110 * Returns the result of dividing the floating-point value `a' by the 1111 * corresponding value `b'. The operation is performed according to 1112 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1113 */ 1114 1115 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s) 1116 { 1117 bool sign = a.sign ^ b.sign; 1118 1119 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1120 uint64_t temp_lo, temp_hi; 1121 int exp = a.exp - b.exp; 1122 if (a.frac < b.frac) { 1123 exp -= 1; 1124 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, 1125 &temp_hi, &temp_lo); 1126 } else { 1127 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, 1128 &temp_hi, &temp_lo); 1129 } 1130 /* LSB of quot is set if inexact which roundandpack will use 1131 * to set flags. 
Yet again we re-use a for the result */ 1132 a.frac = div128To64(temp_lo, temp_hi, b.frac); 1133 a.sign = sign; 1134 a.exp = exp; 1135 return a; 1136 } 1137 /* handle all the NaN cases */ 1138 if (is_nan(a.cls) || is_nan(b.cls)) { 1139 return pick_nan(a, b, s); 1140 } 1141 /* 0/0 or Inf/Inf */ 1142 if (a.cls == b.cls 1143 && 1144 (a.cls == float_class_inf || a.cls == float_class_zero)) { 1145 s->float_exception_flags |= float_flag_invalid; 1146 a.cls = float_class_dnan; 1147 return a; 1148 } 1149 /* Div 0 => Inf */ 1150 if (b.cls == float_class_zero) { 1151 s->float_exception_flags |= float_flag_divbyzero; 1152 a.cls = float_class_inf; 1153 a.sign = sign; 1154 return a; 1155 } 1156 /* Inf / x or 0 / x */ 1157 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1158 a.sign = sign; 1159 return a; 1160 } 1161 /* Div by Inf */ 1162 if (b.cls == float_class_inf) { 1163 a.cls = float_class_zero; 1164 a.sign = sign; 1165 return a; 1166 } 1167 g_assert_not_reached(); 1168 } 1169 1170 float16 float16_div(float16 a, float16 b, float_status *status) 1171 { 1172 FloatParts pa = float16_unpack_canonical(a, status); 1173 FloatParts pb = float16_unpack_canonical(b, status); 1174 FloatParts pr = div_floats(pa, pb, status); 1175 1176 return float16_round_pack_canonical(pr, status); 1177 } 1178 1179 float32 float32_div(float32 a, float32 b, float_status *status) 1180 { 1181 FloatParts pa = float32_unpack_canonical(a, status); 1182 FloatParts pb = float32_unpack_canonical(b, status); 1183 FloatParts pr = div_floats(pa, pb, status); 1184 1185 return float32_round_pack_canonical(pr, status); 1186 } 1187 1188 float64 float64_div(float64 a, float64 b, float_status *status) 1189 { 1190 FloatParts pa = float64_unpack_canonical(a, status); 1191 FloatParts pb = float64_unpack_canonical(b, status); 1192 FloatParts pr = div_floats(pa, pb, status); 1193 1194 return float64_round_pack_canonical(pr, status); 1195 } 1196 1197 /* 1198 * Rounds the floating-point value `a' to an integer, 
and returns the 1199 * result as a floating-point value. The operation is performed 1200 * according to the IEC/IEEE Standard for Binary Floating-Point 1201 * Arithmetic. 1202 */ 1203 1204 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s) 1205 { 1206 if (is_nan(a.cls)) { 1207 return return_nan(a, s); 1208 } 1209 1210 switch (a.cls) { 1211 case float_class_zero: 1212 case float_class_inf: 1213 case float_class_qnan: 1214 /* already "integral" */ 1215 break; 1216 case float_class_normal: 1217 if (a.exp >= DECOMPOSED_BINARY_POINT) { 1218 /* already integral */ 1219 break; 1220 } 1221 if (a.exp < 0) { 1222 bool one; 1223 /* all fractional */ 1224 s->float_exception_flags |= float_flag_inexact; 1225 switch (rounding_mode) { 1226 case float_round_nearest_even: 1227 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT; 1228 break; 1229 case float_round_ties_away: 1230 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT; 1231 break; 1232 case float_round_to_zero: 1233 one = false; 1234 break; 1235 case float_round_up: 1236 one = !a.sign; 1237 break; 1238 case float_round_down: 1239 one = a.sign; 1240 break; 1241 default: 1242 g_assert_not_reached(); 1243 } 1244 1245 if (one) { 1246 a.frac = DECOMPOSED_IMPLICIT_BIT; 1247 a.exp = 0; 1248 } else { 1249 a.cls = float_class_zero; 1250 } 1251 } else { 1252 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp; 1253 uint64_t frac_lsbm1 = frac_lsb >> 1; 1254 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb; 1255 uint64_t rnd_mask = rnd_even_mask >> 1; 1256 uint64_t inc; 1257 1258 switch (rounding_mode) { 1259 case float_round_nearest_even: 1260 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 1261 break; 1262 case float_round_ties_away: 1263 inc = frac_lsbm1; 1264 break; 1265 case float_round_to_zero: 1266 inc = 0; 1267 break; 1268 case float_round_up: 1269 inc = a.sign ? 0 : rnd_mask; 1270 break; 1271 case float_round_down: 1272 inc = a.sign ? 
rnd_mask : 0; 1273 break; 1274 default: 1275 g_assert_not_reached(); 1276 } 1277 1278 if (a.frac & rnd_mask) { 1279 s->float_exception_flags |= float_flag_inexact; 1280 a.frac += inc; 1281 a.frac &= ~rnd_mask; 1282 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 1283 a.frac >>= 1; 1284 a.exp++; 1285 } 1286 } 1287 } 1288 break; 1289 default: 1290 g_assert_not_reached(); 1291 } 1292 return a; 1293 } 1294 1295 float16 float16_round_to_int(float16 a, float_status *s) 1296 { 1297 FloatParts pa = float16_unpack_canonical(a, s); 1298 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1299 return float16_round_pack_canonical(pr, s); 1300 } 1301 1302 float32 float32_round_to_int(float32 a, float_status *s) 1303 { 1304 FloatParts pa = float32_unpack_canonical(a, s); 1305 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1306 return float32_round_pack_canonical(pr, s); 1307 } 1308 1309 float64 float64_round_to_int(float64 a, float_status *s) 1310 { 1311 FloatParts pa = float64_unpack_canonical(a, s); 1312 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1313 return float64_round_pack_canonical(pr, s); 1314 } 1315 1316 float64 float64_trunc_to_int(float64 a, float_status *s) 1317 { 1318 FloatParts pa = float64_unpack_canonical(a, s); 1319 FloatParts pr = round_to_int(pa, float_round_to_zero, s); 1320 return float64_round_pack_canonical(pr, s); 1321 } 1322 1323 /*---------------------------------------------------------------------------- 1324 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 1325 | and 7, and returns the properly rounded 32-bit integer corresponding to the 1326 | input. If `zSign' is 1, the input is negated before being converted to an 1327 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 1328 | is simply rounded to an integer, with the inexact exception raised if the 1329 | input cannot be represented exactly as an integer. 
However, if the fixed-
| point input is too large, the invalid exception is raised and the largest
| positive or negative integer is returned.
*----------------------------------------------------------------------------*/

static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
{
    const int8_t mode = status->float_rounding_mode;
    const flag nearest_even = (mode == float_round_nearest_even);
    int8_t bias;       /* added below the binary point before truncating */
    int8_t discarded;  /* the seven fraction bits that get dropped */
    int32_t z;

    if (mode == float_round_nearest_even || mode == float_round_ties_away) {
        bias = 0x40;
    } else if (mode == float_round_to_zero) {
        bias = 0;
    } else if (mode == float_round_up) {
        bias = zSign ? 0 : 0x7f;
    } else if (mode == float_round_down) {
        bias = zSign ? 0x7f : 0;
    } else {
        abort();
    }

    discarded = absZ & 0x7F;
    absZ = (absZ + bias) >> 7;
    if (nearest_even && discarded == 0x40) {
        /* exact tie in nearest-even mode: force the result even */
        absZ &= ~(uint64_t) 1;
    }

    z = absZ;
    if (zSign) {
        z = -z;
    }
    /* out of range if bits remain above 32 or the sign came out wrong */
    if ((absZ >> 32) || (z && ((z < 0) ^ zSign))) {
        float_raise(float_flag_invalid, status);
        return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
    }
    if (discarded) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return z;
}

/*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit integer corresponding to the input.
| If `zSign' is 1, the input is negated before being converted to an integer.
1381 | Ordinarily, the fixed-point input is simply rounded to an integer, with 1382 | the inexact exception raised if the input cannot be represented exactly as 1383 | an integer. However, if the fixed-point input is too large, the invalid 1384 | exception is raised and the largest positive or negative integer is 1385 | returned. 1386 *----------------------------------------------------------------------------*/ 1387 1388 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 1389 float_status *status) 1390 { 1391 int8_t roundingMode; 1392 flag roundNearestEven, increment; 1393 int64_t z; 1394 1395 roundingMode = status->float_rounding_mode; 1396 roundNearestEven = ( roundingMode == float_round_nearest_even ); 1397 switch (roundingMode) { 1398 case float_round_nearest_even: 1399 case float_round_ties_away: 1400 increment = ((int64_t) absZ1 < 0); 1401 break; 1402 case float_round_to_zero: 1403 increment = 0; 1404 break; 1405 case float_round_up: 1406 increment = !zSign && absZ1; 1407 break; 1408 case float_round_down: 1409 increment = zSign && absZ1; 1410 break; 1411 default: 1412 abort(); 1413 } 1414 if ( increment ) { 1415 ++absZ0; 1416 if ( absZ0 == 0 ) goto overflow; 1417 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 1418 } 1419 z = absZ0; 1420 if ( zSign ) z = - z; 1421 if ( z && ( ( z < 0 ) ^ zSign ) ) { 1422 overflow: 1423 float_raise(float_flag_invalid, status); 1424 return 1425 zSign ? 
(int64_t) LIT64( 0x8000000000000000 ) 1426 : LIT64( 0x7FFFFFFFFFFFFFFF ); 1427 } 1428 if (absZ1) { 1429 status->float_exception_flags |= float_flag_inexact; 1430 } 1431 return z; 1432 1433 } 1434 1435 /*---------------------------------------------------------------------------- 1436 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 1437 | `absZ1', with binary point between bits 63 and 64 (between the input words), 1438 | and returns the properly rounded 64-bit unsigned integer corresponding to the 1439 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 1440 | with the inexact exception raised if the input cannot be represented exactly 1441 | as an integer. However, if the fixed-point input is too large, the invalid 1442 | exception is raised and the largest unsigned integer is returned. 1443 *----------------------------------------------------------------------------*/ 1444 1445 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0, 1446 uint64_t absZ1, float_status *status) 1447 { 1448 int8_t roundingMode; 1449 flag roundNearestEven, increment; 1450 1451 roundingMode = status->float_rounding_mode; 1452 roundNearestEven = (roundingMode == float_round_nearest_even); 1453 switch (roundingMode) { 1454 case float_round_nearest_even: 1455 case float_round_ties_away: 1456 increment = ((int64_t)absZ1 < 0); 1457 break; 1458 case float_round_to_zero: 1459 increment = 0; 1460 break; 1461 case float_round_up: 1462 increment = !zSign && absZ1; 1463 break; 1464 case float_round_down: 1465 increment = zSign && absZ1; 1466 break; 1467 default: 1468 abort(); 1469 } 1470 if (increment) { 1471 ++absZ0; 1472 if (absZ0 == 0) { 1473 float_raise(float_flag_invalid, status); 1474 return LIT64(0xFFFFFFFFFFFFFFFF); 1475 } 1476 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 1477 } 1478 1479 if (zSign && absZ0) { 1480 float_raise(float_flag_invalid, status); 1481 return 0; 1482 } 1483 1484 if (absZ1) { 1485 
status->float_exception_flags |= float_flag_inexact; 1486 } 1487 return absZ0; 1488 } 1489 1490 /*---------------------------------------------------------------------------- 1491 | If `a' is denormal and we are in flush-to-zero mode then set the 1492 | input-denormal exception and return zero. Otherwise just return the value. 1493 *----------------------------------------------------------------------------*/ 1494 float32 float32_squash_input_denormal(float32 a, float_status *status) 1495 { 1496 if (status->flush_inputs_to_zero) { 1497 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 1498 float_raise(float_flag_input_denormal, status); 1499 return make_float32(float32_val(a) & 0x80000000); 1500 } 1501 } 1502 return a; 1503 } 1504 1505 /*---------------------------------------------------------------------------- 1506 | Normalizes the subnormal single-precision floating-point value represented 1507 | by the denormalized significand `aSig'. The normalized exponent and 1508 | significand are stored at the locations pointed to by `zExpPtr' and 1509 | `zSigPtr', respectively. 1510 *----------------------------------------------------------------------------*/ 1511 1512 static void 1513 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 1514 { 1515 int8_t shiftCount; 1516 1517 shiftCount = countLeadingZeros32( aSig ) - 8; 1518 *zSigPtr = aSig<<shiftCount; 1519 *zExpPtr = 1 - shiftCount; 1520 1521 } 1522 1523 /*---------------------------------------------------------------------------- 1524 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 1525 | single-precision floating-point value, returning the result. After being 1526 | shifted into the proper positions, the three fields are simply added 1527 | together to form the result. This means that any integer portion of `zSig' 1528 | will be added into the exponent. 
Since a properly normalized significand 1529 | will have an integer portion equal to 1, the `zExp' input should be 1 less 1530 | than the desired result exponent whenever `zSig' is a complete, normalized 1531 | significand. 1532 *----------------------------------------------------------------------------*/ 1533 1534 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig) 1535 { 1536 1537 return make_float32( 1538 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig); 1539 1540 } 1541 1542 /*---------------------------------------------------------------------------- 1543 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1544 | and significand `zSig', and returns the proper single-precision floating- 1545 | point value corresponding to the abstract input. Ordinarily, the abstract 1546 | value is simply rounded and packed into the single-precision format, with 1547 | the inexact exception raised if the abstract input cannot be represented 1548 | exactly. However, if the abstract value is too large, the overflow and 1549 | inexact exceptions are raised and an infinity or maximal finite value is 1550 | returned. If the abstract value is too small, the input value is rounded to 1551 | a subnormal number, and the underflow and inexact exceptions are raised if 1552 | the abstract input cannot be represented exactly as a subnormal single- 1553 | precision floating-point number. 1554 | The input significand `zSig' has its binary point between bits 30 1555 | and 29, which is 7 bits to the left of the usual location. This shifted 1556 | significand must be normalized or smaller. If `zSig' is not normalized, 1557 | `zExp' must be 0; in that case, the result returned is a subnormal number, 1558 | and it must not require rounding. In the usual case that `zSig' is 1559 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 
1560 | The handling of underflow and overflow follows the IEC/IEEE Standard for 1561 | Binary Floating-Point Arithmetic. 1562 *----------------------------------------------------------------------------*/ 1563 1564 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 1565 float_status *status) 1566 { 1567 int8_t roundingMode; 1568 flag roundNearestEven; 1569 int8_t roundIncrement, roundBits; 1570 flag isTiny; 1571 1572 roundingMode = status->float_rounding_mode; 1573 roundNearestEven = ( roundingMode == float_round_nearest_even ); 1574 switch (roundingMode) { 1575 case float_round_nearest_even: 1576 case float_round_ties_away: 1577 roundIncrement = 0x40; 1578 break; 1579 case float_round_to_zero: 1580 roundIncrement = 0; 1581 break; 1582 case float_round_up: 1583 roundIncrement = zSign ? 0 : 0x7f; 1584 break; 1585 case float_round_down: 1586 roundIncrement = zSign ? 0x7f : 0; 1587 break; 1588 default: 1589 abort(); 1590 break; 1591 } 1592 roundBits = zSig & 0x7F; 1593 if ( 0xFD <= (uint16_t) zExp ) { 1594 if ( ( 0xFD < zExp ) 1595 || ( ( zExp == 0xFD ) 1596 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 1597 ) { 1598 float_raise(float_flag_overflow | float_flag_inexact, status); 1599 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 1600 } 1601 if ( zExp < 0 ) { 1602 if (status->flush_to_zero) { 1603 float_raise(float_flag_output_denormal, status); 1604 return packFloat32(zSign, 0, 0); 1605 } 1606 isTiny = 1607 (status->float_detect_tininess 1608 == float_tininess_before_rounding) 1609 || ( zExp < -1 ) 1610 || ( zSig + roundIncrement < 0x80000000 ); 1611 shift32RightJamming( zSig, - zExp, &zSig ); 1612 zExp = 0; 1613 roundBits = zSig & 0x7F; 1614 if (isTiny && roundBits) { 1615 float_raise(float_flag_underflow, status); 1616 } 1617 } 1618 } 1619 if (roundBits) { 1620 status->float_exception_flags |= float_flag_inexact; 1621 } 1622 zSig = ( zSig + roundIncrement )>>7; 1623 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven 
); 1624 if ( zSig == 0 ) zExp = 0; 1625 return packFloat32( zSign, zExp, zSig ); 1626 1627 } 1628 1629 /*---------------------------------------------------------------------------- 1630 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1631 | and significand `zSig', and returns the proper single-precision floating- 1632 | point value corresponding to the abstract input. This routine is just like 1633 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 1634 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 1635 | floating-point exponent. 1636 *----------------------------------------------------------------------------*/ 1637 1638 static float32 1639 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 1640 float_status *status) 1641 { 1642 int8_t shiftCount; 1643 1644 shiftCount = countLeadingZeros32( zSig ) - 1; 1645 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 1646 status); 1647 1648 } 1649 1650 /*---------------------------------------------------------------------------- 1651 | If `a' is denormal and we are in flush-to-zero mode then set the 1652 | input-denormal exception and return zero. Otherwise just return the value. 1653 *----------------------------------------------------------------------------*/ 1654 float64 float64_squash_input_denormal(float64 a, float_status *status) 1655 { 1656 if (status->flush_inputs_to_zero) { 1657 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 1658 float_raise(float_flag_input_denormal, status); 1659 return make_float64(float64_val(a) & (1ULL << 63)); 1660 } 1661 } 1662 return a; 1663 } 1664 1665 /*---------------------------------------------------------------------------- 1666 | Normalizes the subnormal double-precision floating-point value represented 1667 | by the denormalized significand `aSig'. 
The normalized exponent and 1668 | significand are stored at the locations pointed to by `zExpPtr' and 1669 | `zSigPtr', respectively. 1670 *----------------------------------------------------------------------------*/ 1671 1672 static void 1673 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 1674 { 1675 int8_t shiftCount; 1676 1677 shiftCount = countLeadingZeros64( aSig ) - 11; 1678 *zSigPtr = aSig<<shiftCount; 1679 *zExpPtr = 1 - shiftCount; 1680 1681 } 1682 1683 /*---------------------------------------------------------------------------- 1684 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 1685 | double-precision floating-point value, returning the result. After being 1686 | shifted into the proper positions, the three fields are simply added 1687 | together to form the result. This means that any integer portion of `zSig' 1688 | will be added into the exponent. Since a properly normalized significand 1689 | will have an integer portion equal to 1, the `zExp' input should be 1 less 1690 | than the desired result exponent whenever `zSig' is a complete, normalized 1691 | significand. 1692 *----------------------------------------------------------------------------*/ 1693 1694 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 1695 { 1696 1697 return make_float64( 1698 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 1699 1700 } 1701 1702 /*---------------------------------------------------------------------------- 1703 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1704 | and significand `zSig', and returns the proper double-precision floating- 1705 | point value corresponding to the abstract input. Ordinarily, the abstract 1706 | value is simply rounded and packed into the double-precision format, with 1707 | the inexact exception raised if the abstract input cannot be represented 1708 | exactly. 
However, if the abstract value is too large, the overflow and 1709 | inexact exceptions are raised and an infinity or maximal finite value is 1710 | returned. If the abstract value is too small, the input value is rounded to 1711 | a subnormal number, and the underflow and inexact exceptions are raised if 1712 | the abstract input cannot be represented exactly as a subnormal double- 1713 | precision floating-point number. 1714 | The input significand `zSig' has its binary point between bits 62 1715 | and 61, which is 10 bits to the left of the usual location. This shifted 1716 | significand must be normalized or smaller. If `zSig' is not normalized, 1717 | `zExp' must be 0; in that case, the result returned is a subnormal number, 1718 | and it must not require rounding. In the usual case that `zSig' is 1719 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 1720 | The handling of underflow and overflow follows the IEC/IEEE Standard for 1721 | Binary Floating-Point Arithmetic. 1722 *----------------------------------------------------------------------------*/ 1723 1724 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 1725 float_status *status) 1726 { 1727 int8_t roundingMode; 1728 flag roundNearestEven; 1729 int roundIncrement, roundBits; 1730 flag isTiny; 1731 1732 roundingMode = status->float_rounding_mode; 1733 roundNearestEven = ( roundingMode == float_round_nearest_even ); 1734 switch (roundingMode) { 1735 case float_round_nearest_even: 1736 case float_round_ties_away: 1737 roundIncrement = 0x200; 1738 break; 1739 case float_round_to_zero: 1740 roundIncrement = 0; 1741 break; 1742 case float_round_up: 1743 roundIncrement = zSign ? 0 : 0x3ff; 1744 break; 1745 case float_round_down: 1746 roundIncrement = zSign ? 0x3ff : 0; 1747 break; 1748 case float_round_to_odd: 1749 roundIncrement = (zSig & 0x400) ? 
0 : 0x3ff; 1750 break; 1751 default: 1752 abort(); 1753 } 1754 roundBits = zSig & 0x3FF; 1755 if ( 0x7FD <= (uint16_t) zExp ) { 1756 if ( ( 0x7FD < zExp ) 1757 || ( ( zExp == 0x7FD ) 1758 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 1759 ) { 1760 bool overflow_to_inf = roundingMode != float_round_to_odd && 1761 roundIncrement != 0; 1762 float_raise(float_flag_overflow | float_flag_inexact, status); 1763 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 1764 } 1765 if ( zExp < 0 ) { 1766 if (status->flush_to_zero) { 1767 float_raise(float_flag_output_denormal, status); 1768 return packFloat64(zSign, 0, 0); 1769 } 1770 isTiny = 1771 (status->float_detect_tininess 1772 == float_tininess_before_rounding) 1773 || ( zExp < -1 ) 1774 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 1775 shift64RightJamming( zSig, - zExp, &zSig ); 1776 zExp = 0; 1777 roundBits = zSig & 0x3FF; 1778 if (isTiny && roundBits) { 1779 float_raise(float_flag_underflow, status); 1780 } 1781 if (roundingMode == float_round_to_odd) { 1782 /* 1783 * For round-to-odd case, the roundIncrement depends on 1784 * zSig which just changed. 1785 */ 1786 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 1787 } 1788 } 1789 } 1790 if (roundBits) { 1791 status->float_exception_flags |= float_flag_inexact; 1792 } 1793 zSig = ( zSig + roundIncrement )>>10; 1794 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 1795 if ( zSig == 0 ) zExp = 0; 1796 return packFloat64( zSign, zExp, zSig ); 1797 1798 } 1799 1800 /*---------------------------------------------------------------------------- 1801 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1802 | and significand `zSig', and returns the proper double-precision floating- 1803 | point value corresponding to the abstract input. This routine is just like 1804 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 
1805 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 1806 | floating-point exponent. 1807 *----------------------------------------------------------------------------*/ 1808 1809 static float64 1810 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 1811 float_status *status) 1812 { 1813 int8_t shiftCount; 1814 1815 shiftCount = countLeadingZeros64( zSig ) - 1; 1816 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 1817 status); 1818 1819 } 1820 1821 /*---------------------------------------------------------------------------- 1822 | Returns the fraction bits of the extended double-precision floating-point 1823 | value `a'. 1824 *----------------------------------------------------------------------------*/ 1825 1826 static inline uint64_t extractFloatx80Frac( floatx80 a ) 1827 { 1828 1829 return a.low; 1830 1831 } 1832 1833 /*---------------------------------------------------------------------------- 1834 | Returns the exponent bits of the extended double-precision floating-point 1835 | value `a'. 1836 *----------------------------------------------------------------------------*/ 1837 1838 static inline int32_t extractFloatx80Exp( floatx80 a ) 1839 { 1840 1841 return a.high & 0x7FFF; 1842 1843 } 1844 1845 /*---------------------------------------------------------------------------- 1846 | Returns the sign bit of the extended double-precision floating-point value 1847 | `a'. 1848 *----------------------------------------------------------------------------*/ 1849 1850 static inline flag extractFloatx80Sign( floatx80 a ) 1851 { 1852 1853 return a.high>>15; 1854 1855 } 1856 1857 /*---------------------------------------------------------------------------- 1858 | Normalizes the subnormal extended double-precision floating-point value 1859 | represented by the denormalized significand `aSig'. 
The normalized exponent 1860 | and significand are stored at the locations pointed to by `zExpPtr' and 1861 | `zSigPtr', respectively. 1862 *----------------------------------------------------------------------------*/ 1863 1864 static void 1865 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr ) 1866 { 1867 int8_t shiftCount; 1868 1869 shiftCount = countLeadingZeros64( aSig ); 1870 *zSigPtr = aSig<<shiftCount; 1871 *zExpPtr = 1 - shiftCount; 1872 1873 } 1874 1875 /*---------------------------------------------------------------------------- 1876 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 1877 | extended double-precision floating-point value, returning the result. 1878 *----------------------------------------------------------------------------*/ 1879 1880 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig ) 1881 { 1882 floatx80 z; 1883 1884 z.low = zSig; 1885 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp; 1886 return z; 1887 1888 } 1889 1890 /*---------------------------------------------------------------------------- 1891 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1892 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 1893 | and returns the proper extended double-precision floating-point value 1894 | corresponding to the abstract input. Ordinarily, the abstract value is 1895 | rounded and packed into the extended double-precision format, with the 1896 | inexact exception raised if the abstract input cannot be represented 1897 | exactly. However, if the abstract value is too large, the overflow and 1898 | inexact exceptions are raised and an infinity or maximal finite value is 1899 | returned. 
If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal extended
| double-precision floating-point number.
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
| number of bits as single or double precision, respectively.  Otherwise, the
| result is rounded to the full precision of the extended double-precision
| format.
|     The input significand must be normalized or smaller.  If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding.  The
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|     The rounding mode is read from `status->float_rounding_mode'; a mode
| other than nearest-even, ties-away, to-zero, up or down reaches abort().
| Exception flags are accumulated in `status'.
*----------------------------------------------------------------------------*/

static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
                                     int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                                     float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    if ( roundingPrecision == 80 ) goto precision80;
    /* Reduced precision: roundMask selects the bits of zSig0 that are
     * discarded (11 bits for 64, 40 bits for 32) and roundIncrement starts
     * out as half of ( roundMask + 1 ). */
    if ( roundingPrecision == 64 ) {
        roundIncrement = LIT64( 0x0000000000000400 );
        roundMask = LIT64( 0x00000000000007FF );
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = LIT64( 0x0000008000000000 );
        roundMask = LIT64( 0x000000FFFFFFFFFF );
    }
    else {
        /* Any other precision value is treated as full 80-bit precision. */
        goto precision80;
    }
    /* On this path zSig1 only matters as a sticky bit: fold it into bit 0. */
    zSig0 |= ( zSig1 != 0 );
    /* Directed modes either never round up (increment 0) or round up on any
     * discarded bit (increment == roundMask), depending on the sign. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /* The unsigned compare is true when zExp <= 0 or zExp >= 0x7FFE, i.e.
     * whenever overflow or a subnormal result is possible. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        /* Overflow: exponent too large, or at the maximum with the rounding
         * addition carrying out of zSig0. */
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            /* With tininess detected after rounding, the value is not tiny
             * only when zExp == 0 and the rounding addition carries out of
             * zSig0. */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ( zSig0 <= zSig0 + roundIncrement );
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                status->float_exception_flags |= float_flag_inexact;
            }
            zSig0 += roundIncrement;
            /* Bit 63 became set by rounding: pack with exponent 1, not 0. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            roundIncrement = roundMask + 1;
            /* Exact tie under nearest-even: also clear the lowest kept bit. */
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig0 += roundIncrement;
    /* Carry out of zSig0: renormalize to 1.0 with the exponent bumped. */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = LIT64( 0x8000000000000000 );
    }
    roundIncrement = roundMask + 1;
    /* Exact tie under nearest-even: also clear the lowest kept bit. */
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full precision: zSig1 holds the discarded bits; its top bit is the
     * half-ulp bit and any nonzero value means the result is inexact. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
                  && increment
                )
           ) {
            /* 80-bit overflow keeps every significand bit in the maximal
             * finite value, hence an empty mask. */
            roundMask = 0;
            /* Also entered by goto from the reduced-precision path, where
             * roundMask still holds that precision's discarded-bit mask. */
 overflow:
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Modes that round toward zero for this sign return the largest
             * finite value; all others return infinity. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
        }
        if ( zExp <= 0 ) {
            /* NOTE(review): unlike the reduced-precision path above, this
             * path does not consult status->flush_to_zero -- confirm that
             * the asymmetry is intentional. */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ! increment
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                status->float_exception_flags |= float_flag_inexact;
            }
            /* The shift changed zSig1, so the rounding decision is redone. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Exact tie under nearest-even: clear bit 0. */
                zSig0 &=
                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
                /* Bit 63 became set by rounding: pack with exponent 1. */
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        status->float_exception_flags |= float_flag_inexact;
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Significand wrapped: renormalize with the exponent bumped. */
            ++zExp;
            zSig0 = LIT64( 0x8000000000000000 );
        }
        else {
            /* Exact tie under nearest-even: clear bit 0. */
            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input.  This routine is just like
| `roundAndPackFloatx80' except that the input significand does not have to be
| normalized.
*----------------------------------------------------------------------------*/

static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
                                              flag zSign, int32_t zExp,
                                              uint64_t zSig0, uint64_t zSig1,
                                              float_status *status)
{
    int8_t shiftCount;

    /* An empty high word means at least 64 leading zeros: move the low word
     * up and account for the 64-bit shift in the exponent. */
    if ( zSig0 == 0 ) {
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    /* NOTE(review): if both words are zero the shift count comes from
     * countLeadingZeros64( 0 ) -- confirm callers never pass a zero
     * significand. */
    shiftCount = countLeadingZeros64( zSig0 );
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    zExp -= shiftCount;
    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
                                zSig0, zSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the least-significant 64 fraction bits of the quadruple-precision
| floating-point value `a'.
2133 *----------------------------------------------------------------------------*/ 2134 2135 static inline uint64_t extractFloat128Frac1( float128 a ) 2136 { 2137 2138 return a.low; 2139 2140 } 2141 2142 /*---------------------------------------------------------------------------- 2143 | Returns the most-significant 48 fraction bits of the quadruple-precision 2144 | floating-point value `a'. 2145 *----------------------------------------------------------------------------*/ 2146 2147 static inline uint64_t extractFloat128Frac0( float128 a ) 2148 { 2149 2150 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 2151 2152 } 2153 2154 /*---------------------------------------------------------------------------- 2155 | Returns the exponent bits of the quadruple-precision floating-point value 2156 | `a'. 2157 *----------------------------------------------------------------------------*/ 2158 2159 static inline int32_t extractFloat128Exp( float128 a ) 2160 { 2161 2162 return ( a.high>>48 ) & 0x7FFF; 2163 2164 } 2165 2166 /*---------------------------------------------------------------------------- 2167 | Returns the sign bit of the quadruple-precision floating-point value `a'. 2168 *----------------------------------------------------------------------------*/ 2169 2170 static inline flag extractFloat128Sign( float128 a ) 2171 { 2172 2173 return a.high>>63; 2174 2175 } 2176 2177 /*---------------------------------------------------------------------------- 2178 | Normalizes the subnormal quadruple-precision floating-point value 2179 | represented by the denormalized significand formed by the concatenation of 2180 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 2181 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 2182 | significand are stored at the location pointed to by `zSig0Ptr', and the 2183 | least significant 64 bits of the normalized significand are stored at the 2184 | location pointed to by `zSig1Ptr'. 
2185 *----------------------------------------------------------------------------*/ 2186 2187 static void 2188 normalizeFloat128Subnormal( 2189 uint64_t aSig0, 2190 uint64_t aSig1, 2191 int32_t *zExpPtr, 2192 uint64_t *zSig0Ptr, 2193 uint64_t *zSig1Ptr 2194 ) 2195 { 2196 int8_t shiftCount; 2197 2198 if ( aSig0 == 0 ) { 2199 shiftCount = countLeadingZeros64( aSig1 ) - 15; 2200 if ( shiftCount < 0 ) { 2201 *zSig0Ptr = aSig1>>( - shiftCount ); 2202 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 2203 } 2204 else { 2205 *zSig0Ptr = aSig1<<shiftCount; 2206 *zSig1Ptr = 0; 2207 } 2208 *zExpPtr = - shiftCount - 63; 2209 } 2210 else { 2211 shiftCount = countLeadingZeros64( aSig0 ) - 15; 2212 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 2213 *zExpPtr = 1 - shiftCount; 2214 } 2215 2216 } 2217 2218 /*---------------------------------------------------------------------------- 2219 | Packs the sign `zSign', the exponent `zExp', and the significand formed 2220 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 2221 | floating-point value, returning the result. After being shifted into the 2222 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 2223 | added together to form the most significant 32 bits of the result. This 2224 | means that any integer portion of `zSig0' will be added into the exponent. 2225 | Since a properly normalized significand will have an integer portion equal 2226 | to 1, the `zExp' input should be 1 less than the desired result exponent 2227 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 2228 | significand. 
2229 *----------------------------------------------------------------------------*/ 2230 2231 static inline float128 2232 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 2233 { 2234 float128 z; 2235 2236 z.low = zSig1; 2237 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 2238 return z; 2239 2240 } 2241 2242 /*---------------------------------------------------------------------------- 2243 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2244 | and extended significand formed by the concatenation of `zSig0', `zSig1', 2245 | and `zSig2', and returns the proper quadruple-precision floating-point value 2246 | corresponding to the abstract input. Ordinarily, the abstract value is 2247 | simply rounded and packed into the quadruple-precision format, with the 2248 | inexact exception raised if the abstract input cannot be represented 2249 | exactly. However, if the abstract value is too large, the overflow and 2250 | inexact exceptions are raised and an infinity or maximal finite value is 2251 | returned. If the abstract value is too small, the input value is rounded to 2252 | a subnormal number, and the underflow and inexact exceptions are raised if 2253 | the abstract input cannot be represented exactly as a subnormal quadruple- 2254 | precision floating-point number. 2255 | The input significand must be normalized or smaller. If the input 2256 | significand is not normalized, `zExp' must be 0; in that case, the result 2257 | returned is a subnormal number, and it must not require rounding. In the 2258 | usual case that the input significand is normalized, `zExp' must be 1 less 2259 | than the ``true'' floating-point exponent. The handling of underflow and 2260 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* zSig2 holds the discarded bits: its top bit is the half-ulp bit and
     * any nonzero value means the result is inexact. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        /* Increment only when inexact and the kept LSB is currently even. */
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    /* The unsigned compare is true when zExp < 0 or zExp >= 0x7FFD, i.e.
     * whenever overflow or a subnormal result is possible. */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        /* Overflow: exponent too large, or at the maximum with an all-ones
         * significand that is about to be incremented. */
        if (    ( 0x7FFD < zExp )
             || (    ( zExp == 0x7FFD )
                  && eq128(
                         LIT64( 0x0001FFFFFFFFFFFF ),
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Modes that round toward zero for this sign, and round-to-odd,
             * return the largest finite value; all others return infinity. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        LIT64( 0x0000FFFFFFFFFFFF ),
                        LIT64( 0xFFFFFFFFFFFFFFFF )
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            /* With tininess detected after rounding, the value is not tiny
             * only when zExp == -1 and incrementing an all-ones significand
             * would carry it up to the normal range. */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < -1 )
                || ! increment
                || lt128(
                       zSig0,
                       zSig1,
                       LIT64( 0x0001FFFFFFFFFFFF ),
                       LIT64( 0xFFFFFFFFFFFFFFFF )
                   );
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* The shift changed zSig1/zSig2, so the rounding decision is
             * redone on the denormalized significand. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        status->float_exception_flags |= float_flag_inexact;
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* Exact tie (zSig2 is only its top bit) under nearest-even: clear
         * bit 0 so the result is even. */
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
    }
    else {
        /* A zero significand is packed with a zero exponent field. */
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand formed by the concatenation of `zSig0' and `zSig1', and
| returns the proper quadruple-precision floating-point value corresponding
| to the abstract input.  This routine is just like `roundAndPackFloat128'
| except that the input significand has fewer bits and does not have to be
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
| point exponent.
2386 *----------------------------------------------------------------------------*/ 2387 2388 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 2389 uint64_t zSig0, uint64_t zSig1, 2390 float_status *status) 2391 { 2392 int8_t shiftCount; 2393 uint64_t zSig2; 2394 2395 if ( zSig0 == 0 ) { 2396 zSig0 = zSig1; 2397 zSig1 = 0; 2398 zExp -= 64; 2399 } 2400 shiftCount = countLeadingZeros64( zSig0 ) - 15; 2401 if ( 0 <= shiftCount ) { 2402 zSig2 = 0; 2403 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 2404 } 2405 else { 2406 shift128ExtraRightJamming( 2407 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 2408 } 2409 zExp -= shiftCount; 2410 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 2411 2412 } 2413 2414 /*---------------------------------------------------------------------------- 2415 | Returns the result of converting the 32-bit two's complement integer `a' 2416 | to the single-precision floating-point format. The conversion is performed 2417 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2418 *----------------------------------------------------------------------------*/ 2419 2420 float32 int32_to_float32(int32_t a, float_status *status) 2421 { 2422 flag zSign; 2423 2424 if ( a == 0 ) return float32_zero; 2425 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 2426 zSign = ( a < 0 ); 2427 return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status); 2428 } 2429 2430 /*---------------------------------------------------------------------------- 2431 | Returns the result of converting the 32-bit two's complement integer `a' 2432 | to the double-precision floating-point format. The conversion is performed 2433 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2434 *----------------------------------------------------------------------------*/ 2435 2436 float64 int32_to_float64(int32_t a, float_status *status) 2437 { 2438 flag zSign; 2439 uint32_t absA; 2440 int8_t shiftCount; 2441 uint64_t zSig; 2442 2443 if ( a == 0 ) return float64_zero; 2444 zSign = ( a < 0 ); 2445 absA = zSign ? - a : a; 2446 shiftCount = countLeadingZeros32( absA ) + 21; 2447 zSig = absA; 2448 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 2449 2450 } 2451 2452 /*---------------------------------------------------------------------------- 2453 | Returns the result of converting the 32-bit two's complement integer `a' 2454 | to the extended double-precision floating-point format. The conversion 2455 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 2456 | Arithmetic. 2457 *----------------------------------------------------------------------------*/ 2458 2459 floatx80 int32_to_floatx80(int32_t a, float_status *status) 2460 { 2461 flag zSign; 2462 uint32_t absA; 2463 int8_t shiftCount; 2464 uint64_t zSig; 2465 2466 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 2467 zSign = ( a < 0 ); 2468 absA = zSign ? - a : a; 2469 shiftCount = countLeadingZeros32( absA ) + 32; 2470 zSig = absA; 2471 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 2472 2473 } 2474 2475 /*---------------------------------------------------------------------------- 2476 | Returns the result of converting the 32-bit two's complement integer `a' to 2477 | the quadruple-precision floating-point format. The conversion is performed 2478 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2479 *----------------------------------------------------------------------------*/ 2480 2481 float128 int32_to_float128(int32_t a, float_status *status) 2482 { 2483 flag zSign; 2484 uint32_t absA; 2485 int8_t shiftCount; 2486 uint64_t zSig0; 2487 2488 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 2489 zSign = ( a < 0 ); 2490 absA = zSign ? - a : a; 2491 shiftCount = countLeadingZeros32( absA ) + 17; 2492 zSig0 = absA; 2493 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 2494 2495 } 2496 2497 /*---------------------------------------------------------------------------- 2498 | Returns the result of converting the 64-bit two's complement integer `a' 2499 | to the single-precision floating-point format. The conversion is performed 2500 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2501 *----------------------------------------------------------------------------*/ 2502 2503 float32 int64_to_float32(int64_t a, float_status *status) 2504 { 2505 flag zSign; 2506 uint64_t absA; 2507 int8_t shiftCount; 2508 2509 if ( a == 0 ) return float32_zero; 2510 zSign = ( a < 0 ); 2511 absA = zSign ? - a : a; 2512 shiftCount = countLeadingZeros64( absA ) - 40; 2513 if ( 0 <= shiftCount ) { 2514 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 2515 } 2516 else { 2517 shiftCount += 7; 2518 if ( shiftCount < 0 ) { 2519 shift64RightJamming( absA, - shiftCount, &absA ); 2520 } 2521 else { 2522 absA <<= shiftCount; 2523 } 2524 return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status); 2525 } 2526 2527 } 2528 2529 /*---------------------------------------------------------------------------- 2530 | Returns the result of converting the 64-bit two's complement integer `a' 2531 | to the double-precision floating-point format. The conversion is performed 2532 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2533 *----------------------------------------------------------------------------*/ 2534 2535 float64 int64_to_float64(int64_t a, float_status *status) 2536 { 2537 flag zSign; 2538 2539 if ( a == 0 ) return float64_zero; 2540 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) { 2541 return packFloat64( 1, 0x43E, 0 ); 2542 } 2543 zSign = ( a < 0 ); 2544 return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status); 2545 } 2546 2547 /*---------------------------------------------------------------------------- 2548 | Returns the result of converting the 64-bit two's complement integer `a' 2549 | to the extended double-precision floating-point format. The conversion 2550 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 2551 | Arithmetic. 2552 *----------------------------------------------------------------------------*/ 2553 2554 floatx80 int64_to_floatx80(int64_t a, float_status *status) 2555 { 2556 flag zSign; 2557 uint64_t absA; 2558 int8_t shiftCount; 2559 2560 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 2561 zSign = ( a < 0 ); 2562 absA = zSign ? - a : a; 2563 shiftCount = countLeadingZeros64( absA ); 2564 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 2565 2566 } 2567 2568 /*---------------------------------------------------------------------------- 2569 | Returns the result of converting the 64-bit two's complement integer `a' to 2570 | the quadruple-precision floating-point format. The conversion is performed 2571 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2572 *----------------------------------------------------------------------------*/ 2573 2574 float128 int64_to_float128(int64_t a, float_status *status) 2575 { 2576 flag zSign; 2577 uint64_t absA; 2578 int8_t shiftCount; 2579 int32_t zExp; 2580 uint64_t zSig0, zSig1; 2581 2582 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 2583 zSign = ( a < 0 ); 2584 absA = zSign ? 
- a : a; 2585 shiftCount = countLeadingZeros64( absA ) + 49; 2586 zExp = 0x406E - shiftCount; 2587 if ( 64 <= shiftCount ) { 2588 zSig1 = 0; 2589 zSig0 = absA; 2590 shiftCount -= 64; 2591 } 2592 else { 2593 zSig1 = absA; 2594 zSig0 = 0; 2595 } 2596 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 2597 return packFloat128( zSign, zExp, zSig0, zSig1 ); 2598 2599 } 2600 2601 /*---------------------------------------------------------------------------- 2602 | Returns the result of converting the 64-bit unsigned integer `a' 2603 | to the single-precision floating-point format. The conversion is performed 2604 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2605 *----------------------------------------------------------------------------*/ 2606 2607 float32 uint64_to_float32(uint64_t a, float_status *status) 2608 { 2609 int shiftcount; 2610 2611 if (a == 0) { 2612 return float32_zero; 2613 } 2614 2615 /* Determine (left) shift needed to put first set bit into bit posn 23 2616 * (since packFloat32() expects the binary point between bits 23 and 22); 2617 * this is the fast case for smallish numbers. 2618 */ 2619 shiftcount = countLeadingZeros64(a) - 40; 2620 if (shiftcount >= 0) { 2621 return packFloat32(0, 0x95 - shiftcount, a << shiftcount); 2622 } 2623 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32() 2624 * expects the binary point between bits 30 and 29, hence the + 7. 2625 */ 2626 shiftcount += 7; 2627 if (shiftcount < 0) { 2628 shift64RightJamming(a, -shiftcount, &a); 2629 } else { 2630 a <<= shiftcount; 2631 } 2632 2633 return roundAndPackFloat32(0, 0x9c - shiftcount, a, status); 2634 } 2635 2636 /*---------------------------------------------------------------------------- 2637 | Returns the result of converting the 64-bit unsigned integer `a' 2638 | to the double-precision floating-point format. The conversion is performed 2639 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2640 *----------------------------------------------------------------------------*/ 2641 2642 float64 uint64_to_float64(uint64_t a, float_status *status) 2643 { 2644 int exp = 0x43C; 2645 int shiftcount; 2646 2647 if (a == 0) { 2648 return float64_zero; 2649 } 2650 2651 shiftcount = countLeadingZeros64(a) - 1; 2652 if (shiftcount < 0) { 2653 shift64RightJamming(a, -shiftcount, &a); 2654 } else { 2655 a <<= shiftcount; 2656 } 2657 return roundAndPackFloat64(0, exp - shiftcount, a, status); 2658 } 2659 2660 /*---------------------------------------------------------------------------- 2661 | Returns the result of converting the 64-bit unsigned integer `a' 2662 | to the quadruple-precision floating-point format. The conversion is performed 2663 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2664 *----------------------------------------------------------------------------*/ 2665 2666 float128 uint64_to_float128(uint64_t a, float_status *status) 2667 { 2668 if (a == 0) { 2669 return float128_zero; 2670 } 2671 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status); 2672 } 2673 2674 /*---------------------------------------------------------------------------- 2675 | Returns the result of converting the single-precision floating-point value 2676 | `a' to the 32-bit two's complement integer format. The conversion is 2677 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2678 | Arithmetic---which means in particular that the conversion is rounded 2679 | according to the current rounding mode. If `a' is a NaN, the largest 2680 | positive integer is returned. Otherwise, if the conversion overflows, the 2681 | largest integer with the same sign as `a' is returned. 
2682 *----------------------------------------------------------------------------*/ 2683 2684 int32_t float32_to_int32(float32 a, float_status *status) 2685 { 2686 flag aSign; 2687 int aExp; 2688 int shiftCount; 2689 uint32_t aSig; 2690 uint64_t aSig64; 2691 2692 a = float32_squash_input_denormal(a, status); 2693 aSig = extractFloat32Frac( a ); 2694 aExp = extractFloat32Exp( a ); 2695 aSign = extractFloat32Sign( a ); 2696 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 2697 if ( aExp ) aSig |= 0x00800000; 2698 shiftCount = 0xAF - aExp; 2699 aSig64 = aSig; 2700 aSig64 <<= 32; 2701 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 2702 return roundAndPackInt32(aSign, aSig64, status); 2703 2704 } 2705 2706 /*---------------------------------------------------------------------------- 2707 | Returns the result of converting the single-precision floating-point value 2708 | `a' to the 32-bit two's complement integer format. The conversion is 2709 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2710 | Arithmetic, except that the conversion is always rounded toward zero. 2711 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2712 | the conversion overflows, the largest integer with the same sign as `a' is 2713 | returned. 2714 *----------------------------------------------------------------------------*/ 2715 2716 int32_t float32_to_int32_round_to_zero(float32 a, float_status *status) 2717 { 2718 flag aSign; 2719 int aExp; 2720 int shiftCount; 2721 uint32_t aSig; 2722 int32_t z; 2723 a = float32_squash_input_denormal(a, status); 2724 2725 aSig = extractFloat32Frac( a ); 2726 aExp = extractFloat32Exp( a ); 2727 aSign = extractFloat32Sign( a ); 2728 shiftCount = aExp - 0x9E; 2729 if ( 0 <= shiftCount ) { 2730 if ( float32_val(a) != 0xCF000000 ) { 2731 float_raise(float_flag_invalid, status); 2732 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 2733 } 2734 return (int32_t) 0x80000000; 2735 } 2736 else if ( aExp <= 0x7E ) { 2737 if (aExp | aSig) { 2738 status->float_exception_flags |= float_flag_inexact; 2739 } 2740 return 0; 2741 } 2742 aSig = ( aSig | 0x00800000 )<<8; 2743 z = aSig>>( - shiftCount ); 2744 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) { 2745 status->float_exception_flags |= float_flag_inexact; 2746 } 2747 if ( aSign ) z = - z; 2748 return z; 2749 2750 } 2751 2752 /*---------------------------------------------------------------------------- 2753 | Returns the result of converting the single-precision floating-point value 2754 | `a' to the 16-bit two's complement integer format. The conversion is 2755 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2756 | Arithmetic, except that the conversion is always rounded toward zero. 2757 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2758 | the conversion overflows, the largest integer with the same sign as `a' is 2759 | returned. 2760 *----------------------------------------------------------------------------*/ 2761 2762 int16_t float32_to_int16_round_to_zero(float32 a, float_status *status) 2763 { 2764 flag aSign; 2765 int aExp; 2766 int shiftCount; 2767 uint32_t aSig; 2768 int32_t z; 2769 2770 aSig = extractFloat32Frac( a ); 2771 aExp = extractFloat32Exp( a ); 2772 aSign = extractFloat32Sign( a ); 2773 shiftCount = aExp - 0x8E; 2774 if ( 0 <= shiftCount ) { 2775 if ( float32_val(a) != 0xC7000000 ) { 2776 float_raise(float_flag_invalid, status); 2777 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 2778 return 0x7FFF; 2779 } 2780 } 2781 return (int32_t) 0xffff8000; 2782 } 2783 else if ( aExp <= 0x7E ) { 2784 if ( aExp | aSig ) { 2785 status->float_exception_flags |= float_flag_inexact; 2786 } 2787 return 0; 2788 } 2789 shiftCount -= 0x10; 2790 aSig = ( aSig | 0x00800000 )<<8; 2791 z = aSig>>( - shiftCount ); 2792 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) { 2793 status->float_exception_flags |= float_flag_inexact; 2794 } 2795 if ( aSign ) { 2796 z = - z; 2797 } 2798 return z; 2799 2800 } 2801 2802 /*---------------------------------------------------------------------------- 2803 | Returns the result of converting the single-precision floating-point value 2804 | `a' to the 64-bit two's complement integer format. The conversion is 2805 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2806 | Arithmetic---which means in particular that the conversion is rounded 2807 | according to the current rounding mode. If `a' is a NaN, the largest 2808 | positive integer is returned. Otherwise, if the conversion overflows, the 2809 | largest integer with the same sign as `a' is returned. 2810 *----------------------------------------------------------------------------*/ 2811 2812 int64_t float32_to_int64(float32 a, float_status *status) 2813 { 2814 flag aSign; 2815 int aExp; 2816 int shiftCount; 2817 uint32_t aSig; 2818 uint64_t aSig64, aSigExtra; 2819 a = float32_squash_input_denormal(a, status); 2820 2821 aSig = extractFloat32Frac( a ); 2822 aExp = extractFloat32Exp( a ); 2823 aSign = extractFloat32Sign( a ); 2824 shiftCount = 0xBE - aExp; 2825 if ( shiftCount < 0 ) { 2826 float_raise(float_flag_invalid, status); 2827 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 2828 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2829 } 2830 return (int64_t) LIT64( 0x8000000000000000 ); 2831 } 2832 if ( aExp ) aSig |= 0x00800000; 2833 aSig64 = aSig; 2834 aSig64 <<= 40; 2835 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 2836 return roundAndPackInt64(aSign, aSig64, aSigExtra, status); 2837 2838 } 2839 2840 /*---------------------------------------------------------------------------- 2841 | Returns the result of converting the single-precision floating-point value 2842 | `a' to the 64-bit unsigned integer format. The conversion is 2843 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2844 | Arithmetic---which means in particular that the conversion is rounded 2845 | according to the current rounding mode. If `a' is a NaN, the largest 2846 | unsigned integer is returned. Otherwise, if the conversion overflows, the 2847 | largest unsigned integer is returned. If the 'a' is negative, the result 2848 | is rounded and zero is returned; values that do not round to zero will 2849 | raise the inexact exception flag. 
2850 *----------------------------------------------------------------------------*/ 2851 2852 uint64_t float32_to_uint64(float32 a, float_status *status) 2853 { 2854 flag aSign; 2855 int aExp; 2856 int shiftCount; 2857 uint32_t aSig; 2858 uint64_t aSig64, aSigExtra; 2859 a = float32_squash_input_denormal(a, status); 2860 2861 aSig = extractFloat32Frac(a); 2862 aExp = extractFloat32Exp(a); 2863 aSign = extractFloat32Sign(a); 2864 if ((aSign) && (aExp > 126)) { 2865 float_raise(float_flag_invalid, status); 2866 if (float32_is_any_nan(a)) { 2867 return LIT64(0xFFFFFFFFFFFFFFFF); 2868 } else { 2869 return 0; 2870 } 2871 } 2872 shiftCount = 0xBE - aExp; 2873 if (aExp) { 2874 aSig |= 0x00800000; 2875 } 2876 if (shiftCount < 0) { 2877 float_raise(float_flag_invalid, status); 2878 return LIT64(0xFFFFFFFFFFFFFFFF); 2879 } 2880 2881 aSig64 = aSig; 2882 aSig64 <<= 40; 2883 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra); 2884 return roundAndPackUint64(aSign, aSig64, aSigExtra, status); 2885 } 2886 2887 /*---------------------------------------------------------------------------- 2888 | Returns the result of converting the single-precision floating-point value 2889 | `a' to the 64-bit unsigned integer format. The conversion is 2890 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2891 | Arithmetic, except that the conversion is always rounded toward zero. If 2892 | `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the 2893 | conversion overflows, the largest unsigned integer is returned. If the 2894 | 'a' is negative, the result is rounded and zero is returned; values that do 2895 | not round to zero will raise the inexact flag. 
2896 *----------------------------------------------------------------------------*/ 2897 2898 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status) 2899 { 2900 signed char current_rounding_mode = status->float_rounding_mode; 2901 set_float_rounding_mode(float_round_to_zero, status); 2902 int64_t v = float32_to_uint64(a, status); 2903 set_float_rounding_mode(current_rounding_mode, status); 2904 return v; 2905 } 2906 2907 /*---------------------------------------------------------------------------- 2908 | Returns the result of converting the single-precision floating-point value 2909 | `a' to the 64-bit two's complement integer format. The conversion is 2910 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2911 | Arithmetic, except that the conversion is always rounded toward zero. If 2912 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 2913 | conversion overflows, the largest integer with the same sign as `a' is 2914 | returned. 2915 *----------------------------------------------------------------------------*/ 2916 2917 int64_t float32_to_int64_round_to_zero(float32 a, float_status *status) 2918 { 2919 flag aSign; 2920 int aExp; 2921 int shiftCount; 2922 uint32_t aSig; 2923 uint64_t aSig64; 2924 int64_t z; 2925 a = float32_squash_input_denormal(a, status); 2926 2927 aSig = extractFloat32Frac( a ); 2928 aExp = extractFloat32Exp( a ); 2929 aSign = extractFloat32Sign( a ); 2930 shiftCount = aExp - 0xBE; 2931 if ( 0 <= shiftCount ) { 2932 if ( float32_val(a) != 0xDF000000 ) { 2933 float_raise(float_flag_invalid, status); 2934 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 2935 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2936 } 2937 } 2938 return (int64_t) LIT64( 0x8000000000000000 ); 2939 } 2940 else if ( aExp <= 0x7E ) { 2941 if (aExp | aSig) { 2942 status->float_exception_flags |= float_flag_inexact; 2943 } 2944 return 0; 2945 } 2946 aSig64 = aSig | 0x00800000; 2947 aSig64 <<= 40; 2948 z = aSig64>>( - shiftCount ); 2949 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) { 2950 status->float_exception_flags |= float_flag_inexact; 2951 } 2952 if ( aSign ) z = - z; 2953 return z; 2954 2955 } 2956 2957 /*---------------------------------------------------------------------------- 2958 | Returns the result of converting the single-precision floating-point value 2959 | `a' to the double-precision floating-point format. The conversion is 2960 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2961 | Arithmetic. 2962 *----------------------------------------------------------------------------*/ 2963 2964 float64 float32_to_float64(float32 a, float_status *status) 2965 { 2966 flag aSign; 2967 int aExp; 2968 uint32_t aSig; 2969 a = float32_squash_input_denormal(a, status); 2970 2971 aSig = extractFloat32Frac( a ); 2972 aExp = extractFloat32Exp( a ); 2973 aSign = extractFloat32Sign( a ); 2974 if ( aExp == 0xFF ) { 2975 if (aSig) { 2976 return commonNaNToFloat64(float32ToCommonNaN(a, status), status); 2977 } 2978 return packFloat64( aSign, 0x7FF, 0 ); 2979 } 2980 if ( aExp == 0 ) { 2981 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 2982 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2983 --aExp; 2984 } 2985 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 ); 2986 2987 } 2988 2989 /*---------------------------------------------------------------------------- 2990 | Returns the result of converting the single-precision floating-point value 2991 | `a' to the extended double-precision floating-point format. 
The conversion 2992 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 2993 | Arithmetic. 2994 *----------------------------------------------------------------------------*/ 2995 2996 floatx80 float32_to_floatx80(float32 a, float_status *status) 2997 { 2998 flag aSign; 2999 int aExp; 3000 uint32_t aSig; 3001 3002 a = float32_squash_input_denormal(a, status); 3003 aSig = extractFloat32Frac( a ); 3004 aExp = extractFloat32Exp( a ); 3005 aSign = extractFloat32Sign( a ); 3006 if ( aExp == 0xFF ) { 3007 if (aSig) { 3008 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 3009 } 3010 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3011 } 3012 if ( aExp == 0 ) { 3013 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 3014 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3015 } 3016 aSig |= 0x00800000; 3017 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 3018 3019 } 3020 3021 /*---------------------------------------------------------------------------- 3022 | Returns the result of converting the single-precision floating-point value 3023 | `a' to the double-precision floating-point format. The conversion is 3024 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3025 | Arithmetic. 
3026 *----------------------------------------------------------------------------*/ 3027 3028 float128 float32_to_float128(float32 a, float_status *status) 3029 { 3030 flag aSign; 3031 int aExp; 3032 uint32_t aSig; 3033 3034 a = float32_squash_input_denormal(a, status); 3035 aSig = extractFloat32Frac( a ); 3036 aExp = extractFloat32Exp( a ); 3037 aSign = extractFloat32Sign( a ); 3038 if ( aExp == 0xFF ) { 3039 if (aSig) { 3040 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 3041 } 3042 return packFloat128( aSign, 0x7FFF, 0, 0 ); 3043 } 3044 if ( aExp == 0 ) { 3045 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 3046 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3047 --aExp; 3048 } 3049 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 3050 3051 } 3052 3053 /*---------------------------------------------------------------------------- 3054 | Returns the remainder of the single-precision floating-point value `a' 3055 | with respect to the corresponding value `b'. The operation is performed 3056 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3057 *----------------------------------------------------------------------------*/ 3058 3059 float32 float32_rem(float32 a, float32 b, float_status *status) 3060 { 3061 flag aSign, zSign; 3062 int aExp, bExp, expDiff; 3063 uint32_t aSig, bSig; 3064 uint32_t q; 3065 uint64_t aSig64, bSig64, q64; 3066 uint32_t alternateASig; 3067 int32_t sigMean; 3068 a = float32_squash_input_denormal(a, status); 3069 b = float32_squash_input_denormal(b, status); 3070 3071 aSig = extractFloat32Frac( a ); 3072 aExp = extractFloat32Exp( a ); 3073 aSign = extractFloat32Sign( a ); 3074 bSig = extractFloat32Frac( b ); 3075 bExp = extractFloat32Exp( b ); 3076 if ( aExp == 0xFF ) { 3077 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 3078 return propagateFloat32NaN(a, b, status); 3079 } 3080 float_raise(float_flag_invalid, status); 3081 return float32_default_nan(status); 3082 } 3083 if ( bExp == 0xFF ) { 3084 if (bSig) { 3085 return propagateFloat32NaN(a, b, status); 3086 } 3087 return a; 3088 } 3089 if ( bExp == 0 ) { 3090 if ( bSig == 0 ) { 3091 float_raise(float_flag_invalid, status); 3092 return float32_default_nan(status); 3093 } 3094 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 3095 } 3096 if ( aExp == 0 ) { 3097 if ( aSig == 0 ) return a; 3098 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3099 } 3100 expDiff = aExp - bExp; 3101 aSig |= 0x00800000; 3102 bSig |= 0x00800000; 3103 if ( expDiff < 32 ) { 3104 aSig <<= 8; 3105 bSig <<= 8; 3106 if ( expDiff < 0 ) { 3107 if ( expDiff < -1 ) return a; 3108 aSig >>= 1; 3109 } 3110 q = ( bSig <= aSig ); 3111 if ( q ) aSig -= bSig; 3112 if ( 0 < expDiff ) { 3113 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 3114 q >>= 32 - expDiff; 3115 bSig >>= 2; 3116 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3117 } 3118 else { 3119 aSig >>= 2; 3120 bSig >>= 2; 3121 } 3122 } 3123 else { 3124 if ( bSig <= aSig ) aSig -= bSig; 3125 aSig64 = ( (uint64_t) aSig )<<40; 3126 bSig64 = ( (uint64_t) bSig )<<40; 3127 expDiff -= 64; 3128 while ( 0 < expDiff ) { 
3129 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 3130 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 3131 aSig64 = - ( ( bSig * q64 )<<38 ); 3132 expDiff -= 62; 3133 } 3134 expDiff += 64; 3135 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 3136 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 3137 q = q64>>( 64 - expDiff ); 3138 bSig <<= 6; 3139 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 3140 } 3141 do { 3142 alternateASig = aSig; 3143 ++q; 3144 aSig -= bSig; 3145 } while ( 0 <= (int32_t) aSig ); 3146 sigMean = aSig + alternateASig; 3147 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3148 aSig = alternateASig; 3149 } 3150 zSign = ( (int32_t) aSig < 0 ); 3151 if ( zSign ) aSig = - aSig; 3152 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 3153 } 3154 3155 3156 /*---------------------------------------------------------------------------- 3157 | Returns the square root of the single-precision floating-point value `a'. 3158 | The operation is performed according to the IEC/IEEE Standard for Binary 3159 | Floating-Point Arithmetic. 3160 *----------------------------------------------------------------------------*/ 3161 3162 float32 float32_sqrt(float32 a, float_status *status) 3163 { 3164 flag aSign; 3165 int aExp, zExp; 3166 uint32_t aSig, zSig; 3167 uint64_t rem, term; 3168 a = float32_squash_input_denormal(a, status); 3169 3170 aSig = extractFloat32Frac( a ); 3171 aExp = extractFloat32Exp( a ); 3172 aSign = extractFloat32Sign( a ); 3173 if ( aExp == 0xFF ) { 3174 if (aSig) { 3175 return propagateFloat32NaN(a, float32_zero, status); 3176 } 3177 if ( ! 
aSign ) return a; 3178 float_raise(float_flag_invalid, status); 3179 return float32_default_nan(status); 3180 } 3181 if ( aSign ) { 3182 if ( ( aExp | aSig ) == 0 ) return a; 3183 float_raise(float_flag_invalid, status); 3184 return float32_default_nan(status); 3185 } 3186 if ( aExp == 0 ) { 3187 if ( aSig == 0 ) return float32_zero; 3188 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3189 } 3190 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 3191 aSig = ( aSig | 0x00800000 )<<8; 3192 zSig = estimateSqrt32( aExp, aSig ) + 2; 3193 if ( ( zSig & 0x7F ) <= 5 ) { 3194 if ( zSig < 2 ) { 3195 zSig = 0x7FFFFFFF; 3196 goto roundAndPack; 3197 } 3198 aSig >>= aExp & 1; 3199 term = ( (uint64_t) zSig ) * zSig; 3200 rem = ( ( (uint64_t) aSig )<<32 ) - term; 3201 while ( (int64_t) rem < 0 ) { 3202 --zSig; 3203 rem += ( ( (uint64_t) zSig )<<1 ) | 1; 3204 } 3205 zSig |= ( rem != 0 ); 3206 } 3207 shift32RightJamming( zSig, 1, &zSig ); 3208 roundAndPack: 3209 return roundAndPackFloat32(0, zExp, zSig, status); 3210 3211 } 3212 3213 /*---------------------------------------------------------------------------- 3214 | Returns the binary exponential of the single-precision floating-point value 3215 | `a'. The operation is performed according to the IEC/IEEE Standard for 3216 | Binary Floating-Point Arithmetic. 3217 | 3218 | Uses the following identities: 3219 | 3220 | 1. ------------------------------------------------------------------------- 3221 | x x*ln(2) 3222 | 2 = e 3223 | 3224 | 2. ------------------------------------------------------------------------- 3225 | 2 3 4 5 n 3226 | x x x x x x x 3227 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 3228 | 1! 2! 3! 4! 5! n! 
3229 *----------------------------------------------------------------------------*/ 3230 3231 static const float64 float32_exp2_coefficients[15] = 3232 { 3233 const_float64( 0x3ff0000000000000ll ), /* 1 */ 3234 const_float64( 0x3fe0000000000000ll ), /* 2 */ 3235 const_float64( 0x3fc5555555555555ll ), /* 3 */ 3236 const_float64( 0x3fa5555555555555ll ), /* 4 */ 3237 const_float64( 0x3f81111111111111ll ), /* 5 */ 3238 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 3239 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 3240 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 3241 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 3242 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 3243 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 3244 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 3245 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 3246 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 3247 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 3248 }; 3249 3250 float32 float32_exp2(float32 a, float_status *status) 3251 { 3252 flag aSign; 3253 int aExp; 3254 uint32_t aSig; 3255 float64 r, x, xn; 3256 int i; 3257 a = float32_squash_input_denormal(a, status); 3258 3259 aSig = extractFloat32Frac( a ); 3260 aExp = extractFloat32Exp( a ); 3261 aSign = extractFloat32Sign( a ); 3262 3263 if ( aExp == 0xFF) { 3264 if (aSig) { 3265 return propagateFloat32NaN(a, float32_zero, status); 3266 } 3267 return (aSign) ? 
float32_zero : a; 3268 } 3269 if (aExp == 0) { 3270 if (aSig == 0) return float32_one; 3271 } 3272 3273 float_raise(float_flag_inexact, status); 3274 3275 /* ******************************* */ 3276 /* using float64 for approximation */ 3277 /* ******************************* */ 3278 x = float32_to_float64(a, status); 3279 x = float64_mul(x, float64_ln2, status); 3280 3281 xn = x; 3282 r = float64_one; 3283 for (i = 0 ; i < 15 ; i++) { 3284 float64 f; 3285 3286 f = float64_mul(xn, float32_exp2_coefficients[i], status); 3287 r = float64_add(r, f, status); 3288 3289 xn = float64_mul(xn, x, status); 3290 } 3291 3292 return float64_to_float32(r, status); 3293 } 3294 3295 /*---------------------------------------------------------------------------- 3296 | Returns the binary log of the single-precision floating-point value `a'. 3297 | The operation is performed according to the IEC/IEEE Standard for Binary 3298 | Floating-Point Arithmetic. 3299 *----------------------------------------------------------------------------*/ 3300 float32 float32_log2(float32 a, float_status *status) 3301 { 3302 flag aSign, zSign; 3303 int aExp; 3304 uint32_t aSig, zSig, i; 3305 3306 a = float32_squash_input_denormal(a, status); 3307 aSig = extractFloat32Frac( a ); 3308 aExp = extractFloat32Exp( a ); 3309 aSign = extractFloat32Sign( a ); 3310 3311 if ( aExp == 0 ) { 3312 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 3313 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3314 } 3315 if ( aSign ) { 3316 float_raise(float_flag_invalid, status); 3317 return float32_default_nan(status); 3318 } 3319 if ( aExp == 0xFF ) { 3320 if (aSig) { 3321 return propagateFloat32NaN(a, float32_zero, status); 3322 } 3323 return a; 3324 } 3325 3326 aExp -= 0x7F; 3327 aSig |= 0x00800000; 3328 zSign = aExp < 0; 3329 zSig = aExp << 23; 3330 3331 for (i = 1 << 22; i > 0; i >>= 1) { 3332 aSig = ( (uint64_t)aSig * aSig ) >> 23; 3333 if ( aSig & 0x01000000 ) { 3334 aSig >>= 1; 3335 zSig |= i; 3336 } 3337 } 3338 
3339 if ( zSign ) 3340 zSig = -zSig; 3341 3342 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 3343 } 3344 3345 /*---------------------------------------------------------------------------- 3346 | Returns 1 if the single-precision floating-point value `a' is equal to 3347 | the corresponding value `b', and 0 otherwise. The invalid exception is 3348 | raised if either operand is a NaN. Otherwise, the comparison is performed 3349 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3350 *----------------------------------------------------------------------------*/ 3351 3352 int float32_eq(float32 a, float32 b, float_status *status) 3353 { 3354 uint32_t av, bv; 3355 a = float32_squash_input_denormal(a, status); 3356 b = float32_squash_input_denormal(b, status); 3357 3358 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3359 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3360 ) { 3361 float_raise(float_flag_invalid, status); 3362 return 0; 3363 } 3364 av = float32_val(a); 3365 bv = float32_val(b); 3366 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3367 } 3368 3369 /*---------------------------------------------------------------------------- 3370 | Returns 1 if the single-precision floating-point value `a' is less than 3371 | or equal to the corresponding value `b', and 0 otherwise. The invalid 3372 | exception is raised if either operand is a NaN. The comparison is performed 3373 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3374 *----------------------------------------------------------------------------*/ 3375 3376 int float32_le(float32 a, float32 b, float_status *status) 3377 { 3378 flag aSign, bSign; 3379 uint32_t av, bv; 3380 a = float32_squash_input_denormal(a, status); 3381 b = float32_squash_input_denormal(b, status); 3382 3383 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3384 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3385 ) { 3386 float_raise(float_flag_invalid, status); 3387 return 0; 3388 } 3389 aSign = extractFloat32Sign( a ); 3390 bSign = extractFloat32Sign( b ); 3391 av = float32_val(a); 3392 bv = float32_val(b); 3393 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3394 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3395 3396 } 3397 3398 /*---------------------------------------------------------------------------- 3399 | Returns 1 if the single-precision floating-point value `a' is less than 3400 | the corresponding value `b', and 0 otherwise. The invalid exception is 3401 | raised if either operand is a NaN. The comparison is performed according 3402 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3403 *----------------------------------------------------------------------------*/ 3404 3405 int float32_lt(float32 a, float32 b, float_status *status) 3406 { 3407 flag aSign, bSign; 3408 uint32_t av, bv; 3409 a = float32_squash_input_denormal(a, status); 3410 b = float32_squash_input_denormal(b, status); 3411 3412 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3413 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3414 ) { 3415 float_raise(float_flag_invalid, status); 3416 return 0; 3417 } 3418 aSign = extractFloat32Sign( a ); 3419 bSign = extractFloat32Sign( b ); 3420 av = float32_val(a); 3421 bv = float32_val(b); 3422 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3423 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3424 3425 } 3426 3427 /*---------------------------------------------------------------------------- 3428 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3429 | be compared, and 0 otherwise. The invalid exception is raised if either 3430 | operand is a NaN. The comparison is performed according to the IEC/IEEE 3431 | Standard for Binary Floating-Point Arithmetic. 3432 *----------------------------------------------------------------------------*/ 3433 3434 int float32_unordered(float32 a, float32 b, float_status *status) 3435 { 3436 a = float32_squash_input_denormal(a, status); 3437 b = float32_squash_input_denormal(b, status); 3438 3439 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3440 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3441 ) { 3442 float_raise(float_flag_invalid, status); 3443 return 1; 3444 } 3445 return 0; 3446 } 3447 3448 /*---------------------------------------------------------------------------- 3449 | Returns 1 if the single-precision floating-point value `a' is equal to 3450 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3451 | exception. 
The comparison is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int float32_eq_quiet(float32 a, float32 b, float_status *status)
{
    uint32_t av, bv;

    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    if (((extractFloat32Exp(a) == 0xFF) && extractFloat32Frac(a))
        || ((extractFloat32Exp(b) == 0xFF) && extractFloat32Frac(b))) {
        /* Unordered: only a signaling NaN raises the invalid exception. */
        if (float32_is_signaling_nan(a, status)
            || float32_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return 0;
    }

    av = float32_val(a);
    bv = float32_val(b);
    if (av == bv) {
        return 1;
    }
    /* Zeros of opposite sign still compare equal. */
    return ((av | bv) & 0x7FFFFFFF) == 0;
}

/*----------------------------------------------------------------------------
| Returns 1 if the single-precision floating-point value `a' is less than or
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
| cause an exception.  Otherwise, the comparison is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3478 *----------------------------------------------------------------------------*/ 3479 3480 int float32_le_quiet(float32 a, float32 b, float_status *status) 3481 { 3482 flag aSign, bSign; 3483 uint32_t av, bv; 3484 a = float32_squash_input_denormal(a, status); 3485 b = float32_squash_input_denormal(b, status); 3486 3487 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3488 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3489 ) { 3490 if (float32_is_signaling_nan(a, status) 3491 || float32_is_signaling_nan(b, status)) { 3492 float_raise(float_flag_invalid, status); 3493 } 3494 return 0; 3495 } 3496 aSign = extractFloat32Sign( a ); 3497 bSign = extractFloat32Sign( b ); 3498 av = float32_val(a); 3499 bv = float32_val(b); 3500 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3501 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3502 3503 } 3504 3505 /*---------------------------------------------------------------------------- 3506 | Returns 1 if the single-precision floating-point value `a' is less than 3507 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3508 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 3509 | Standard for Binary Floating-Point Arithmetic. 
3510 *----------------------------------------------------------------------------*/ 3511 3512 int float32_lt_quiet(float32 a, float32 b, float_status *status) 3513 { 3514 flag aSign, bSign; 3515 uint32_t av, bv; 3516 a = float32_squash_input_denormal(a, status); 3517 b = float32_squash_input_denormal(b, status); 3518 3519 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3520 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3521 ) { 3522 if (float32_is_signaling_nan(a, status) 3523 || float32_is_signaling_nan(b, status)) { 3524 float_raise(float_flag_invalid, status); 3525 } 3526 return 0; 3527 } 3528 aSign = extractFloat32Sign( a ); 3529 bSign = extractFloat32Sign( b ); 3530 av = float32_val(a); 3531 bv = float32_val(b); 3532 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3533 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3534 3535 } 3536 3537 /*---------------------------------------------------------------------------- 3538 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3539 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 3540 | comparison is performed according to the IEC/IEEE Standard for Binary 3541 | Floating-Point Arithmetic. 
3542 *----------------------------------------------------------------------------*/ 3543 3544 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 3545 { 3546 a = float32_squash_input_denormal(a, status); 3547 b = float32_squash_input_denormal(b, status); 3548 3549 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3550 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3551 ) { 3552 if (float32_is_signaling_nan(a, status) 3553 || float32_is_signaling_nan(b, status)) { 3554 float_raise(float_flag_invalid, status); 3555 } 3556 return 1; 3557 } 3558 return 0; 3559 } 3560 3561 /*---------------------------------------------------------------------------- 3562 | Returns the result of converting the double-precision floating-point value 3563 | `a' to the 32-bit two's complement integer format. The conversion is 3564 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3565 | Arithmetic---which means in particular that the conversion is rounded 3566 | according to the current rounding mode. If `a' is a NaN, the largest 3567 | positive integer is returned. Otherwise, if the conversion overflows, the 3568 | largest integer with the same sign as `a' is returned. 
3569 *----------------------------------------------------------------------------*/ 3570 3571 int32_t float64_to_int32(float64 a, float_status *status) 3572 { 3573 flag aSign; 3574 int aExp; 3575 int shiftCount; 3576 uint64_t aSig; 3577 a = float64_squash_input_denormal(a, status); 3578 3579 aSig = extractFloat64Frac( a ); 3580 aExp = extractFloat64Exp( a ); 3581 aSign = extractFloat64Sign( a ); 3582 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 3583 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3584 shiftCount = 0x42C - aExp; 3585 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 3586 return roundAndPackInt32(aSign, aSig, status); 3587 3588 } 3589 3590 /*---------------------------------------------------------------------------- 3591 | Returns the result of converting the double-precision floating-point value 3592 | `a' to the 32-bit two's complement integer format. The conversion is 3593 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3594 | Arithmetic, except that the conversion is always rounded toward zero. 3595 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3596 | the conversion overflows, the largest integer with the same sign as `a' is 3597 | returned. 
3598 *----------------------------------------------------------------------------*/ 3599 3600 int32_t float64_to_int32_round_to_zero(float64 a, float_status *status) 3601 { 3602 flag aSign; 3603 int aExp; 3604 int shiftCount; 3605 uint64_t aSig, savedASig; 3606 int32_t z; 3607 a = float64_squash_input_denormal(a, status); 3608 3609 aSig = extractFloat64Frac( a ); 3610 aExp = extractFloat64Exp( a ); 3611 aSign = extractFloat64Sign( a ); 3612 if ( 0x41E < aExp ) { 3613 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 3614 goto invalid; 3615 } 3616 else if ( aExp < 0x3FF ) { 3617 if (aExp || aSig) { 3618 status->float_exception_flags |= float_flag_inexact; 3619 } 3620 return 0; 3621 } 3622 aSig |= LIT64( 0x0010000000000000 ); 3623 shiftCount = 0x433 - aExp; 3624 savedASig = aSig; 3625 aSig >>= shiftCount; 3626 z = aSig; 3627 if ( aSign ) z = - z; 3628 if ( ( z < 0 ) ^ aSign ) { 3629 invalid: 3630 float_raise(float_flag_invalid, status); 3631 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 3632 } 3633 if ( ( aSig<<shiftCount ) != savedASig ) { 3634 status->float_exception_flags |= float_flag_inexact; 3635 } 3636 return z; 3637 3638 } 3639 3640 /*---------------------------------------------------------------------------- 3641 | Returns the result of converting the double-precision floating-point value 3642 | `a' to the 16-bit two's complement integer format. The conversion is 3643 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3644 | Arithmetic, except that the conversion is always rounded toward zero. 3645 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3646 | the conversion overflows, the largest integer with the same sign as `a' is 3647 | returned. 
3648 *----------------------------------------------------------------------------*/ 3649 3650 int16_t float64_to_int16_round_to_zero(float64 a, float_status *status) 3651 { 3652 flag aSign; 3653 int aExp; 3654 int shiftCount; 3655 uint64_t aSig, savedASig; 3656 int32_t z; 3657 3658 aSig = extractFloat64Frac( a ); 3659 aExp = extractFloat64Exp( a ); 3660 aSign = extractFloat64Sign( a ); 3661 if ( 0x40E < aExp ) { 3662 if ( ( aExp == 0x7FF ) && aSig ) { 3663 aSign = 0; 3664 } 3665 goto invalid; 3666 } 3667 else if ( aExp < 0x3FF ) { 3668 if ( aExp || aSig ) { 3669 status->float_exception_flags |= float_flag_inexact; 3670 } 3671 return 0; 3672 } 3673 aSig |= LIT64( 0x0010000000000000 ); 3674 shiftCount = 0x433 - aExp; 3675 savedASig = aSig; 3676 aSig >>= shiftCount; 3677 z = aSig; 3678 if ( aSign ) { 3679 z = - z; 3680 } 3681 if ( ( (int16_t)z < 0 ) ^ aSign ) { 3682 invalid: 3683 float_raise(float_flag_invalid, status); 3684 return aSign ? (int32_t) 0xffff8000 : 0x7FFF; 3685 } 3686 if ( ( aSig<<shiftCount ) != savedASig ) { 3687 status->float_exception_flags |= float_flag_inexact; 3688 } 3689 return z; 3690 } 3691 3692 /*---------------------------------------------------------------------------- 3693 | Returns the result of converting the double-precision floating-point value 3694 | `a' to the 64-bit two's complement integer format. The conversion is 3695 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3696 | Arithmetic---which means in particular that the conversion is rounded 3697 | according to the current rounding mode. If `a' is a NaN, the largest 3698 | positive integer is returned. Otherwise, if the conversion overflows, the 3699 | largest integer with the same sign as `a' is returned. 
3700 *----------------------------------------------------------------------------*/ 3701 3702 int64_t float64_to_int64(float64 a, float_status *status) 3703 { 3704 flag aSign; 3705 int aExp; 3706 int shiftCount; 3707 uint64_t aSig, aSigExtra; 3708 a = float64_squash_input_denormal(a, status); 3709 3710 aSig = extractFloat64Frac( a ); 3711 aExp = extractFloat64Exp( a ); 3712 aSign = extractFloat64Sign( a ); 3713 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3714 shiftCount = 0x433 - aExp; 3715 if ( shiftCount <= 0 ) { 3716 if ( 0x43E < aExp ) { 3717 float_raise(float_flag_invalid, status); 3718 if ( ! aSign 3719 || ( ( aExp == 0x7FF ) 3720 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 3721 ) { 3722 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3723 } 3724 return (int64_t) LIT64( 0x8000000000000000 ); 3725 } 3726 aSigExtra = 0; 3727 aSig <<= - shiftCount; 3728 } 3729 else { 3730 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3731 } 3732 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 3733 3734 } 3735 3736 /*---------------------------------------------------------------------------- 3737 | Returns the result of converting the double-precision floating-point value 3738 | `a' to the 64-bit two's complement integer format. The conversion is 3739 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3740 | Arithmetic, except that the conversion is always rounded toward zero. 3741 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3742 | the conversion overflows, the largest integer with the same sign as `a' is 3743 | returned. 
3744 *----------------------------------------------------------------------------*/ 3745 3746 int64_t float64_to_int64_round_to_zero(float64 a, float_status *status) 3747 { 3748 flag aSign; 3749 int aExp; 3750 int shiftCount; 3751 uint64_t aSig; 3752 int64_t z; 3753 a = float64_squash_input_denormal(a, status); 3754 3755 aSig = extractFloat64Frac( a ); 3756 aExp = extractFloat64Exp( a ); 3757 aSign = extractFloat64Sign( a ); 3758 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3759 shiftCount = aExp - 0x433; 3760 if ( 0 <= shiftCount ) { 3761 if ( 0x43E <= aExp ) { 3762 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) { 3763 float_raise(float_flag_invalid, status); 3764 if ( ! aSign 3765 || ( ( aExp == 0x7FF ) 3766 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 3767 ) { 3768 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3769 } 3770 } 3771 return (int64_t) LIT64( 0x8000000000000000 ); 3772 } 3773 z = aSig<<shiftCount; 3774 } 3775 else { 3776 if ( aExp < 0x3FE ) { 3777 if (aExp | aSig) { 3778 status->float_exception_flags |= float_flag_inexact; 3779 } 3780 return 0; 3781 } 3782 z = aSig>>( - shiftCount ); 3783 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 3784 status->float_exception_flags |= float_flag_inexact; 3785 } 3786 } 3787 if ( aSign ) z = - z; 3788 return z; 3789 3790 } 3791 3792 /*---------------------------------------------------------------------------- 3793 | Returns the result of converting the double-precision floating-point value 3794 | `a' to the single-precision floating-point format. The conversion is 3795 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3796 | Arithmetic. 
3797 *----------------------------------------------------------------------------*/ 3798 3799 float32 float64_to_float32(float64 a, float_status *status) 3800 { 3801 flag aSign; 3802 int aExp; 3803 uint64_t aSig; 3804 uint32_t zSig; 3805 a = float64_squash_input_denormal(a, status); 3806 3807 aSig = extractFloat64Frac( a ); 3808 aExp = extractFloat64Exp( a ); 3809 aSign = extractFloat64Sign( a ); 3810 if ( aExp == 0x7FF ) { 3811 if (aSig) { 3812 return commonNaNToFloat32(float64ToCommonNaN(a, status), status); 3813 } 3814 return packFloat32( aSign, 0xFF, 0 ); 3815 } 3816 shift64RightJamming( aSig, 22, &aSig ); 3817 zSig = aSig; 3818 if ( aExp || zSig ) { 3819 zSig |= 0x40000000; 3820 aExp -= 0x381; 3821 } 3822 return roundAndPackFloat32(aSign, aExp, zSig, status); 3823 3824 } 3825 3826 3827 /*---------------------------------------------------------------------------- 3828 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3829 | half-precision floating-point value, returning the result. After being 3830 | shifted into the proper positions, the three fields are simply added 3831 | together to form the result. This means that any integer portion of `zSig' 3832 | will be added into the exponent. Since a properly normalized significand 3833 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3834 | than the desired result exponent whenever `zSig' is a complete, normalized 3835 | significand. 
3836 *----------------------------------------------------------------------------*/ 3837 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig) 3838 { 3839 return make_float16( 3840 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig); 3841 } 3842 3843 /*---------------------------------------------------------------------------- 3844 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3845 | and significand `zSig', and returns the proper half-precision floating- 3846 | point value corresponding to the abstract input. Ordinarily, the abstract 3847 | value is simply rounded and packed into the half-precision format, with 3848 | the inexact exception raised if the abstract input cannot be represented 3849 | exactly. However, if the abstract value is too large, the overflow and 3850 | inexact exceptions are raised and an infinity or maximal finite value is 3851 | returned. If the abstract value is too small, the input value is rounded to 3852 | a subnormal number, and the underflow and inexact exceptions are raised if 3853 | the abstract input cannot be represented exactly as a subnormal half- 3854 | precision floating-point number. 3855 | The `ieee' flag indicates whether to use IEEE standard half precision, or 3856 | ARM-style "alternative representation", which omits the NaN and Inf 3857 | encodings in order to raise the maximum representable exponent by one. 3858 | The input significand `zSig' has its binary point between bits 22 3859 | and 23, which is 13 bits to the left of the usual location. This shifted 3860 | significand must be normalized or smaller. If `zSig' is not normalized, 3861 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3862 | and it must not require rounding. In the usual case that `zSig' is 3863 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 
3864 | Note the slightly odd position of the binary point in zSig compared with the 3865 | other roundAndPackFloat functions. This should probably be fixed if we 3866 | need to implement more float16 routines than just conversion. 3867 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3868 | Binary Floating-Point Arithmetic. 3869 *----------------------------------------------------------------------------*/ 3870 3871 static float16 roundAndPackFloat16(flag zSign, int zExp, 3872 uint32_t zSig, flag ieee, 3873 float_status *status) 3874 { 3875 int maxexp = ieee ? 29 : 30; 3876 uint32_t mask; 3877 uint32_t increment; 3878 bool rounding_bumps_exp; 3879 bool is_tiny = false; 3880 3881 /* Calculate the mask of bits of the mantissa which are not 3882 * representable in half-precision and will be lost. 3883 */ 3884 if (zExp < 1) { 3885 /* Will be denormal in halfprec */ 3886 mask = 0x00ffffff; 3887 if (zExp >= -11) { 3888 mask >>= 11 + zExp; 3889 } 3890 } else { 3891 /* Normal number in halfprec */ 3892 mask = 0x00001fff; 3893 } 3894 3895 switch (status->float_rounding_mode) { 3896 case float_round_nearest_even: 3897 increment = (mask + 1) >> 1; 3898 if ((zSig & mask) == increment) { 3899 increment = zSig & (increment << 1); 3900 } 3901 break; 3902 case float_round_ties_away: 3903 increment = (mask + 1) >> 1; 3904 break; 3905 case float_round_up: 3906 increment = zSign ? 0 : mask; 3907 break; 3908 case float_round_down: 3909 increment = zSign ? 
mask : 0; 3910 break; 3911 default: /* round_to_zero */ 3912 increment = 0; 3913 break; 3914 } 3915 3916 rounding_bumps_exp = (zSig + increment >= 0x01000000); 3917 3918 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) { 3919 if (ieee) { 3920 float_raise(float_flag_overflow | float_flag_inexact, status); 3921 return packFloat16(zSign, 0x1f, 0); 3922 } else { 3923 float_raise(float_flag_invalid, status); 3924 return packFloat16(zSign, 0x1f, 0x3ff); 3925 } 3926 } 3927 3928 if (zExp < 0) { 3929 /* Note that flush-to-zero does not affect half-precision results */ 3930 is_tiny = 3931 (status->float_detect_tininess == float_tininess_before_rounding) 3932 || (zExp < -1) 3933 || (!rounding_bumps_exp); 3934 } 3935 if (zSig & mask) { 3936 float_raise(float_flag_inexact, status); 3937 if (is_tiny) { 3938 float_raise(float_flag_underflow, status); 3939 } 3940 } 3941 3942 zSig += increment; 3943 if (rounding_bumps_exp) { 3944 zSig >>= 1; 3945 zExp++; 3946 } 3947 3948 if (zExp < -10) { 3949 return packFloat16(zSign, 0, 0); 3950 } 3951 if (zExp < 0) { 3952 zSig >>= -zExp; 3953 zExp = 0; 3954 } 3955 return packFloat16(zSign, zExp, zSig >> 13); 3956 } 3957 3958 /*---------------------------------------------------------------------------- 3959 | If `a' is denormal and we are in flush-to-zero mode then set the 3960 | input-denormal exception and return zero. Otherwise just return the value. 
3961 *----------------------------------------------------------------------------*/ 3962 float16 float16_squash_input_denormal(float16 a, float_status *status) 3963 { 3964 if (status->flush_inputs_to_zero) { 3965 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) { 3966 float_raise(float_flag_input_denormal, status); 3967 return make_float16(float16_val(a) & 0x8000); 3968 } 3969 } 3970 return a; 3971 } 3972 3973 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr, 3974 uint32_t *zSigPtr) 3975 { 3976 int8_t shiftCount = countLeadingZeros32(aSig) - 21; 3977 *zSigPtr = aSig << shiftCount; 3978 *zExpPtr = 1 - shiftCount; 3979 } 3980 3981 /* Half precision floats come in two formats: standard IEEE and "ARM" format. 3982 The latter gains extra exponent range by omitting the NaN/Inf encodings. */ 3983 3984 float32 float16_to_float32(float16 a, flag ieee, float_status *status) 3985 { 3986 flag aSign; 3987 int aExp; 3988 uint32_t aSig; 3989 3990 aSign = extractFloat16Sign(a); 3991 aExp = extractFloat16Exp(a); 3992 aSig = extractFloat16Frac(a); 3993 3994 if (aExp == 0x1f && ieee) { 3995 if (aSig) { 3996 return commonNaNToFloat32(float16ToCommonNaN(a, status), status); 3997 } 3998 return packFloat32(aSign, 0xff, 0); 3999 } 4000 if (aExp == 0) { 4001 if (aSig == 0) { 4002 return packFloat32(aSign, 0, 0); 4003 } 4004 4005 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 4006 aExp--; 4007 } 4008 return packFloat32( aSign, aExp + 0x70, aSig << 13); 4009 } 4010 4011 float16 float32_to_float16(float32 a, flag ieee, float_status *status) 4012 { 4013 flag aSign; 4014 int aExp; 4015 uint32_t aSig; 4016 4017 a = float32_squash_input_denormal(a, status); 4018 4019 aSig = extractFloat32Frac( a ); 4020 aExp = extractFloat32Exp( a ); 4021 aSign = extractFloat32Sign( a ); 4022 if ( aExp == 0xFF ) { 4023 if (aSig) { 4024 /* Input is a NaN */ 4025 if (!ieee) { 4026 float_raise(float_flag_invalid, status); 4027 return packFloat16(aSign, 0, 0); 4028 } 4029 return 
commonNaNToFloat16( 4030 float32ToCommonNaN(a, status), status); 4031 } 4032 /* Infinity */ 4033 if (!ieee) { 4034 float_raise(float_flag_invalid, status); 4035 return packFloat16(aSign, 0x1f, 0x3ff); 4036 } 4037 return packFloat16(aSign, 0x1f, 0); 4038 } 4039 if (aExp == 0 && aSig == 0) { 4040 return packFloat16(aSign, 0, 0); 4041 } 4042 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 4043 * even if the input is denormal; however this is harmless because 4044 * the largest possible single-precision denormal is still smaller 4045 * than the smallest representable half-precision denormal, and so we 4046 * will end up ignoring aSig and returning via the "always return zero" 4047 * codepath. 4048 */ 4049 aSig |= 0x00800000; 4050 aExp -= 0x71; 4051 4052 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status); 4053 } 4054 4055 float64 float16_to_float64(float16 a, flag ieee, float_status *status) 4056 { 4057 flag aSign; 4058 int aExp; 4059 uint32_t aSig; 4060 4061 aSign = extractFloat16Sign(a); 4062 aExp = extractFloat16Exp(a); 4063 aSig = extractFloat16Frac(a); 4064 4065 if (aExp == 0x1f && ieee) { 4066 if (aSig) { 4067 return commonNaNToFloat64( 4068 float16ToCommonNaN(a, status), status); 4069 } 4070 return packFloat64(aSign, 0x7ff, 0); 4071 } 4072 if (aExp == 0) { 4073 if (aSig == 0) { 4074 return packFloat64(aSign, 0, 0); 4075 } 4076 4077 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 4078 aExp--; 4079 } 4080 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42); 4081 } 4082 4083 float16 float64_to_float16(float64 a, flag ieee, float_status *status) 4084 { 4085 flag aSign; 4086 int aExp; 4087 uint64_t aSig; 4088 uint32_t zSig; 4089 4090 a = float64_squash_input_denormal(a, status); 4091 4092 aSig = extractFloat64Frac(a); 4093 aExp = extractFloat64Exp(a); 4094 aSign = extractFloat64Sign(a); 4095 if (aExp == 0x7FF) { 4096 if (aSig) { 4097 /* Input is a NaN */ 4098 if (!ieee) { 4099 float_raise(float_flag_invalid, status); 4100 return 
packFloat16(aSign, 0, 0); 4101 } 4102 return commonNaNToFloat16( 4103 float64ToCommonNaN(a, status), status); 4104 } 4105 /* Infinity */ 4106 if (!ieee) { 4107 float_raise(float_flag_invalid, status); 4108 return packFloat16(aSign, 0x1f, 0x3ff); 4109 } 4110 return packFloat16(aSign, 0x1f, 0); 4111 } 4112 shift64RightJamming(aSig, 29, &aSig); 4113 zSig = aSig; 4114 if (aExp == 0 && zSig == 0) { 4115 return packFloat16(aSign, 0, 0); 4116 } 4117 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 4118 * even if the input is denormal; however this is harmless because 4119 * the largest possible single-precision denormal is still smaller 4120 * than the smallest representable half-precision denormal, and so we 4121 * will end up ignoring aSig and returning via the "always return zero" 4122 * codepath. 4123 */ 4124 zSig |= 0x00800000; 4125 aExp -= 0x3F1; 4126 4127 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status); 4128 } 4129 4130 /*---------------------------------------------------------------------------- 4131 | Returns the result of converting the double-precision floating-point value 4132 | `a' to the extended double-precision floating-point format. The conversion 4133 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4134 | Arithmetic. 
4135 *----------------------------------------------------------------------------*/ 4136 4137 floatx80 float64_to_floatx80(float64 a, float_status *status) 4138 { 4139 flag aSign; 4140 int aExp; 4141 uint64_t aSig; 4142 4143 a = float64_squash_input_denormal(a, status); 4144 aSig = extractFloat64Frac( a ); 4145 aExp = extractFloat64Exp( a ); 4146 aSign = extractFloat64Sign( a ); 4147 if ( aExp == 0x7FF ) { 4148 if (aSig) { 4149 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 4150 } 4151 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4152 } 4153 if ( aExp == 0 ) { 4154 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4155 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4156 } 4157 return 4158 packFloatx80( 4159 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 4160 4161 } 4162 4163 /*---------------------------------------------------------------------------- 4164 | Returns the result of converting the double-precision floating-point value 4165 | `a' to the quadruple-precision floating-point format. The conversion is 4166 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4167 | Arithmetic. 
4168 *----------------------------------------------------------------------------*/ 4169 4170 float128 float64_to_float128(float64 a, float_status *status) 4171 { 4172 flag aSign; 4173 int aExp; 4174 uint64_t aSig, zSig0, zSig1; 4175 4176 a = float64_squash_input_denormal(a, status); 4177 aSig = extractFloat64Frac( a ); 4178 aExp = extractFloat64Exp( a ); 4179 aSign = extractFloat64Sign( a ); 4180 if ( aExp == 0x7FF ) { 4181 if (aSig) { 4182 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 4183 } 4184 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4185 } 4186 if ( aExp == 0 ) { 4187 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4188 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4189 --aExp; 4190 } 4191 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 4192 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 4193 4194 } 4195 4196 4197 /*---------------------------------------------------------------------------- 4198 | Returns the remainder of the double-precision floating-point value `a' 4199 | with respect to the corresponding value `b'. The operation is performed 4200 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4201 *----------------------------------------------------------------------------*/ 4202 4203 float64 float64_rem(float64 a, float64 b, float_status *status) 4204 { 4205 flag aSign, zSign; 4206 int aExp, bExp, expDiff; 4207 uint64_t aSig, bSig; 4208 uint64_t q, alternateASig; 4209 int64_t sigMean; 4210 4211 a = float64_squash_input_denormal(a, status); 4212 b = float64_squash_input_denormal(b, status); 4213 aSig = extractFloat64Frac( a ); 4214 aExp = extractFloat64Exp( a ); 4215 aSign = extractFloat64Sign( a ); 4216 bSig = extractFloat64Frac( b ); 4217 bExp = extractFloat64Exp( b ); 4218 if ( aExp == 0x7FF ) { 4219 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4220 return propagateFloat64NaN(a, b, status); 4221 } 4222 float_raise(float_flag_invalid, status); 4223 return float64_default_nan(status); 4224 } 4225 if ( bExp == 0x7FF ) { 4226 if (bSig) { 4227 return propagateFloat64NaN(a, b, status); 4228 } 4229 return a; 4230 } 4231 if ( bExp == 0 ) { 4232 if ( bSig == 0 ) { 4233 float_raise(float_flag_invalid, status); 4234 return float64_default_nan(status); 4235 } 4236 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4237 } 4238 if ( aExp == 0 ) { 4239 if ( aSig == 0 ) return a; 4240 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4241 } 4242 expDiff = aExp - bExp; 4243 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 4244 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4245 if ( expDiff < 0 ) { 4246 if ( expDiff < -1 ) return a; 4247 aSig >>= 1; 4248 } 4249 q = ( bSig <= aSig ); 4250 if ( q ) aSig -= bSig; 4251 expDiff -= 64; 4252 while ( 0 < expDiff ) { 4253 q = estimateDiv128To64( aSig, 0, bSig ); 4254 q = ( 2 < q ) ? q - 2 : 0; 4255 aSig = - ( ( bSig>>2 ) * q ); 4256 expDiff -= 62; 4257 } 4258 expDiff += 64; 4259 if ( 0 < expDiff ) { 4260 q = estimateDiv128To64( aSig, 0, bSig ); 4261 q = ( 2 < q ) ? 
q - 2 : 0; 4262 q >>= 64 - expDiff; 4263 bSig >>= 2; 4264 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4265 } 4266 else { 4267 aSig >>= 2; 4268 bSig >>= 2; 4269 } 4270 do { 4271 alternateASig = aSig; 4272 ++q; 4273 aSig -= bSig; 4274 } while ( 0 <= (int64_t) aSig ); 4275 sigMean = aSig + alternateASig; 4276 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4277 aSig = alternateASig; 4278 } 4279 zSign = ( (int64_t) aSig < 0 ); 4280 if ( zSign ) aSig = - aSig; 4281 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 4282 4283 } 4284 4285 4286 /*---------------------------------------------------------------------------- 4287 | Returns the square root of the double-precision floating-point value `a'. 4288 | The operation is performed according to the IEC/IEEE Standard for Binary 4289 | Floating-Point Arithmetic. 4290 *----------------------------------------------------------------------------*/ 4291 4292 float64 float64_sqrt(float64 a, float_status *status) 4293 { 4294 flag aSign; 4295 int aExp, zExp; 4296 uint64_t aSig, zSig, doubleZSig; 4297 uint64_t rem0, rem1, term0, term1; 4298 a = float64_squash_input_denormal(a, status); 4299 4300 aSig = extractFloat64Frac( a ); 4301 aExp = extractFloat64Exp( a ); 4302 aSign = extractFloat64Sign( a ); 4303 if ( aExp == 0x7FF ) { 4304 if (aSig) { 4305 return propagateFloat64NaN(a, a, status); 4306 } 4307 if ( ! 
aSign ) return a; 4308 float_raise(float_flag_invalid, status); 4309 return float64_default_nan(status); 4310 } 4311 if ( aSign ) { 4312 if ( ( aExp | aSig ) == 0 ) return a; 4313 float_raise(float_flag_invalid, status); 4314 return float64_default_nan(status); 4315 } 4316 if ( aExp == 0 ) { 4317 if ( aSig == 0 ) return float64_zero; 4318 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4319 } 4320 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 4321 aSig |= LIT64( 0x0010000000000000 ); 4322 zSig = estimateSqrt32( aExp, aSig>>21 ); 4323 aSig <<= 9 - ( aExp & 1 ); 4324 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 4325 if ( ( zSig & 0x1FF ) <= 5 ) { 4326 doubleZSig = zSig<<1; 4327 mul64To128( zSig, zSig, &term0, &term1 ); 4328 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 4329 while ( (int64_t) rem0 < 0 ) { 4330 --zSig; 4331 doubleZSig -= 2; 4332 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 4333 } 4334 zSig |= ( ( rem0 | rem1 ) != 0 ); 4335 } 4336 return roundAndPackFloat64(0, zExp, zSig, status); 4337 4338 } 4339 4340 /*---------------------------------------------------------------------------- 4341 | Returns the binary log of the double-precision floating-point value `a'. 4342 | The operation is performed according to the IEC/IEEE Standard for Binary 4343 | Floating-Point Arithmetic. 
4344 *----------------------------------------------------------------------------*/ 4345 float64 float64_log2(float64 a, float_status *status) 4346 { 4347 flag aSign, zSign; 4348 int aExp; 4349 uint64_t aSig, aSig0, aSig1, zSig, i; 4350 a = float64_squash_input_denormal(a, status); 4351 4352 aSig = extractFloat64Frac( a ); 4353 aExp = extractFloat64Exp( a ); 4354 aSign = extractFloat64Sign( a ); 4355 4356 if ( aExp == 0 ) { 4357 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4358 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4359 } 4360 if ( aSign ) { 4361 float_raise(float_flag_invalid, status); 4362 return float64_default_nan(status); 4363 } 4364 if ( aExp == 0x7FF ) { 4365 if (aSig) { 4366 return propagateFloat64NaN(a, float64_zero, status); 4367 } 4368 return a; 4369 } 4370 4371 aExp -= 0x3FF; 4372 aSig |= LIT64( 0x0010000000000000 ); 4373 zSign = aExp < 0; 4374 zSig = (uint64_t)aExp << 52; 4375 for (i = 1LL << 51; i > 0; i >>= 1) { 4376 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4377 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4378 if ( aSig & LIT64( 0x0020000000000000 ) ) { 4379 aSig >>= 1; 4380 zSig |= i; 4381 } 4382 } 4383 4384 if ( zSign ) 4385 zSig = -zSig; 4386 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 4387 } 4388 4389 /*---------------------------------------------------------------------------- 4390 | Returns 1 if the double-precision floating-point value `a' is equal to the 4391 | corresponding value `b', and 0 otherwise. The invalid exception is raised 4392 | if either operand is a NaN. Otherwise, the comparison is performed 4393 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4394 *----------------------------------------------------------------------------*/ 4395 4396 int float64_eq(float64 a, float64 b, float_status *status) 4397 { 4398 uint64_t av, bv; 4399 a = float64_squash_input_denormal(a, status); 4400 b = float64_squash_input_denormal(b, status); 4401 4402 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4403 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4404 ) { 4405 float_raise(float_flag_invalid, status); 4406 return 0; 4407 } 4408 av = float64_val(a); 4409 bv = float64_val(b); 4410 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4411 4412 } 4413 4414 /*---------------------------------------------------------------------------- 4415 | Returns 1 if the double-precision floating-point value `a' is less than or 4416 | equal to the corresponding value `b', and 0 otherwise. The invalid 4417 | exception is raised if either operand is a NaN. The comparison is performed 4418 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4419 *----------------------------------------------------------------------------*/ 4420 4421 int float64_le(float64 a, float64 b, float_status *status) 4422 { 4423 flag aSign, bSign; 4424 uint64_t av, bv; 4425 a = float64_squash_input_denormal(a, status); 4426 b = float64_squash_input_denormal(b, status); 4427 4428 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4429 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4430 ) { 4431 float_raise(float_flag_invalid, status); 4432 return 0; 4433 } 4434 aSign = extractFloat64Sign( a ); 4435 bSign = extractFloat64Sign( b ); 4436 av = float64_val(a); 4437 bv = float64_val(b); 4438 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4439 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4440 4441 } 4442 4443 /*---------------------------------------------------------------------------- 4444 | Returns 1 if the double-precision floating-point value `a' is less than 4445 | the corresponding value `b', and 0 otherwise. The invalid exception is 4446 | raised if either operand is a NaN. The comparison is performed according 4447 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4448 *----------------------------------------------------------------------------*/ 4449 4450 int float64_lt(float64 a, float64 b, float_status *status) 4451 { 4452 flag aSign, bSign; 4453 uint64_t av, bv; 4454 4455 a = float64_squash_input_denormal(a, status); 4456 b = float64_squash_input_denormal(b, status); 4457 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4458 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4459 ) { 4460 float_raise(float_flag_invalid, status); 4461 return 0; 4462 } 4463 aSign = extractFloat64Sign( a ); 4464 bSign = extractFloat64Sign( b ); 4465 av = float64_val(a); 4466 bv = float64_val(b); 4467 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4468 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4469 4470 } 4471 4472 /*---------------------------------------------------------------------------- 4473 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4474 | be compared, and 0 otherwise. The invalid exception is raised if either 4475 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4476 | Standard for Binary Floating-Point Arithmetic. 4477 *----------------------------------------------------------------------------*/ 4478 4479 int float64_unordered(float64 a, float64 b, float_status *status) 4480 { 4481 a = float64_squash_input_denormal(a, status); 4482 b = float64_squash_input_denormal(b, status); 4483 4484 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4485 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4486 ) { 4487 float_raise(float_flag_invalid, status); 4488 return 1; 4489 } 4490 return 0; 4491 } 4492 4493 /*---------------------------------------------------------------------------- 4494 | Returns 1 if the double-precision floating-point value `a' is equal to the 4495 | corresponding value `b', and 0 otherwise. 
Quiet NaNs do not cause an 4496 | exception.The comparison is performed according to the IEC/IEEE Standard 4497 | for Binary Floating-Point Arithmetic. 4498 *----------------------------------------------------------------------------*/ 4499 4500 int float64_eq_quiet(float64 a, float64 b, float_status *status) 4501 { 4502 uint64_t av, bv; 4503 a = float64_squash_input_denormal(a, status); 4504 b = float64_squash_input_denormal(b, status); 4505 4506 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4507 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4508 ) { 4509 if (float64_is_signaling_nan(a, status) 4510 || float64_is_signaling_nan(b, status)) { 4511 float_raise(float_flag_invalid, status); 4512 } 4513 return 0; 4514 } 4515 av = float64_val(a); 4516 bv = float64_val(b); 4517 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4518 4519 } 4520 4521 /*---------------------------------------------------------------------------- 4522 | Returns 1 if the double-precision floating-point value `a' is less than or 4523 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4524 | cause an exception. Otherwise, the comparison is performed according to the 4525 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4526 *----------------------------------------------------------------------------*/ 4527 4528 int float64_le_quiet(float64 a, float64 b, float_status *status) 4529 { 4530 flag aSign, bSign; 4531 uint64_t av, bv; 4532 a = float64_squash_input_denormal(a, status); 4533 b = float64_squash_input_denormal(b, status); 4534 4535 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4536 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4537 ) { 4538 if (float64_is_signaling_nan(a, status) 4539 || float64_is_signaling_nan(b, status)) { 4540 float_raise(float_flag_invalid, status); 4541 } 4542 return 0; 4543 } 4544 aSign = extractFloat64Sign( a ); 4545 bSign = extractFloat64Sign( b ); 4546 av = float64_val(a); 4547 bv = float64_val(b); 4548 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4549 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4550 4551 } 4552 4553 /*---------------------------------------------------------------------------- 4554 | Returns 1 if the double-precision floating-point value `a' is less than 4555 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4556 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4557 | Standard for Binary Floating-Point Arithmetic. 
4558 *----------------------------------------------------------------------------*/ 4559 4560 int float64_lt_quiet(float64 a, float64 b, float_status *status) 4561 { 4562 flag aSign, bSign; 4563 uint64_t av, bv; 4564 a = float64_squash_input_denormal(a, status); 4565 b = float64_squash_input_denormal(b, status); 4566 4567 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4568 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4569 ) { 4570 if (float64_is_signaling_nan(a, status) 4571 || float64_is_signaling_nan(b, status)) { 4572 float_raise(float_flag_invalid, status); 4573 } 4574 return 0; 4575 } 4576 aSign = extractFloat64Sign( a ); 4577 bSign = extractFloat64Sign( b ); 4578 av = float64_val(a); 4579 bv = float64_val(b); 4580 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4581 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4582 4583 } 4584 4585 /*---------------------------------------------------------------------------- 4586 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4587 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4588 | comparison is performed according to the IEC/IEEE Standard for Binary 4589 | Floating-Point Arithmetic. 
4590 *----------------------------------------------------------------------------*/ 4591 4592 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 4593 { 4594 a = float64_squash_input_denormal(a, status); 4595 b = float64_squash_input_denormal(b, status); 4596 4597 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4598 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4599 ) { 4600 if (float64_is_signaling_nan(a, status) 4601 || float64_is_signaling_nan(b, status)) { 4602 float_raise(float_flag_invalid, status); 4603 } 4604 return 1; 4605 } 4606 return 0; 4607 } 4608 4609 /*---------------------------------------------------------------------------- 4610 | Returns the result of converting the extended double-precision floating- 4611 | point value `a' to the 32-bit two's complement integer format. The 4612 | conversion is performed according to the IEC/IEEE Standard for Binary 4613 | Floating-Point Arithmetic---which means in particular that the conversion 4614 | is rounded according to the current rounding mode. If `a' is a NaN, the 4615 | largest positive integer is returned. Otherwise, if the conversion 4616 | overflows, the largest integer with the same sign as `a' is returned. 
4617 *----------------------------------------------------------------------------*/ 4618 4619 int32_t floatx80_to_int32(floatx80 a, float_status *status) 4620 { 4621 flag aSign; 4622 int32_t aExp, shiftCount; 4623 uint64_t aSig; 4624 4625 if (floatx80_invalid_encoding(a)) { 4626 float_raise(float_flag_invalid, status); 4627 return 1 << 31; 4628 } 4629 aSig = extractFloatx80Frac( a ); 4630 aExp = extractFloatx80Exp( a ); 4631 aSign = extractFloatx80Sign( a ); 4632 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4633 shiftCount = 0x4037 - aExp; 4634 if ( shiftCount <= 0 ) shiftCount = 1; 4635 shift64RightJamming( aSig, shiftCount, &aSig ); 4636 return roundAndPackInt32(aSign, aSig, status); 4637 4638 } 4639 4640 /*---------------------------------------------------------------------------- 4641 | Returns the result of converting the extended double-precision floating- 4642 | point value `a' to the 32-bit two's complement integer format. The 4643 | conversion is performed according to the IEC/IEEE Standard for Binary 4644 | Floating-Point Arithmetic, except that the conversion is always rounded 4645 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4646 | Otherwise, if the conversion overflows, the largest integer with the same 4647 | sign as `a' is returned. 
4648 *----------------------------------------------------------------------------*/ 4649 4650 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 4651 { 4652 flag aSign; 4653 int32_t aExp, shiftCount; 4654 uint64_t aSig, savedASig; 4655 int32_t z; 4656 4657 if (floatx80_invalid_encoding(a)) { 4658 float_raise(float_flag_invalid, status); 4659 return 1 << 31; 4660 } 4661 aSig = extractFloatx80Frac( a ); 4662 aExp = extractFloatx80Exp( a ); 4663 aSign = extractFloatx80Sign( a ); 4664 if ( 0x401E < aExp ) { 4665 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4666 goto invalid; 4667 } 4668 else if ( aExp < 0x3FFF ) { 4669 if (aExp || aSig) { 4670 status->float_exception_flags |= float_flag_inexact; 4671 } 4672 return 0; 4673 } 4674 shiftCount = 0x403E - aExp; 4675 savedASig = aSig; 4676 aSig >>= shiftCount; 4677 z = aSig; 4678 if ( aSign ) z = - z; 4679 if ( ( z < 0 ) ^ aSign ) { 4680 invalid: 4681 float_raise(float_flag_invalid, status); 4682 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 4683 } 4684 if ( ( aSig<<shiftCount ) != savedASig ) { 4685 status->float_exception_flags |= float_flag_inexact; 4686 } 4687 return z; 4688 4689 } 4690 4691 /*---------------------------------------------------------------------------- 4692 | Returns the result of converting the extended double-precision floating- 4693 | point value `a' to the 64-bit two's complement integer format. The 4694 | conversion is performed according to the IEC/IEEE Standard for Binary 4695 | Floating-Point Arithmetic---which means in particular that the conversion 4696 | is rounded according to the current rounding mode. If `a' is a NaN, 4697 | the largest positive integer is returned. Otherwise, if the conversion 4698 | overflows, the largest integer with the same sign as `a' is returned. 
4699 *----------------------------------------------------------------------------*/ 4700 4701 int64_t floatx80_to_int64(floatx80 a, float_status *status) 4702 { 4703 flag aSign; 4704 int32_t aExp, shiftCount; 4705 uint64_t aSig, aSigExtra; 4706 4707 if (floatx80_invalid_encoding(a)) { 4708 float_raise(float_flag_invalid, status); 4709 return 1ULL << 63; 4710 } 4711 aSig = extractFloatx80Frac( a ); 4712 aExp = extractFloatx80Exp( a ); 4713 aSign = extractFloatx80Sign( a ); 4714 shiftCount = 0x403E - aExp; 4715 if ( shiftCount <= 0 ) { 4716 if ( shiftCount ) { 4717 float_raise(float_flag_invalid, status); 4718 if ( ! aSign 4719 || ( ( aExp == 0x7FFF ) 4720 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 4721 ) { 4722 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4723 } 4724 return (int64_t) LIT64( 0x8000000000000000 ); 4725 } 4726 aSigExtra = 0; 4727 } 4728 else { 4729 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 4730 } 4731 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 4732 4733 } 4734 4735 /*---------------------------------------------------------------------------- 4736 | Returns the result of converting the extended double-precision floating- 4737 | point value `a' to the 64-bit two's complement integer format. The 4738 | conversion is performed according to the IEC/IEEE Standard for Binary 4739 | Floating-Point Arithmetic, except that the conversion is always rounded 4740 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4741 | Otherwise, if the conversion overflows, the largest integer with the same 4742 | sign as `a' is returned. 
4743 *----------------------------------------------------------------------------*/ 4744 4745 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 4746 { 4747 flag aSign; 4748 int32_t aExp, shiftCount; 4749 uint64_t aSig; 4750 int64_t z; 4751 4752 if (floatx80_invalid_encoding(a)) { 4753 float_raise(float_flag_invalid, status); 4754 return 1ULL << 63; 4755 } 4756 aSig = extractFloatx80Frac( a ); 4757 aExp = extractFloatx80Exp( a ); 4758 aSign = extractFloatx80Sign( a ); 4759 shiftCount = aExp - 0x403E; 4760 if ( 0 <= shiftCount ) { 4761 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 4762 if ( ( a.high != 0xC03E ) || aSig ) { 4763 float_raise(float_flag_invalid, status); 4764 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 4765 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4766 } 4767 } 4768 return (int64_t) LIT64( 0x8000000000000000 ); 4769 } 4770 else if ( aExp < 0x3FFF ) { 4771 if (aExp | aSig) { 4772 status->float_exception_flags |= float_flag_inexact; 4773 } 4774 return 0; 4775 } 4776 z = aSig>>( - shiftCount ); 4777 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 4778 status->float_exception_flags |= float_flag_inexact; 4779 } 4780 if ( aSign ) z = - z; 4781 return z; 4782 4783 } 4784 4785 /*---------------------------------------------------------------------------- 4786 | Returns the result of converting the extended double-precision floating- 4787 | point value `a' to the single-precision floating-point format. The 4788 | conversion is performed according to the IEC/IEEE Standard for Binary 4789 | Floating-Point Arithmetic. 
4790 *----------------------------------------------------------------------------*/ 4791 4792 float32 floatx80_to_float32(floatx80 a, float_status *status) 4793 { 4794 flag aSign; 4795 int32_t aExp; 4796 uint64_t aSig; 4797 4798 if (floatx80_invalid_encoding(a)) { 4799 float_raise(float_flag_invalid, status); 4800 return float32_default_nan(status); 4801 } 4802 aSig = extractFloatx80Frac( a ); 4803 aExp = extractFloatx80Exp( a ); 4804 aSign = extractFloatx80Sign( a ); 4805 if ( aExp == 0x7FFF ) { 4806 if ( (uint64_t) ( aSig<<1 ) ) { 4807 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 4808 } 4809 return packFloat32( aSign, 0xFF, 0 ); 4810 } 4811 shift64RightJamming( aSig, 33, &aSig ); 4812 if ( aExp || aSig ) aExp -= 0x3F81; 4813 return roundAndPackFloat32(aSign, aExp, aSig, status); 4814 4815 } 4816 4817 /*---------------------------------------------------------------------------- 4818 | Returns the result of converting the extended double-precision floating- 4819 | point value `a' to the double-precision floating-point format. The 4820 | conversion is performed according to the IEC/IEEE Standard for Binary 4821 | Floating-Point Arithmetic. 
4822 *----------------------------------------------------------------------------*/ 4823 4824 float64 floatx80_to_float64(floatx80 a, float_status *status) 4825 { 4826 flag aSign; 4827 int32_t aExp; 4828 uint64_t aSig, zSig; 4829 4830 if (floatx80_invalid_encoding(a)) { 4831 float_raise(float_flag_invalid, status); 4832 return float64_default_nan(status); 4833 } 4834 aSig = extractFloatx80Frac( a ); 4835 aExp = extractFloatx80Exp( a ); 4836 aSign = extractFloatx80Sign( a ); 4837 if ( aExp == 0x7FFF ) { 4838 if ( (uint64_t) ( aSig<<1 ) ) { 4839 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 4840 } 4841 return packFloat64( aSign, 0x7FF, 0 ); 4842 } 4843 shift64RightJamming( aSig, 1, &zSig ); 4844 if ( aExp || aSig ) aExp -= 0x3C01; 4845 return roundAndPackFloat64(aSign, aExp, zSig, status); 4846 4847 } 4848 4849 /*---------------------------------------------------------------------------- 4850 | Returns the result of converting the extended double-precision floating- 4851 | point value `a' to the quadruple-precision floating-point format. The 4852 | conversion is performed according to the IEC/IEEE Standard for Binary 4853 | Floating-Point Arithmetic. 
4854 *----------------------------------------------------------------------------*/ 4855 4856 float128 floatx80_to_float128(floatx80 a, float_status *status) 4857 { 4858 flag aSign; 4859 int aExp; 4860 uint64_t aSig, zSig0, zSig1; 4861 4862 if (floatx80_invalid_encoding(a)) { 4863 float_raise(float_flag_invalid, status); 4864 return float128_default_nan(status); 4865 } 4866 aSig = extractFloatx80Frac( a ); 4867 aExp = extractFloatx80Exp( a ); 4868 aSign = extractFloatx80Sign( a ); 4869 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 4870 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 4871 } 4872 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 4873 return packFloat128( aSign, aExp, zSig0, zSig1 ); 4874 4875 } 4876 4877 /*---------------------------------------------------------------------------- 4878 | Rounds the extended double-precision floating-point value `a' 4879 | to the precision provided by floatx80_rounding_precision and returns the 4880 | result as an extended double-precision floating-point value. 4881 | The operation is performed according to the IEC/IEEE Standard for Binary 4882 | Floating-Point Arithmetic. 4883 *----------------------------------------------------------------------------*/ 4884 4885 floatx80 floatx80_round(floatx80 a, float_status *status) 4886 { 4887 return roundAndPackFloatx80(status->floatx80_rounding_precision, 4888 extractFloatx80Sign(a), 4889 extractFloatx80Exp(a), 4890 extractFloatx80Frac(a), 0, status); 4891 } 4892 4893 /*---------------------------------------------------------------------------- 4894 | Rounds the extended double-precision floating-point value `a' to an integer, 4895 | and returns the result as an extended quadruple-precision floating-point 4896 | value. The operation is performed according to the IEC/IEEE Standard for 4897 | Binary Floating-Point Arithmetic. 
4898 *----------------------------------------------------------------------------*/ 4899 4900 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 4901 { 4902 flag aSign; 4903 int32_t aExp; 4904 uint64_t lastBitMask, roundBitsMask; 4905 floatx80 z; 4906 4907 if (floatx80_invalid_encoding(a)) { 4908 float_raise(float_flag_invalid, status); 4909 return floatx80_default_nan(status); 4910 } 4911 aExp = extractFloatx80Exp( a ); 4912 if ( 0x403E <= aExp ) { 4913 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 4914 return propagateFloatx80NaN(a, a, status); 4915 } 4916 return a; 4917 } 4918 if ( aExp < 0x3FFF ) { 4919 if ( ( aExp == 0 ) 4920 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 4921 return a; 4922 } 4923 status->float_exception_flags |= float_flag_inexact; 4924 aSign = extractFloatx80Sign( a ); 4925 switch (status->float_rounding_mode) { 4926 case float_round_nearest_even: 4927 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 4928 ) { 4929 return 4930 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4931 } 4932 break; 4933 case float_round_ties_away: 4934 if (aExp == 0x3FFE) { 4935 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 4936 } 4937 break; 4938 case float_round_down: 4939 return 4940 aSign ? 4941 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 4942 : packFloatx80( 0, 0, 0 ); 4943 case float_round_up: 4944 return 4945 aSign ? 
packFloatx80( 1, 0, 0 ) 4946 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4947 } 4948 return packFloatx80( aSign, 0, 0 ); 4949 } 4950 lastBitMask = 1; 4951 lastBitMask <<= 0x403E - aExp; 4952 roundBitsMask = lastBitMask - 1; 4953 z = a; 4954 switch (status->float_rounding_mode) { 4955 case float_round_nearest_even: 4956 z.low += lastBitMask>>1; 4957 if ((z.low & roundBitsMask) == 0) { 4958 z.low &= ~lastBitMask; 4959 } 4960 break; 4961 case float_round_ties_away: 4962 z.low += lastBitMask >> 1; 4963 break; 4964 case float_round_to_zero: 4965 break; 4966 case float_round_up: 4967 if (!extractFloatx80Sign(z)) { 4968 z.low += roundBitsMask; 4969 } 4970 break; 4971 case float_round_down: 4972 if (extractFloatx80Sign(z)) { 4973 z.low += roundBitsMask; 4974 } 4975 break; 4976 default: 4977 abort(); 4978 } 4979 z.low &= ~ roundBitsMask; 4980 if ( z.low == 0 ) { 4981 ++z.high; 4982 z.low = LIT64( 0x8000000000000000 ); 4983 } 4984 if (z.low != a.low) { 4985 status->float_exception_flags |= float_flag_inexact; 4986 } 4987 return z; 4988 4989 } 4990 4991 /*---------------------------------------------------------------------------- 4992 | Returns the result of adding the absolute values of the extended double- 4993 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 4994 | negated before being returned. `zSign' is ignored if the result is a NaN. 4995 | The addition is performed according to the IEC/IEEE Standard for Binary 4996 | Floating-Point Arithmetic. 
4997 *----------------------------------------------------------------------------*/ 4998 4999 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5000 float_status *status) 5001 { 5002 int32_t aExp, bExp, zExp; 5003 uint64_t aSig, bSig, zSig0, zSig1; 5004 int32_t expDiff; 5005 5006 aSig = extractFloatx80Frac( a ); 5007 aExp = extractFloatx80Exp( a ); 5008 bSig = extractFloatx80Frac( b ); 5009 bExp = extractFloatx80Exp( b ); 5010 expDiff = aExp - bExp; 5011 if ( 0 < expDiff ) { 5012 if ( aExp == 0x7FFF ) { 5013 if ((uint64_t)(aSig << 1)) { 5014 return propagateFloatx80NaN(a, b, status); 5015 } 5016 return a; 5017 } 5018 if ( bExp == 0 ) --expDiff; 5019 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5020 zExp = aExp; 5021 } 5022 else if ( expDiff < 0 ) { 5023 if ( bExp == 0x7FFF ) { 5024 if ((uint64_t)(bSig << 1)) { 5025 return propagateFloatx80NaN(a, b, status); 5026 } 5027 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5028 } 5029 if ( aExp == 0 ) ++expDiff; 5030 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5031 zExp = bExp; 5032 } 5033 else { 5034 if ( aExp == 0x7FFF ) { 5035 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5036 return propagateFloatx80NaN(a, b, status); 5037 } 5038 return a; 5039 } 5040 zSig1 = 0; 5041 zSig0 = aSig + bSig; 5042 if ( aExp == 0 ) { 5043 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 5044 goto roundAndPack; 5045 } 5046 zExp = aExp; 5047 goto shiftRight1; 5048 } 5049 zSig0 = aSig + bSig; 5050 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 5051 shiftRight1: 5052 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5053 zSig0 |= LIT64( 0x8000000000000000 ); 5054 ++zExp; 5055 roundAndPack: 5056 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5057 zSign, zExp, zSig0, zSig1, status); 5058 } 5059 5060 /*---------------------------------------------------------------------------- 5061 | Returns the result of subtracting the absolute values of the 
extended 5062 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 5063 | difference is negated before being returned. `zSign' is ignored if the 5064 | result is a NaN. The subtraction is performed according to the IEC/IEEE 5065 | Standard for Binary Floating-Point Arithmetic. 5066 *----------------------------------------------------------------------------*/ 5067 5068 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5069 float_status *status) 5070 { 5071 int32_t aExp, bExp, zExp; 5072 uint64_t aSig, bSig, zSig0, zSig1; 5073 int32_t expDiff; 5074 5075 aSig = extractFloatx80Frac( a ); 5076 aExp = extractFloatx80Exp( a ); 5077 bSig = extractFloatx80Frac( b ); 5078 bExp = extractFloatx80Exp( b ); 5079 expDiff = aExp - bExp; 5080 if ( 0 < expDiff ) goto aExpBigger; 5081 if ( expDiff < 0 ) goto bExpBigger; 5082 if ( aExp == 0x7FFF ) { 5083 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5084 return propagateFloatx80NaN(a, b, status); 5085 } 5086 float_raise(float_flag_invalid, status); 5087 return floatx80_default_nan(status); 5088 } 5089 if ( aExp == 0 ) { 5090 aExp = 1; 5091 bExp = 1; 5092 } 5093 zSig1 = 0; 5094 if ( bSig < aSig ) goto aBigger; 5095 if ( aSig < bSig ) goto bBigger; 5096 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 5097 bExpBigger: 5098 if ( bExp == 0x7FFF ) { 5099 if ((uint64_t)(bSig << 1)) { 5100 return propagateFloatx80NaN(a, b, status); 5101 } 5102 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5103 } 5104 if ( aExp == 0 ) ++expDiff; 5105 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5106 bBigger: 5107 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 5108 zExp = bExp; 5109 zSign ^= 1; 5110 goto normalizeRoundAndPack; 5111 aExpBigger: 5112 if ( aExp == 0x7FFF ) { 5113 if ((uint64_t)(aSig << 1)) { 5114 return propagateFloatx80NaN(a, b, status); 5115 } 5116 return a; 5117 } 5118 if ( bExp == 0 ) --expDiff; 5119 shift128RightJamming( bSig, 0, 
expDiff, &bSig, &zSig1 ); 5120 aBigger: 5121 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 5122 zExp = aExp; 5123 normalizeRoundAndPack: 5124 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 5125 zSign, zExp, zSig0, zSig1, status); 5126 } 5127 5128 /*---------------------------------------------------------------------------- 5129 | Returns the result of adding the extended double-precision floating-point 5130 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5131 | Standard for Binary Floating-Point Arithmetic. 5132 *----------------------------------------------------------------------------*/ 5133 5134 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 5135 { 5136 flag aSign, bSign; 5137 5138 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5139 float_raise(float_flag_invalid, status); 5140 return floatx80_default_nan(status); 5141 } 5142 aSign = extractFloatx80Sign( a ); 5143 bSign = extractFloatx80Sign( b ); 5144 if ( aSign == bSign ) { 5145 return addFloatx80Sigs(a, b, aSign, status); 5146 } 5147 else { 5148 return subFloatx80Sigs(a, b, aSign, status); 5149 } 5150 5151 } 5152 5153 /*---------------------------------------------------------------------------- 5154 | Returns the result of subtracting the extended double-precision floating- 5155 | point values `a' and `b'. The operation is performed according to the 5156 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5157 *----------------------------------------------------------------------------*/ 5158 5159 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5160 { 5161 flag aSign, bSign; 5162 5163 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5164 float_raise(float_flag_invalid, status); 5165 return floatx80_default_nan(status); 5166 } 5167 aSign = extractFloatx80Sign( a ); 5168 bSign = extractFloatx80Sign( b ); 5169 if ( aSign == bSign ) { 5170 return subFloatx80Sigs(a, b, aSign, status); 5171 } 5172 else { 5173 return addFloatx80Sigs(a, b, aSign, status); 5174 } 5175 5176 } 5177 5178 /*---------------------------------------------------------------------------- 5179 | Returns the result of multiplying the extended double-precision floating- 5180 | point values `a' and `b'. The operation is performed according to the 5181 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5182 *----------------------------------------------------------------------------*/ 5183 5184 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5185 { 5186 flag aSign, bSign, zSign; 5187 int32_t aExp, bExp, zExp; 5188 uint64_t aSig, bSig, zSig0, zSig1; 5189 5190 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5191 float_raise(float_flag_invalid, status); 5192 return floatx80_default_nan(status); 5193 } 5194 aSig = extractFloatx80Frac( a ); 5195 aExp = extractFloatx80Exp( a ); 5196 aSign = extractFloatx80Sign( a ); 5197 bSig = extractFloatx80Frac( b ); 5198 bExp = extractFloatx80Exp( b ); 5199 bSign = extractFloatx80Sign( b ); 5200 zSign = aSign ^ bSign; 5201 if ( aExp == 0x7FFF ) { 5202 if ( (uint64_t) ( aSig<<1 ) 5203 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5204 return propagateFloatx80NaN(a, b, status); 5205 } 5206 if ( ( bExp | bSig ) == 0 ) goto invalid; 5207 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5208 } 5209 if ( bExp == 0x7FFF ) { 5210 if ((uint64_t)(bSig << 1)) { 5211 
return propagateFloatx80NaN(a, b, status); 5212 } 5213 if ( ( aExp | aSig ) == 0 ) { 5214 invalid: 5215 float_raise(float_flag_invalid, status); 5216 return floatx80_default_nan(status); 5217 } 5218 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5219 } 5220 if ( aExp == 0 ) { 5221 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5222 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5223 } 5224 if ( bExp == 0 ) { 5225 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5226 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5227 } 5228 zExp = aExp + bExp - 0x3FFE; 5229 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 5230 if ( 0 < (int64_t) zSig0 ) { 5231 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5232 --zExp; 5233 } 5234 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5235 zSign, zExp, zSig0, zSig1, status); 5236 } 5237 5238 /*---------------------------------------------------------------------------- 5239 | Returns the result of dividing the extended double-precision floating-point 5240 | value `a' by the corresponding value `b'. The operation is performed 5241 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5242 *----------------------------------------------------------------------------*/ 5243 5244 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 5245 { 5246 flag aSign, bSign, zSign; 5247 int32_t aExp, bExp, zExp; 5248 uint64_t aSig, bSig, zSig0, zSig1; 5249 uint64_t rem0, rem1, rem2, term0, term1, term2; 5250 5251 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5252 float_raise(float_flag_invalid, status); 5253 return floatx80_default_nan(status); 5254 } 5255 aSig = extractFloatx80Frac( a ); 5256 aExp = extractFloatx80Exp( a ); 5257 aSign = extractFloatx80Sign( a ); 5258 bSig = extractFloatx80Frac( b ); 5259 bExp = extractFloatx80Exp( b ); 5260 bSign = extractFloatx80Sign( b ); 5261 zSign = aSign ^ bSign; 5262 if ( aExp == 0x7FFF ) { 5263 if ((uint64_t)(aSig << 1)) { 5264 return propagateFloatx80NaN(a, b, status); 5265 } 5266 if ( bExp == 0x7FFF ) { 5267 if ((uint64_t)(bSig << 1)) { 5268 return propagateFloatx80NaN(a, b, status); 5269 } 5270 goto invalid; 5271 } 5272 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5273 } 5274 if ( bExp == 0x7FFF ) { 5275 if ((uint64_t)(bSig << 1)) { 5276 return propagateFloatx80NaN(a, b, status); 5277 } 5278 return packFloatx80( zSign, 0, 0 ); 5279 } 5280 if ( bExp == 0 ) { 5281 if ( bSig == 0 ) { 5282 if ( ( aExp | aSig ) == 0 ) { 5283 invalid: 5284 float_raise(float_flag_invalid, status); 5285 return floatx80_default_nan(status); 5286 } 5287 float_raise(float_flag_divbyzero, status); 5288 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5289 } 5290 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5291 } 5292 if ( aExp == 0 ) { 5293 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5294 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5295 } 5296 zExp = aExp - bExp + 0x3FFE; 5297 rem1 = 0; 5298 if ( bSig <= aSig ) { 5299 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 5300 ++zExp; 5301 } 5302 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 5303 mul64To128( 
bSig, zSig0, &term0, &term1 ); 5304 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 5305 while ( (int64_t) rem0 < 0 ) { 5306 --zSig0; 5307 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 5308 } 5309 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 5310 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 5311 mul64To128( bSig, zSig1, &term1, &term2 ); 5312 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5313 while ( (int64_t) rem1 < 0 ) { 5314 --zSig1; 5315 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5316 } 5317 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5318 } 5319 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5320 zSign, zExp, zSig0, zSig1, status); 5321 } 5322 5323 /*---------------------------------------------------------------------------- 5324 | Returns the remainder of the extended double-precision floating-point value 5325 | `a' with respect to the corresponding value `b'. The operation is performed 5326 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5327 *----------------------------------------------------------------------------*/ 5328 5329 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 5330 { 5331 flag aSign, zSign; 5332 int32_t aExp, bExp, expDiff; 5333 uint64_t aSig0, aSig1, bSig; 5334 uint64_t q, term0, term1, alternateASig0, alternateASig1; 5335 5336 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5337 float_raise(float_flag_invalid, status); 5338 return floatx80_default_nan(status); 5339 } 5340 aSig0 = extractFloatx80Frac( a ); 5341 aExp = extractFloatx80Exp( a ); 5342 aSign = extractFloatx80Sign( a ); 5343 bSig = extractFloatx80Frac( b ); 5344 bExp = extractFloatx80Exp( b ); 5345 if ( aExp == 0x7FFF ) { 5346 if ( (uint64_t) ( aSig0<<1 ) 5347 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5348 return propagateFloatx80NaN(a, b, status); 5349 } 5350 goto invalid; 5351 } 5352 if ( bExp == 0x7FFF ) { 5353 if ((uint64_t)(bSig << 1)) { 5354 return propagateFloatx80NaN(a, b, status); 
5355 } 5356 return a; 5357 } 5358 if ( bExp == 0 ) { 5359 if ( bSig == 0 ) { 5360 invalid: 5361 float_raise(float_flag_invalid, status); 5362 return floatx80_default_nan(status); 5363 } 5364 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5365 } 5366 if ( aExp == 0 ) { 5367 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5368 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5369 } 5370 bSig |= LIT64( 0x8000000000000000 ); 5371 zSign = aSign; 5372 expDiff = aExp - bExp; 5373 aSig1 = 0; 5374 if ( expDiff < 0 ) { 5375 if ( expDiff < -1 ) return a; 5376 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 5377 expDiff = 0; 5378 } 5379 q = ( bSig <= aSig0 ); 5380 if ( q ) aSig0 -= bSig; 5381 expDiff -= 64; 5382 while ( 0 < expDiff ) { 5383 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5384 q = ( 2 < q ) ? q - 2 : 0; 5385 mul64To128( bSig, q, &term0, &term1 ); 5386 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5387 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 5388 expDiff -= 62; 5389 } 5390 expDiff += 64; 5391 if ( 0 < expDiff ) { 5392 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5393 q = ( 2 < q ) ? q - 2 : 0; 5394 q >>= 64 - expDiff; 5395 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 5396 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5397 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 5398 while ( le128( term0, term1, aSig0, aSig1 ) ) { 5399 ++q; 5400 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5401 } 5402 } 5403 else { 5404 term1 = 0; 5405 term0 = bSig; 5406 } 5407 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 5408 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5409 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5410 && ( q & 1 ) ) 5411 ) { 5412 aSig0 = alternateASig0; 5413 aSig1 = alternateASig1; 5414 zSign = ! 
zSign; 5415 } 5416 return 5417 normalizeRoundAndPackFloatx80( 5418 80, zSign, bExp + expDiff, aSig0, aSig1, status); 5419 5420 } 5421 5422 /*---------------------------------------------------------------------------- 5423 | Returns the square root of the extended double-precision floating-point 5424 | value `a'. The operation is performed according to the IEC/IEEE Standard 5425 | for Binary Floating-Point Arithmetic. 5426 *----------------------------------------------------------------------------*/ 5427 5428 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 5429 { 5430 flag aSign; 5431 int32_t aExp, zExp; 5432 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 5433 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5434 5435 if (floatx80_invalid_encoding(a)) { 5436 float_raise(float_flag_invalid, status); 5437 return floatx80_default_nan(status); 5438 } 5439 aSig0 = extractFloatx80Frac( a ); 5440 aExp = extractFloatx80Exp( a ); 5441 aSign = extractFloatx80Sign( a ); 5442 if ( aExp == 0x7FFF ) { 5443 if ((uint64_t)(aSig0 << 1)) { 5444 return propagateFloatx80NaN(a, a, status); 5445 } 5446 if ( ! 
aSign ) return a; 5447 goto invalid; 5448 } 5449 if ( aSign ) { 5450 if ( ( aExp | aSig0 ) == 0 ) return a; 5451 invalid: 5452 float_raise(float_flag_invalid, status); 5453 return floatx80_default_nan(status); 5454 } 5455 if ( aExp == 0 ) { 5456 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 5457 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5458 } 5459 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 5460 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 5461 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 5462 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5463 doubleZSig0 = zSig0<<1; 5464 mul64To128( zSig0, zSig0, &term0, &term1 ); 5465 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5466 while ( (int64_t) rem0 < 0 ) { 5467 --zSig0; 5468 doubleZSig0 -= 2; 5469 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5470 } 5471 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5472 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 5473 if ( zSig1 == 0 ) zSig1 = 1; 5474 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5475 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5476 mul64To128( zSig1, zSig1, &term2, &term3 ); 5477 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5478 while ( (int64_t) rem1 < 0 ) { 5479 --zSig1; 5480 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5481 term3 |= 1; 5482 term2 |= doubleZSig0; 5483 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5484 } 5485 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5486 } 5487 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 5488 zSig0 |= doubleZSig0; 5489 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5490 0, zExp, zSig0, zSig1, status); 5491 } 5492 5493 /*---------------------------------------------------------------------------- 5494 | Returns 1 if the extended double-precision floating-point value `a' is equal 5495 | to the corresponding value `b', and 0 otherwise. 
The invalid exception is 5496 | raised if either operand is a NaN. Otherwise, the comparison is performed 5497 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5498 *----------------------------------------------------------------------------*/ 5499 5500 int floatx80_eq(floatx80 a, floatx80 b, float_status *status) 5501 { 5502 5503 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5504 || (extractFloatx80Exp(a) == 0x7FFF 5505 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5506 || (extractFloatx80Exp(b) == 0x7FFF 5507 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5508 ) { 5509 float_raise(float_flag_invalid, status); 5510 return 0; 5511 } 5512 return 5513 ( a.low == b.low ) 5514 && ( ( a.high == b.high ) 5515 || ( ( a.low == 0 ) 5516 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5517 ); 5518 5519 } 5520 5521 /*---------------------------------------------------------------------------- 5522 | Returns 1 if the extended double-precision floating-point value `a' is 5523 | less than or equal to the corresponding value `b', and 0 otherwise. The 5524 | invalid exception is raised if either operand is a NaN. The comparison is 5525 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5526 | Arithmetic. 
5527 *----------------------------------------------------------------------------*/ 5528 5529 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 5530 { 5531 flag aSign, bSign; 5532 5533 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5534 || (extractFloatx80Exp(a) == 0x7FFF 5535 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5536 || (extractFloatx80Exp(b) == 0x7FFF 5537 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5538 ) { 5539 float_raise(float_flag_invalid, status); 5540 return 0; 5541 } 5542 aSign = extractFloatx80Sign( a ); 5543 bSign = extractFloatx80Sign( b ); 5544 if ( aSign != bSign ) { 5545 return 5546 aSign 5547 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5548 == 0 ); 5549 } 5550 return 5551 aSign ? le128( b.high, b.low, a.high, a.low ) 5552 : le128( a.high, a.low, b.high, b.low ); 5553 5554 } 5555 5556 /*---------------------------------------------------------------------------- 5557 | Returns 1 if the extended double-precision floating-point value `a' is 5558 | less than the corresponding value `b', and 0 otherwise. The invalid 5559 | exception is raised if either operand is a NaN. The comparison is performed 5560 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5561 *----------------------------------------------------------------------------*/ 5562 5563 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 5564 { 5565 flag aSign, bSign; 5566 5567 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5568 || (extractFloatx80Exp(a) == 0x7FFF 5569 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5570 || (extractFloatx80Exp(b) == 0x7FFF 5571 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5572 ) { 5573 float_raise(float_flag_invalid, status); 5574 return 0; 5575 } 5576 aSign = extractFloatx80Sign( a ); 5577 bSign = extractFloatx80Sign( b ); 5578 if ( aSign != bSign ) { 5579 return 5580 aSign 5581 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5582 != 0 ); 5583 } 5584 return 5585 aSign ? lt128( b.high, b.low, a.high, a.low ) 5586 : lt128( a.high, a.low, b.high, b.low ); 5587 5588 } 5589 5590 /*---------------------------------------------------------------------------- 5591 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5592 | cannot be compared, and 0 otherwise. The invalid exception is raised if 5593 | either operand is a NaN. The comparison is performed according to the 5594 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5595 *----------------------------------------------------------------------------*/ 5596 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 5597 { 5598 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5599 || (extractFloatx80Exp(a) == 0x7FFF 5600 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5601 || (extractFloatx80Exp(b) == 0x7FFF 5602 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5603 ) { 5604 float_raise(float_flag_invalid, status); 5605 return 1; 5606 } 5607 return 0; 5608 } 5609 5610 /*---------------------------------------------------------------------------- 5611 | Returns 1 if the extended double-precision floating-point value `a' is 5612 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5613 | cause an exception. The comparison is performed according to the IEC/IEEE 5614 | Standard for Binary Floating-Point Arithmetic. 5615 *----------------------------------------------------------------------------*/ 5616 5617 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 5618 { 5619 5620 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5621 float_raise(float_flag_invalid, status); 5622 return 0; 5623 } 5624 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5625 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5626 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5627 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5628 ) { 5629 if (floatx80_is_signaling_nan(a, status) 5630 || floatx80_is_signaling_nan(b, status)) { 5631 float_raise(float_flag_invalid, status); 5632 } 5633 return 0; 5634 } 5635 return 5636 ( a.low == b.low ) 5637 && ( ( a.high == b.high ) 5638 || ( ( a.low == 0 ) 5639 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5640 ); 5641 5642 } 5643 5644 /*---------------------------------------------------------------------------- 5645 | Returns 1 if the extended double-precision floating-point value `a' is less 5646 | than or equal to the corresponding value `b', and 0 
otherwise.  Quiet NaNs
| do not cause an exception.  Otherwise, the comparison is performed according
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
{
    flag negA, negB;

    /* An invalid encoding always raises the invalid flag. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        goto invalid;
    }
    /* NaN test: exponent 0x7FFF and a significand bit below the top one. */
    if ((extractFloatx80Exp(a) == 0x7FFF)
        && (uint64_t) (extractFloatx80Frac(a) << 1)) {
        goto nan;
    }
    if ((extractFloatx80Exp(b) == 0x7FFF)
        && (uint64_t) (extractFloatx80Frac(b) << 1)) {
        goto nan;
    }

    negA = extractFloatx80Sign(a);
    negB = extractFloatx80Sign(b);
    if (negA == negB) {
        /* Same sign: le128 on (high, low), operands swapped if negative. */
        return negA ? le128(b.high, b.low, a.high, a.low)
                    : le128(a.high, a.low, b.high, b.low);
    }
    /* Opposite signs: true if `a' is negative, or everything but the
     * sign bits is zero in both operands. */
    if (negA) {
        return 1;
    }
    return (((uint16_t) ((a.high | b.high) << 1)) | a.low | b.low) == 0;

nan:
    /* Only a signaling NaN raises the flag; the result is 0 either way. */
    if (!floatx80_is_signaling_nan(a, status)
        && !floatx80_is_signaling_nan(b, status)) {
        return 0;
    }
invalid:
    float_raise(float_flag_invalid, status);
    return 0;
}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is less
| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
| an exception.  Otherwise, the comparison is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5689 *----------------------------------------------------------------------------*/ 5690 5691 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 5692 { 5693 flag aSign, bSign; 5694 5695 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5696 float_raise(float_flag_invalid, status); 5697 return 0; 5698 } 5699 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5700 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5701 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5702 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5703 ) { 5704 if (floatx80_is_signaling_nan(a, status) 5705 || floatx80_is_signaling_nan(b, status)) { 5706 float_raise(float_flag_invalid, status); 5707 } 5708 return 0; 5709 } 5710 aSign = extractFloatx80Sign( a ); 5711 bSign = extractFloatx80Sign( b ); 5712 if ( aSign != bSign ) { 5713 return 5714 aSign 5715 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5716 != 0 ); 5717 } 5718 return 5719 aSign ? lt128( b.high, b.low, a.high, a.low ) 5720 : lt128( a.high, a.low, b.high, b.low ); 5721 5722 } 5723 5724 /*---------------------------------------------------------------------------- 5725 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5726 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 5727 | The comparison is performed according to the IEC/IEEE Standard for Binary 5728 | Floating-Point Arithmetic. 
5729 *----------------------------------------------------------------------------*/ 5730 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 5731 { 5732 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5733 float_raise(float_flag_invalid, status); 5734 return 1; 5735 } 5736 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5737 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5738 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5739 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5740 ) { 5741 if (floatx80_is_signaling_nan(a, status) 5742 || floatx80_is_signaling_nan(b, status)) { 5743 float_raise(float_flag_invalid, status); 5744 } 5745 return 1; 5746 } 5747 return 0; 5748 } 5749 5750 /*---------------------------------------------------------------------------- 5751 | Returns the result of converting the quadruple-precision floating-point 5752 | value `a' to the 32-bit two's complement integer format. The conversion 5753 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5754 | Arithmetic---which means in particular that the conversion is rounded 5755 | according to the current rounding mode. If `a' is a NaN, the largest 5756 | positive integer is returned. Otherwise, if the conversion overflows, the 5757 | largest integer with the same sign as `a' is returned. 
5758 *----------------------------------------------------------------------------*/ 5759 5760 int32_t float128_to_int32(float128 a, float_status *status) 5761 { 5762 flag aSign; 5763 int32_t aExp, shiftCount; 5764 uint64_t aSig0, aSig1; 5765 5766 aSig1 = extractFloat128Frac1( a ); 5767 aSig0 = extractFloat128Frac0( a ); 5768 aExp = extractFloat128Exp( a ); 5769 aSign = extractFloat128Sign( a ); 5770 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 5771 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5772 aSig0 |= ( aSig1 != 0 ); 5773 shiftCount = 0x4028 - aExp; 5774 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 5775 return roundAndPackInt32(aSign, aSig0, status); 5776 5777 } 5778 5779 /*---------------------------------------------------------------------------- 5780 | Returns the result of converting the quadruple-precision floating-point 5781 | value `a' to the 32-bit two's complement integer format. The conversion 5782 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5783 | Arithmetic, except that the conversion is always rounded toward zero. If 5784 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 5785 | conversion overflows, the largest integer with the same sign as `a' is 5786 | returned. 
5787 *----------------------------------------------------------------------------*/ 5788 5789 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 5790 { 5791 flag aSign; 5792 int32_t aExp, shiftCount; 5793 uint64_t aSig0, aSig1, savedASig; 5794 int32_t z; 5795 5796 aSig1 = extractFloat128Frac1( a ); 5797 aSig0 = extractFloat128Frac0( a ); 5798 aExp = extractFloat128Exp( a ); 5799 aSign = extractFloat128Sign( a ); 5800 aSig0 |= ( aSig1 != 0 ); 5801 if ( 0x401E < aExp ) { 5802 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 5803 goto invalid; 5804 } 5805 else if ( aExp < 0x3FFF ) { 5806 if (aExp || aSig0) { 5807 status->float_exception_flags |= float_flag_inexact; 5808 } 5809 return 0; 5810 } 5811 aSig0 |= LIT64( 0x0001000000000000 ); 5812 shiftCount = 0x402F - aExp; 5813 savedASig = aSig0; 5814 aSig0 >>= shiftCount; 5815 z = aSig0; 5816 if ( aSign ) z = - z; 5817 if ( ( z < 0 ) ^ aSign ) { 5818 invalid: 5819 float_raise(float_flag_invalid, status); 5820 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5821 } 5822 if ( ( aSig0<<shiftCount ) != savedASig ) { 5823 status->float_exception_flags |= float_flag_inexact; 5824 } 5825 return z; 5826 5827 } 5828 5829 /*---------------------------------------------------------------------------- 5830 | Returns the result of converting the quadruple-precision floating-point 5831 | value `a' to the 64-bit two's complement integer format. The conversion 5832 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5833 | Arithmetic---which means in particular that the conversion is rounded 5834 | according to the current rounding mode. If `a' is a NaN, the largest 5835 | positive integer is returned. Otherwise, if the conversion overflows, the 5836 | largest integer with the same sign as `a' is returned. 
5837 *----------------------------------------------------------------------------*/ 5838 5839 int64_t float128_to_int64(float128 a, float_status *status) 5840 { 5841 flag aSign; 5842 int32_t aExp, shiftCount; 5843 uint64_t aSig0, aSig1; 5844 5845 aSig1 = extractFloat128Frac1( a ); 5846 aSig0 = extractFloat128Frac0( a ); 5847 aExp = extractFloat128Exp( a ); 5848 aSign = extractFloat128Sign( a ); 5849 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5850 shiftCount = 0x402F - aExp; 5851 if ( shiftCount <= 0 ) { 5852 if ( 0x403E < aExp ) { 5853 float_raise(float_flag_invalid, status); 5854 if ( ! aSign 5855 || ( ( aExp == 0x7FFF ) 5856 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 5857 ) 5858 ) { 5859 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5860 } 5861 return (int64_t) LIT64( 0x8000000000000000 ); 5862 } 5863 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 5864 } 5865 else { 5866 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 5867 } 5868 return roundAndPackInt64(aSign, aSig0, aSig1, status); 5869 5870 } 5871 5872 /*---------------------------------------------------------------------------- 5873 | Returns the result of converting the quadruple-precision floating-point 5874 | value `a' to the 64-bit two's complement integer format. The conversion 5875 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5876 | Arithmetic, except that the conversion is always rounded toward zero. 5877 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 5878 | the conversion overflows, the largest integer with the same sign as `a' is 5879 | returned. 
5880 *----------------------------------------------------------------------------*/ 5881 5882 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 5883 { 5884 flag aSign; 5885 int32_t aExp, shiftCount; 5886 uint64_t aSig0, aSig1; 5887 int64_t z; 5888 5889 aSig1 = extractFloat128Frac1( a ); 5890 aSig0 = extractFloat128Frac0( a ); 5891 aExp = extractFloat128Exp( a ); 5892 aSign = extractFloat128Sign( a ); 5893 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5894 shiftCount = aExp - 0x402F; 5895 if ( 0 < shiftCount ) { 5896 if ( 0x403E <= aExp ) { 5897 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 5898 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 5899 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 5900 if (aSig1) { 5901 status->float_exception_flags |= float_flag_inexact; 5902 } 5903 } 5904 else { 5905 float_raise(float_flag_invalid, status); 5906 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 5907 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5908 } 5909 } 5910 return (int64_t) LIT64( 0x8000000000000000 ); 5911 } 5912 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 5913 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 5914 status->float_exception_flags |= float_flag_inexact; 5915 } 5916 } 5917 else { 5918 if ( aExp < 0x3FFF ) { 5919 if ( aExp | aSig0 | aSig1 ) { 5920 status->float_exception_flags |= float_flag_inexact; 5921 } 5922 return 0; 5923 } 5924 z = aSig0>>( - shiftCount ); 5925 if ( aSig1 5926 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 5927 status->float_exception_flags |= float_flag_inexact; 5928 } 5929 } 5930 if ( aSign ) z = - z; 5931 return z; 5932 5933 } 5934 5935 /*---------------------------------------------------------------------------- 5936 | Returns the result of converting the quadruple-precision floating-point value 5937 | `a' to the 64-bit unsigned integer format. 
The conversion is 5938 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5939 | Arithmetic---which means in particular that the conversion is rounded 5940 | according to the current rounding mode. If `a' is a NaN, the largest 5941 | positive integer is returned. If the conversion overflows, the 5942 | largest unsigned integer is returned. If 'a' is negative, the value is 5943 | rounded and zero is returned; negative values that do not round to zero 5944 | will raise the inexact exception. 5945 *----------------------------------------------------------------------------*/ 5946 5947 uint64_t float128_to_uint64(float128 a, float_status *status) 5948 { 5949 flag aSign; 5950 int aExp; 5951 int shiftCount; 5952 uint64_t aSig0, aSig1; 5953 5954 aSig0 = extractFloat128Frac0(a); 5955 aSig1 = extractFloat128Frac1(a); 5956 aExp = extractFloat128Exp(a); 5957 aSign = extractFloat128Sign(a); 5958 if (aSign && (aExp > 0x3FFE)) { 5959 float_raise(float_flag_invalid, status); 5960 if (float128_is_any_nan(a)) { 5961 return LIT64(0xFFFFFFFFFFFFFFFF); 5962 } else { 5963 return 0; 5964 } 5965 } 5966 if (aExp) { 5967 aSig0 |= LIT64(0x0001000000000000); 5968 } 5969 shiftCount = 0x402F - aExp; 5970 if (shiftCount <= 0) { 5971 if (0x403E < aExp) { 5972 float_raise(float_flag_invalid, status); 5973 return LIT64(0xFFFFFFFFFFFFFFFF); 5974 } 5975 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 5976 } else { 5977 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 5978 } 5979 return roundAndPackUint64(aSign, aSig0, aSig1, status); 5980 } 5981 5982 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 5983 { 5984 uint64_t v; 5985 signed char current_rounding_mode = status->float_rounding_mode; 5986 5987 set_float_rounding_mode(float_round_to_zero, status); 5988 v = float128_to_uint64(a, status); 5989 set_float_rounding_mode(current_rounding_mode, status); 5990 5991 return v; 5992 } 5993 5994 
/*---------------------------------------------------------------------------- 5995 | Returns the result of converting the quadruple-precision floating-point 5996 | value `a' to the 32-bit unsigned integer format. The conversion 5997 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5998 | Arithmetic except that the conversion is always rounded toward zero. 5999 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6000 | if the conversion overflows, the largest unsigned integer is returned. 6001 | If 'a' is negative, the value is rounded and zero is returned; negative 6002 | values that do not round to zero will raise the inexact exception. 6003 *----------------------------------------------------------------------------*/ 6004 6005 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6006 { 6007 uint64_t v; 6008 uint32_t res; 6009 int old_exc_flags = get_float_exception_flags(status); 6010 6011 v = float128_to_uint64_round_to_zero(a, status); 6012 if (v > 0xffffffff) { 6013 res = 0xffffffff; 6014 } else { 6015 return v; 6016 } 6017 set_float_exception_flags(old_exc_flags, status); 6018 float_raise(float_flag_invalid, status); 6019 return res; 6020 } 6021 6022 /*---------------------------------------------------------------------------- 6023 | Returns the result of converting the quadruple-precision floating-point 6024 | value `a' to the single-precision floating-point format. The conversion 6025 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6026 | Arithmetic. 
6027 *----------------------------------------------------------------------------*/ 6028 6029 float32 float128_to_float32(float128 a, float_status *status) 6030 { 6031 flag aSign; 6032 int32_t aExp; 6033 uint64_t aSig0, aSig1; 6034 uint32_t zSig; 6035 6036 aSig1 = extractFloat128Frac1( a ); 6037 aSig0 = extractFloat128Frac0( a ); 6038 aExp = extractFloat128Exp( a ); 6039 aSign = extractFloat128Sign( a ); 6040 if ( aExp == 0x7FFF ) { 6041 if ( aSig0 | aSig1 ) { 6042 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6043 } 6044 return packFloat32( aSign, 0xFF, 0 ); 6045 } 6046 aSig0 |= ( aSig1 != 0 ); 6047 shift64RightJamming( aSig0, 18, &aSig0 ); 6048 zSig = aSig0; 6049 if ( aExp || zSig ) { 6050 zSig |= 0x40000000; 6051 aExp -= 0x3F81; 6052 } 6053 return roundAndPackFloat32(aSign, aExp, zSig, status); 6054 6055 } 6056 6057 /*---------------------------------------------------------------------------- 6058 | Returns the result of converting the quadruple-precision floating-point 6059 | value `a' to the double-precision floating-point format. The conversion 6060 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6061 | Arithmetic. 
6062 *----------------------------------------------------------------------------*/ 6063 6064 float64 float128_to_float64(float128 a, float_status *status) 6065 { 6066 flag aSign; 6067 int32_t aExp; 6068 uint64_t aSig0, aSig1; 6069 6070 aSig1 = extractFloat128Frac1( a ); 6071 aSig0 = extractFloat128Frac0( a ); 6072 aExp = extractFloat128Exp( a ); 6073 aSign = extractFloat128Sign( a ); 6074 if ( aExp == 0x7FFF ) { 6075 if ( aSig0 | aSig1 ) { 6076 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6077 } 6078 return packFloat64( aSign, 0x7FF, 0 ); 6079 } 6080 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6081 aSig0 |= ( aSig1 != 0 ); 6082 if ( aExp || aSig0 ) { 6083 aSig0 |= LIT64( 0x4000000000000000 ); 6084 aExp -= 0x3C01; 6085 } 6086 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6087 6088 } 6089 6090 /*---------------------------------------------------------------------------- 6091 | Returns the result of converting the quadruple-precision floating-point 6092 | value `a' to the extended double-precision floating-point format. The 6093 | conversion is performed according to the IEC/IEEE Standard for Binary 6094 | Floating-Point Arithmetic. 
6095 *----------------------------------------------------------------------------*/ 6096 6097 floatx80 float128_to_floatx80(float128 a, float_status *status) 6098 { 6099 flag aSign; 6100 int32_t aExp; 6101 uint64_t aSig0, aSig1; 6102 6103 aSig1 = extractFloat128Frac1( a ); 6104 aSig0 = extractFloat128Frac0( a ); 6105 aExp = extractFloat128Exp( a ); 6106 aSign = extractFloat128Sign( a ); 6107 if ( aExp == 0x7FFF ) { 6108 if ( aSig0 | aSig1 ) { 6109 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 6110 } 6111 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 6112 } 6113 if ( aExp == 0 ) { 6114 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6115 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6116 } 6117 else { 6118 aSig0 |= LIT64( 0x0001000000000000 ); 6119 } 6120 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6121 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6122 6123 } 6124 6125 /*---------------------------------------------------------------------------- 6126 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6127 | returns the result as a quadruple-precision floating-point value. The 6128 | operation is performed according to the IEC/IEEE Standard for Binary 6129 | Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float128 float128_round_to_int(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    /*
     * Three exponent ranges are handled separately:
     *   - 0x402F and above: the rounding position is in the low word
     *     (or the value is returned as is from 0x406F upward);
     *   - below 0x3FFF: the result is a signed zero or has exponent 0x3FFF
     *     with a zero fraction, chosen by the rounding mode;
     *   - otherwise: the rounding position is in the high word.
     * lastBitMask selects the lowest bit kept; roundBitsMask covers all
     * bits below it.
     */
    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        if ( 0x406F <= aExp ) {
            /* Returned unchanged unless it is a NaN, which is propagated. */
            if (    ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            return a;
        }
        /* The shift is split in two so that aExp == 0x402F (a total shift
         * of 64) yields lastBitMask == 0 instead of an out-of-range shift.
         * The zero case is handled explicitly in the switch below. */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                /* Add half of the last kept bit; on an exact tie clear that
                 * bit so the result is even. */
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /* The whole low word is discarded: its top bit is the half
                 * bit and the carry goes straight into the high word. */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            /* Only positive values are pushed up in magnitude. */
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            /* Only negative values are pushed up in magnitude. */
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* A zero of either sign is returned as is, with no flag. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            status->float_exception_flags |= float_flag_inexact;
            aSign = extractFloat128Sign( a );
            /* Rounding modes without a case here (or whose case breaks)
             * fall through to the signed-zero return after the switch. */
            switch (status->float_rounding_mode) {
             case float_round_nearest_even:
                if (    ( aExp == 0x3FFE )
                     && (   extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
             case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
             case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );
            }
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* Rounding position lies in the high word; the low word of the
         * result is always zero and a.low only acts as a sticky bit. */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        z.high &= ~ roundBitsMask;
    }
    /* Any change to the bit pattern means the rounding was inexact. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the quadruple-precision
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
| before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* `a' has the larger exponent. */
        if ( aExp == 0x7FFF ) {
            if (aSig0 | aSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* Zero fraction with exponent 0x7FFF: `a' is returned as is. */
            return a;
        }
        /* With a zero exponent field `b' gets no implicit bit and is
         * shifted one place less. */
        if ( bExp == 0 ) {
            --expDiff;
        }
        else {
            bSig0 |= LIT64( 0x0001000000000000 );
        }
        /* Align `b'; the helper also fills the extra word zSig2. */
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* `b' has the larger exponent: mirror image of the case above. */
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig0 |= LIT64( 0x0001000000000000 );
        }
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponents. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) {
            /* Both exponent fields are zero: the raw sum is packed with a
             * zero exponent field, unless flush_to_zero is set, in which
             * case a signed zero is returned and a non-zero sum raises
             * the output-denormal flag. */
            if (status->flush_to_zero) {
                if (zSig0 | zSig1) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat128(zSign, 0, 0, 0);
            }
            return packFloat128( zSign, 0, zSig0, zSig1 );
        }
        /* Neither implicit bit (bit 48) was added above; their sum is
         * bit 49, which always needs the one-place right shift. */
        zSig2 = 0;
        zSig0 |= LIT64( 0x0002000000000000 );
        zExp = aExp;
        goto shiftRight1;
    }
    /* Unequal exponents: set bit 48 in aSig0 before adding.
     * NOTE(review): on the bExp-larger path aSig0 has already been shifted,
     * so this bit appears to stand in for b's implicit bit -- confirm. */
    aSig0 |= LIT64( 0x0001000000000000 );
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    --zExp;
    /* No carry into bit 49: round with the decremented exponent. */
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the quadruple-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    /* Both fractions are moved up by 14 bits; the implicit bit therefore
     * sits at bit 62 (0x4000000000000000) below, and the final exponent is
     * reduced by 14 to compensate. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Both fractions are zero with exponent 0x7FFF: invalid operation,
         * the default NaN is returned. */
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        aExp = 1;
        bExp = 1;
    }
    /* Subtract the smaller significand from the larger one. */
    if ( bSig0 < aSig0 ) goto aBigger;
    if ( aSig0 < bSig0 ) goto bBigger;
    if ( bSig1 < aSig1 ) goto aBigger;
    if ( aSig1 < bSig1 ) goto bBigger;
    /* Exact cancellation: the zero is negative only when rounding down. */
    return packFloat128(status->float_rounding_mode == float_round_down,
                        0, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Result takes the opposite of zSign with exponent 0x7FFF. */
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        ++expDiff;
    }
    else {
        aSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
    bSig0 |= LIT64( 0x4000000000000000 );
 bBigger:
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zExp = bExp;
    /* `b' dominates, so the sign of the difference flips. */
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
    aSig0 |= LIT64( 0x4000000000000000 );
 aBigger:
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
                                         status);

}

/*----------------------------------------------------------------------------
| Returns the result of adding the quadruple-precision floating-point values
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
6446 *----------------------------------------------------------------------------*/ 6447 6448 float128 float128_add(float128 a, float128 b, float_status *status) 6449 { 6450 flag aSign, bSign; 6451 6452 aSign = extractFloat128Sign( a ); 6453 bSign = extractFloat128Sign( b ); 6454 if ( aSign == bSign ) { 6455 return addFloat128Sigs(a, b, aSign, status); 6456 } 6457 else { 6458 return subFloat128Sigs(a, b, aSign, status); 6459 } 6460 6461 } 6462 6463 /*---------------------------------------------------------------------------- 6464 | Returns the result of subtracting the quadruple-precision floating-point 6465 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6466 | Standard for Binary Floating-Point Arithmetic. 6467 *----------------------------------------------------------------------------*/ 6468 6469 float128 float128_sub(float128 a, float128 b, float_status *status) 6470 { 6471 flag aSign, bSign; 6472 6473 aSign = extractFloat128Sign( a ); 6474 bSign = extractFloat128Sign( b ); 6475 if ( aSign == bSign ) { 6476 return subFloat128Sigs(a, b, aSign, status); 6477 } 6478 else { 6479 return addFloat128Sigs(a, b, aSign, status); 6480 } 6481 6482 } 6483 6484 /*---------------------------------------------------------------------------- 6485 | Returns the result of multiplying the quadruple-precision floating-point 6486 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6487 | Standard for Binary Floating-Point Arithmetic. 
6488 *----------------------------------------------------------------------------*/ 6489 6490 float128 float128_mul(float128 a, float128 b, float_status *status) 6491 { 6492 flag aSign, bSign, zSign; 6493 int32_t aExp, bExp, zExp; 6494 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 6495 6496 aSig1 = extractFloat128Frac1( a ); 6497 aSig0 = extractFloat128Frac0( a ); 6498 aExp = extractFloat128Exp( a ); 6499 aSign = extractFloat128Sign( a ); 6500 bSig1 = extractFloat128Frac1( b ); 6501 bSig0 = extractFloat128Frac0( b ); 6502 bExp = extractFloat128Exp( b ); 6503 bSign = extractFloat128Sign( b ); 6504 zSign = aSign ^ bSign; 6505 if ( aExp == 0x7FFF ) { 6506 if ( ( aSig0 | aSig1 ) 6507 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6508 return propagateFloat128NaN(a, b, status); 6509 } 6510 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 6511 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6512 } 6513 if ( bExp == 0x7FFF ) { 6514 if (bSig0 | bSig1) { 6515 return propagateFloat128NaN(a, b, status); 6516 } 6517 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6518 invalid: 6519 float_raise(float_flag_invalid, status); 6520 return float128_default_nan(status); 6521 } 6522 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6523 } 6524 if ( aExp == 0 ) { 6525 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6526 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6527 } 6528 if ( bExp == 0 ) { 6529 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6530 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6531 } 6532 zExp = aExp + bExp - 0x4000; 6533 aSig0 |= LIT64( 0x0001000000000000 ); 6534 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 6535 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 6536 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6537 zSig2 |= ( zSig3 != 0 ); 6538 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 6539 shift128ExtraRightJamming( 6540 
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6541 ++zExp; 6542 } 6543 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6544 6545 } 6546 6547 /*---------------------------------------------------------------------------- 6548 | Returns the result of dividing the quadruple-precision floating-point value 6549 | `a' by the corresponding value `b'. The operation is performed according to 6550 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6551 *----------------------------------------------------------------------------*/ 6552 6553 float128 float128_div(float128 a, float128 b, float_status *status) 6554 { 6555 flag aSign, bSign, zSign; 6556 int32_t aExp, bExp, zExp; 6557 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6558 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6559 6560 aSig1 = extractFloat128Frac1( a ); 6561 aSig0 = extractFloat128Frac0( a ); 6562 aExp = extractFloat128Exp( a ); 6563 aSign = extractFloat128Sign( a ); 6564 bSig1 = extractFloat128Frac1( b ); 6565 bSig0 = extractFloat128Frac0( b ); 6566 bExp = extractFloat128Exp( b ); 6567 bSign = extractFloat128Sign( b ); 6568 zSign = aSign ^ bSign; 6569 if ( aExp == 0x7FFF ) { 6570 if (aSig0 | aSig1) { 6571 return propagateFloat128NaN(a, b, status); 6572 } 6573 if ( bExp == 0x7FFF ) { 6574 if (bSig0 | bSig1) { 6575 return propagateFloat128NaN(a, b, status); 6576 } 6577 goto invalid; 6578 } 6579 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6580 } 6581 if ( bExp == 0x7FFF ) { 6582 if (bSig0 | bSig1) { 6583 return propagateFloat128NaN(a, b, status); 6584 } 6585 return packFloat128( zSign, 0, 0, 0 ); 6586 } 6587 if ( bExp == 0 ) { 6588 if ( ( bSig0 | bSig1 ) == 0 ) { 6589 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6590 invalid: 6591 float_raise(float_flag_invalid, status); 6592 return float128_default_nan(status); 6593 } 6594 float_raise(float_flag_divbyzero, status); 6595 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6596 } 6597 normalizeFloat128Subnormal( 
bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6598 } 6599 if ( aExp == 0 ) { 6600 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6601 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6602 } 6603 zExp = aExp - bExp + 0x3FFD; 6604 shortShift128Left( 6605 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 6606 shortShift128Left( 6607 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6608 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 6609 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 6610 ++zExp; 6611 } 6612 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6613 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 6614 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 6615 while ( (int64_t) rem0 < 0 ) { 6616 --zSig0; 6617 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 6618 } 6619 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 6620 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 6621 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 6622 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 6623 while ( (int64_t) rem1 < 0 ) { 6624 --zSig1; 6625 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 6626 } 6627 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6628 } 6629 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 6630 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6631 6632 } 6633 6634 /*---------------------------------------------------------------------------- 6635 | Returns the remainder of the quadruple-precision floating-point value `a' 6636 | with respect to the corresponding value `b'. The operation is performed 6637 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6638 *----------------------------------------------------------------------------*/ 6639 6640 float128 float128_rem(float128 a, float128 b, float_status *status) 6641 { 6642 flag aSign, zSign; 6643 int32_t aExp, bExp, expDiff; 6644 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 6645 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 6646 int64_t sigMean0; 6647 6648 aSig1 = extractFloat128Frac1( a ); 6649 aSig0 = extractFloat128Frac0( a ); 6650 aExp = extractFloat128Exp( a ); 6651 aSign = extractFloat128Sign( a ); 6652 bSig1 = extractFloat128Frac1( b ); 6653 bSig0 = extractFloat128Frac0( b ); 6654 bExp = extractFloat128Exp( b ); 6655 if ( aExp == 0x7FFF ) { 6656 if ( ( aSig0 | aSig1 ) 6657 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6658 return propagateFloat128NaN(a, b, status); 6659 } 6660 goto invalid; 6661 } 6662 if ( bExp == 0x7FFF ) { 6663 if (bSig0 | bSig1) { 6664 return propagateFloat128NaN(a, b, status); 6665 } 6666 return a; 6667 } 6668 if ( bExp == 0 ) { 6669 if ( ( bSig0 | bSig1 ) == 0 ) { 6670 invalid: 6671 float_raise(float_flag_invalid, status); 6672 return float128_default_nan(status); 6673 } 6674 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6675 } 6676 if ( aExp == 0 ) { 6677 if ( ( aSig0 | aSig1 ) == 0 ) return a; 6678 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6679 } 6680 expDiff = aExp - bExp; 6681 if ( expDiff < -1 ) return a; 6682 shortShift128Left( 6683 aSig0 | LIT64( 0x0001000000000000 ), 6684 aSig1, 6685 15 - ( expDiff < 0 ), 6686 &aSig0, 6687 &aSig1 6688 ); 6689 shortShift128Left( 6690 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6691 q = le128( bSig0, bSig1, aSig0, aSig1 ); 6692 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6693 expDiff -= 64; 6694 while ( 0 < expDiff ) { 6695 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6696 q = ( 4 < q ) ? 
q - 4 : 0; 6697 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6698 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 6699 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 6700 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 6701 expDiff -= 61; 6702 } 6703 if ( -64 < expDiff ) { 6704 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6705 q = ( 4 < q ) ? q - 4 : 0; 6706 q >>= - expDiff; 6707 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6708 expDiff += 52; 6709 if ( expDiff < 0 ) { 6710 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6711 } 6712 else { 6713 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 6714 } 6715 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6716 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 6717 } 6718 else { 6719 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 6720 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6721 } 6722 do { 6723 alternateASig0 = aSig0; 6724 alternateASig1 = aSig1; 6725 ++q; 6726 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6727 } while ( 0 <= (int64_t) aSig0 ); 6728 add128( 6729 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 6730 if ( ( sigMean0 < 0 ) 6731 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 6732 aSig0 = alternateASig0; 6733 aSig1 = alternateASig1; 6734 } 6735 zSign = ( (int64_t) aSig0 < 0 ); 6736 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 6737 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 6738 status); 6739 } 6740 6741 /*---------------------------------------------------------------------------- 6742 | Returns the square root of the quadruple-precision floating-point value `a'. 6743 | The operation is performed according to the IEC/IEEE Standard for Binary 6744 | Floating-Point Arithmetic. 
6745 *----------------------------------------------------------------------------*/ 6746 6747 float128 float128_sqrt(float128 a, float_status *status) 6748 { 6749 flag aSign; 6750 int32_t aExp, zExp; 6751 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 6752 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6753 6754 aSig1 = extractFloat128Frac1( a ); 6755 aSig0 = extractFloat128Frac0( a ); 6756 aExp = extractFloat128Exp( a ); 6757 aSign = extractFloat128Sign( a ); 6758 if ( aExp == 0x7FFF ) { 6759 if (aSig0 | aSig1) { 6760 return propagateFloat128NaN(a, a, status); 6761 } 6762 if ( ! aSign ) return a; 6763 goto invalid; 6764 } 6765 if ( aSign ) { 6766 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 6767 invalid: 6768 float_raise(float_flag_invalid, status); 6769 return float128_default_nan(status); 6770 } 6771 if ( aExp == 0 ) { 6772 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 6773 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6774 } 6775 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 6776 aSig0 |= LIT64( 0x0001000000000000 ); 6777 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 6778 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 6779 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6780 doubleZSig0 = zSig0<<1; 6781 mul64To128( zSig0, zSig0, &term0, &term1 ); 6782 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6783 while ( (int64_t) rem0 < 0 ) { 6784 --zSig0; 6785 doubleZSig0 -= 2; 6786 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6787 } 6788 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6789 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 6790 if ( zSig1 == 0 ) zSig1 = 1; 6791 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6792 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6793 mul64To128( zSig1, zSig1, &term2, &term3 ); 6794 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6795 while ( (int64_t) rem1 < 0 ) { 6796 --zSig1; 6797 
shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6798 term3 |= 1; 6799 term2 |= doubleZSig0; 6800 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6801 } 6802 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6803 } 6804 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 6805 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 6806 6807 } 6808 6809 /*---------------------------------------------------------------------------- 6810 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6811 | the corresponding value `b', and 0 otherwise. The invalid exception is 6812 | raised if either operand is a NaN. Otherwise, the comparison is performed 6813 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6814 *----------------------------------------------------------------------------*/ 6815 6816 int float128_eq(float128 a, float128 b, float_status *status) 6817 { 6818 6819 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6820 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6821 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6822 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6823 ) { 6824 float_raise(float_flag_invalid, status); 6825 return 0; 6826 } 6827 return 6828 ( a.low == b.low ) 6829 && ( ( a.high == b.high ) 6830 || ( ( a.low == 0 ) 6831 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6832 ); 6833 6834 } 6835 6836 /*---------------------------------------------------------------------------- 6837 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6838 | or equal to the corresponding value `b', and 0 otherwise. The invalid 6839 | exception is raised if either operand is a NaN. The comparison is performed 6840 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6841 *----------------------------------------------------------------------------*/ 6842 6843 int float128_le(float128 a, float128 b, float_status *status) 6844 { 6845 flag aSign, bSign; 6846 6847 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6848 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6849 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6850 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6851 ) { 6852 float_raise(float_flag_invalid, status); 6853 return 0; 6854 } 6855 aSign = extractFloat128Sign( a ); 6856 bSign = extractFloat128Sign( b ); 6857 if ( aSign != bSign ) { 6858 return 6859 aSign 6860 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6861 == 0 ); 6862 } 6863 return 6864 aSign ? le128( b.high, b.low, a.high, a.low ) 6865 : le128( a.high, a.low, b.high, b.low ); 6866 6867 } 6868 6869 /*---------------------------------------------------------------------------- 6870 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6871 | the corresponding value `b', and 0 otherwise. The invalid exception is 6872 | raised if either operand is a NaN. The comparison is performed according 6873 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6874 *----------------------------------------------------------------------------*/ 6875 6876 int float128_lt(float128 a, float128 b, float_status *status) 6877 { 6878 flag aSign, bSign; 6879 6880 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6881 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6882 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6883 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6884 ) { 6885 float_raise(float_flag_invalid, status); 6886 return 0; 6887 } 6888 aSign = extractFloat128Sign( a ); 6889 bSign = extractFloat128Sign( b ); 6890 if ( aSign != bSign ) { 6891 return 6892 aSign 6893 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6894 != 0 ); 6895 } 6896 return 6897 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 6898 : lt128( a.high, a.low, b.high, b.low ); 6899 6900 } 6901 6902 /*---------------------------------------------------------------------------- 6903 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6904 | be compared, and 0 otherwise. The invalid exception is raised if either 6905 | operand is a NaN. The comparison is performed according to the IEC/IEEE 6906 | Standard for Binary Floating-Point Arithmetic. 6907 *----------------------------------------------------------------------------*/ 6908 6909 int float128_unordered(float128 a, float128 b, float_status *status) 6910 { 6911 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6912 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6913 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6914 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6915 ) { 6916 float_raise(float_flag_invalid, status); 6917 return 1; 6918 } 6919 return 0; 6920 } 6921 6922 /*---------------------------------------------------------------------------- 6923 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6924 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6925 | exception. The comparison is performed according to the IEC/IEEE Standard 6926 | for Binary Floating-Point Arithmetic. 
6927 *----------------------------------------------------------------------------*/ 6928 6929 int float128_eq_quiet(float128 a, float128 b, float_status *status) 6930 { 6931 6932 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6933 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6934 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6935 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6936 ) { 6937 if (float128_is_signaling_nan(a, status) 6938 || float128_is_signaling_nan(b, status)) { 6939 float_raise(float_flag_invalid, status); 6940 } 6941 return 0; 6942 } 6943 return 6944 ( a.low == b.low ) 6945 && ( ( a.high == b.high ) 6946 || ( ( a.low == 0 ) 6947 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6948 ); 6949 6950 } 6951 6952 /*---------------------------------------------------------------------------- 6953 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6954 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 6955 | cause an exception. Otherwise, the comparison is performed according to the 6956 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6957 *----------------------------------------------------------------------------*/ 6958 6959 int float128_le_quiet(float128 a, float128 b, float_status *status) 6960 { 6961 flag aSign, bSign; 6962 6963 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6964 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6965 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6966 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6967 ) { 6968 if (float128_is_signaling_nan(a, status) 6969 || float128_is_signaling_nan(b, status)) { 6970 float_raise(float_flag_invalid, status); 6971 } 6972 return 0; 6973 } 6974 aSign = extractFloat128Sign( a ); 6975 bSign = extractFloat128Sign( b ); 6976 if ( aSign != bSign ) { 6977 return 6978 aSign 6979 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6980 == 0 ); 6981 } 6982 return 6983 aSign ? le128( b.high, b.low, a.high, a.low ) 6984 : le128( a.high, a.low, b.high, b.low ); 6985 6986 } 6987 6988 /*---------------------------------------------------------------------------- 6989 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6990 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6991 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 6992 | Standard for Binary Floating-Point Arithmetic. 
6993 *----------------------------------------------------------------------------*/ 6994 6995 int float128_lt_quiet(float128 a, float128 b, float_status *status) 6996 { 6997 flag aSign, bSign; 6998 6999 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7000 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7001 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7002 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7003 ) { 7004 if (float128_is_signaling_nan(a, status) 7005 || float128_is_signaling_nan(b, status)) { 7006 float_raise(float_flag_invalid, status); 7007 } 7008 return 0; 7009 } 7010 aSign = extractFloat128Sign( a ); 7011 bSign = extractFloat128Sign( b ); 7012 if ( aSign != bSign ) { 7013 return 7014 aSign 7015 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7016 != 0 ); 7017 } 7018 return 7019 aSign ? lt128( b.high, b.low, a.high, a.low ) 7020 : lt128( a.high, a.low, b.high, b.low ); 7021 7022 } 7023 7024 /*---------------------------------------------------------------------------- 7025 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7026 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 7027 | comparison is performed according to the IEC/IEEE Standard for Binary 7028 | Floating-Point Arithmetic. 
7029 *----------------------------------------------------------------------------*/ 7030 7031 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 7032 { 7033 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7034 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7035 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7036 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7037 ) { 7038 if (float128_is_signaling_nan(a, status) 7039 || float128_is_signaling_nan(b, status)) { 7040 float_raise(float_flag_invalid, status); 7041 } 7042 return 1; 7043 } 7044 return 0; 7045 } 7046 7047 /* misc functions */ 7048 float32 uint32_to_float32(uint32_t a, float_status *status) 7049 { 7050 return int64_to_float32(a, status); 7051 } 7052 7053 float64 uint32_to_float64(uint32_t a, float_status *status) 7054 { 7055 return int64_to_float64(a, status); 7056 } 7057 7058 uint32_t float32_to_uint32(float32 a, float_status *status) 7059 { 7060 int64_t v; 7061 uint32_t res; 7062 int old_exc_flags = get_float_exception_flags(status); 7063 7064 v = float32_to_int64(a, status); 7065 if (v < 0) { 7066 res = 0; 7067 } else if (v > 0xffffffff) { 7068 res = 0xffffffff; 7069 } else { 7070 return v; 7071 } 7072 set_float_exception_flags(old_exc_flags, status); 7073 float_raise(float_flag_invalid, status); 7074 return res; 7075 } 7076 7077 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status) 7078 { 7079 int64_t v; 7080 uint32_t res; 7081 int old_exc_flags = get_float_exception_flags(status); 7082 7083 v = float32_to_int64_round_to_zero(a, status); 7084 if (v < 0) { 7085 res = 0; 7086 } else if (v > 0xffffffff) { 7087 res = 0xffffffff; 7088 } else { 7089 return v; 7090 } 7091 set_float_exception_flags(old_exc_flags, status); 7092 float_raise(float_flag_invalid, status); 7093 return res; 7094 } 7095 7096 int16_t float32_to_int16(float32 a, float_status *status) 7097 { 7098 int32_t v; 7099 int16_t res; 7100 int old_exc_flags = 
get_float_exception_flags(status); 7101 7102 v = float32_to_int32(a, status); 7103 if (v < -0x8000) { 7104 res = -0x8000; 7105 } else if (v > 0x7fff) { 7106 res = 0x7fff; 7107 } else { 7108 return v; 7109 } 7110 7111 set_float_exception_flags(old_exc_flags, status); 7112 float_raise(float_flag_invalid, status); 7113 return res; 7114 } 7115 7116 uint16_t float32_to_uint16(float32 a, float_status *status) 7117 { 7118 int32_t v; 7119 uint16_t res; 7120 int old_exc_flags = get_float_exception_flags(status); 7121 7122 v = float32_to_int32(a, status); 7123 if (v < 0) { 7124 res = 0; 7125 } else if (v > 0xffff) { 7126 res = 0xffff; 7127 } else { 7128 return v; 7129 } 7130 7131 set_float_exception_flags(old_exc_flags, status); 7132 float_raise(float_flag_invalid, status); 7133 return res; 7134 } 7135 7136 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *status) 7137 { 7138 int64_t v; 7139 uint16_t res; 7140 int old_exc_flags = get_float_exception_flags(status); 7141 7142 v = float32_to_int64_round_to_zero(a, status); 7143 if (v < 0) { 7144 res = 0; 7145 } else if (v > 0xffff) { 7146 res = 0xffff; 7147 } else { 7148 return v; 7149 } 7150 set_float_exception_flags(old_exc_flags, status); 7151 float_raise(float_flag_invalid, status); 7152 return res; 7153 } 7154 7155 uint32_t float64_to_uint32(float64 a, float_status *status) 7156 { 7157 uint64_t v; 7158 uint32_t res; 7159 int old_exc_flags = get_float_exception_flags(status); 7160 7161 v = float64_to_uint64(a, status); 7162 if (v > 0xffffffff) { 7163 res = 0xffffffff; 7164 } else { 7165 return v; 7166 } 7167 set_float_exception_flags(old_exc_flags, status); 7168 float_raise(float_flag_invalid, status); 7169 return res; 7170 } 7171 7172 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status) 7173 { 7174 uint64_t v; 7175 uint32_t res; 7176 int old_exc_flags = get_float_exception_flags(status); 7177 7178 v = float64_to_uint64_round_to_zero(a, status); 7179 if (v > 0xffffffff) { 7180 res = 
0xffffffff; 7181 } else { 7182 return v; 7183 } 7184 set_float_exception_flags(old_exc_flags, status); 7185 float_raise(float_flag_invalid, status); 7186 return res; 7187 } 7188 7189 int16_t float64_to_int16(float64 a, float_status *status) 7190 { 7191 int64_t v; 7192 int16_t res; 7193 int old_exc_flags = get_float_exception_flags(status); 7194 7195 v = float64_to_int32(a, status); 7196 if (v < -0x8000) { 7197 res = -0x8000; 7198 } else if (v > 0x7fff) { 7199 res = 0x7fff; 7200 } else { 7201 return v; 7202 } 7203 7204 set_float_exception_flags(old_exc_flags, status); 7205 float_raise(float_flag_invalid, status); 7206 return res; 7207 } 7208 7209 uint16_t float64_to_uint16(float64 a, float_status *status) 7210 { 7211 int64_t v; 7212 uint16_t res; 7213 int old_exc_flags = get_float_exception_flags(status); 7214 7215 v = float64_to_int32(a, status); 7216 if (v < 0) { 7217 res = 0; 7218 } else if (v > 0xffff) { 7219 res = 0xffff; 7220 } else { 7221 return v; 7222 } 7223 7224 set_float_exception_flags(old_exc_flags, status); 7225 float_raise(float_flag_invalid, status); 7226 return res; 7227 } 7228 7229 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *status) 7230 { 7231 int64_t v; 7232 uint16_t res; 7233 int old_exc_flags = get_float_exception_flags(status); 7234 7235 v = float64_to_int64_round_to_zero(a, status); 7236 if (v < 0) { 7237 res = 0; 7238 } else if (v > 0xffff) { 7239 res = 0xffff; 7240 } else { 7241 return v; 7242 } 7243 set_float_exception_flags(old_exc_flags, status); 7244 float_raise(float_flag_invalid, status); 7245 return res; 7246 } 7247 7248 /*---------------------------------------------------------------------------- 7249 | Returns the result of converting the double-precision floating-point value 7250 | `a' to the 64-bit unsigned integer format. 
The conversion is 7251 | performed according to the IEC/IEEE Standard for Binary Floating-Point 7252 | Arithmetic---which means in particular that the conversion is rounded 7253 | according to the current rounding mode. If `a' is a NaN, the largest 7254 | positive integer is returned. If the conversion overflows, the 7255 | largest unsigned integer is returned. If 'a' is negative, the value is 7256 | rounded and zero is returned; negative values that do not round to zero 7257 | will raise the inexact exception. 7258 *----------------------------------------------------------------------------*/ 7259 7260 uint64_t float64_to_uint64(float64 a, float_status *status) 7261 { 7262 flag aSign; 7263 int aExp; 7264 int shiftCount; 7265 uint64_t aSig, aSigExtra; 7266 a = float64_squash_input_denormal(a, status); 7267 7268 aSig = extractFloat64Frac(a); 7269 aExp = extractFloat64Exp(a); 7270 aSign = extractFloat64Sign(a); 7271 if (aSign && (aExp > 1022)) { 7272 float_raise(float_flag_invalid, status); 7273 if (float64_is_any_nan(a)) { 7274 return LIT64(0xFFFFFFFFFFFFFFFF); 7275 } else { 7276 return 0; 7277 } 7278 } 7279 if (aExp) { 7280 aSig |= LIT64(0x0010000000000000); 7281 } 7282 shiftCount = 0x433 - aExp; 7283 if (shiftCount <= 0) { 7284 if (0x43E < aExp) { 7285 float_raise(float_flag_invalid, status); 7286 return LIT64(0xFFFFFFFFFFFFFFFF); 7287 } 7288 aSigExtra = 0; 7289 aSig <<= -shiftCount; 7290 } else { 7291 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra); 7292 } 7293 return roundAndPackUint64(aSign, aSig, aSigExtra, status); 7294 } 7295 7296 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status) 7297 { 7298 signed char current_rounding_mode = status->float_rounding_mode; 7299 set_float_rounding_mode(float_round_to_zero, status); 7300 uint64_t v = float64_to_uint64(a, status); 7301 set_float_rounding_mode(current_rounding_mode, status); 7302 return v; 7303 } 7304 7305 #define COMPARE(s, nan_exp) \ 7306 static inline int float ## s ## 
_compare_internal(float ## s a, float ## s b,\ 7307 int is_quiet, float_status *status) \ 7308 { \ 7309 flag aSign, bSign; \ 7310 uint ## s ## _t av, bv; \ 7311 a = float ## s ## _squash_input_denormal(a, status); \ 7312 b = float ## s ## _squash_input_denormal(b, status); \ 7313 \ 7314 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \ 7315 extractFloat ## s ## Frac( a ) ) || \ 7316 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \ 7317 extractFloat ## s ## Frac( b ) )) { \ 7318 if (!is_quiet || \ 7319 float ## s ## _is_signaling_nan(a, status) || \ 7320 float ## s ## _is_signaling_nan(b, status)) { \ 7321 float_raise(float_flag_invalid, status); \ 7322 } \ 7323 return float_relation_unordered; \ 7324 } \ 7325 aSign = extractFloat ## s ## Sign( a ); \ 7326 bSign = extractFloat ## s ## Sign( b ); \ 7327 av = float ## s ## _val(a); \ 7328 bv = float ## s ## _val(b); \ 7329 if ( aSign != bSign ) { \ 7330 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \ 7331 /* zero case */ \ 7332 return float_relation_equal; \ 7333 } else { \ 7334 return 1 - (2 * aSign); \ 7335 } \ 7336 } else { \ 7337 if (av == bv) { \ 7338 return float_relation_equal; \ 7339 } else { \ 7340 return 1 - 2 * (aSign ^ ( av < bv )); \ 7341 } \ 7342 } \ 7343 } \ 7344 \ 7345 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \ 7346 { \ 7347 return float ## s ## _compare_internal(a, b, 0, status); \ 7348 } \ 7349 \ 7350 int float ## s ## _compare_quiet(float ## s a, float ## s b, \ 7351 float_status *status) \ 7352 { \ 7353 return float ## s ## _compare_internal(a, b, 1, status); \ 7354 } 7355 7356 COMPARE(32, 0xff) 7357 COMPARE(64, 0x7ff) 7358 7359 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 7360 int is_quiet, float_status *status) 7361 { 7362 flag aSign, bSign; 7363 7364 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 7365 float_raise(float_flag_invalid, status); 7366 return float_relation_unordered; 7367 } 7368 if (( ( 
extractFloatx80Exp( a ) == 0x7fff ) && 7369 ( extractFloatx80Frac( a )<<1 ) ) || 7370 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 7371 ( extractFloatx80Frac( b )<<1 ) )) { 7372 if (!is_quiet || 7373 floatx80_is_signaling_nan(a, status) || 7374 floatx80_is_signaling_nan(b, status)) { 7375 float_raise(float_flag_invalid, status); 7376 } 7377 return float_relation_unordered; 7378 } 7379 aSign = extractFloatx80Sign( a ); 7380 bSign = extractFloatx80Sign( b ); 7381 if ( aSign != bSign ) { 7382 7383 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 7384 ( ( a.low | b.low ) == 0 ) ) { 7385 /* zero case */ 7386 return float_relation_equal; 7387 } else { 7388 return 1 - (2 * aSign); 7389 } 7390 } else { 7391 if (a.low == b.low && a.high == b.high) { 7392 return float_relation_equal; 7393 } else { 7394 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7395 } 7396 } 7397 } 7398 7399 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7400 { 7401 return floatx80_compare_internal(a, b, 0, status); 7402 } 7403 7404 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 7405 { 7406 return floatx80_compare_internal(a, b, 1, status); 7407 } 7408 7409 static inline int float128_compare_internal(float128 a, float128 b, 7410 int is_quiet, float_status *status) 7411 { 7412 flag aSign, bSign; 7413 7414 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7415 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7416 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7417 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7418 if (!is_quiet || 7419 float128_is_signaling_nan(a, status) || 7420 float128_is_signaling_nan(b, status)) { 7421 float_raise(float_flag_invalid, status); 7422 } 7423 return float_relation_unordered; 7424 } 7425 aSign = extractFloat128Sign( a ); 7426 bSign = extractFloat128Sign( b ); 7427 if ( aSign != bSign ) { 7428 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7429 /* zero case */ 
7430 return float_relation_equal; 7431 } else { 7432 return 1 - (2 * aSign); 7433 } 7434 } else { 7435 if (a.low == b.low && a.high == b.high) { 7436 return float_relation_equal; 7437 } else { 7438 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7439 } 7440 } 7441 } 7442 7443 int float128_compare(float128 a, float128 b, float_status *status) 7444 { 7445 return float128_compare_internal(a, b, 0, status); 7446 } 7447 7448 int float128_compare_quiet(float128 a, float128 b, float_status *status) 7449 { 7450 return float128_compare_internal(a, b, 1, status); 7451 } 7452 7453 /* min() and max() functions. These can't be implemented as 7454 * 'compare and pick one input' because that would mishandle 7455 * NaNs and +0 vs -0. 7456 * 7457 * minnum() and maxnum() functions. These are similar to the min() 7458 * and max() functions but if one of the arguments is a QNaN and 7459 * the other is numerical then the numerical argument is returned. 7460 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 7461 * and maxNum() operations. min() and max() are the typical min/max 7462 * semantics provided by many CPUs which predate that specification. 7463 * 7464 * minnummag() and maxnummag() functions correspond to minNumMag() 7465 * and minNumMag() from the IEEE-754 2008. 
7466 */ 7467 #define MINMAX(s) \ 7468 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \ 7469 int ismin, int isieee, \ 7470 int ismag, \ 7471 float_status *status) \ 7472 { \ 7473 flag aSign, bSign; \ 7474 uint ## s ## _t av, bv, aav, abv; \ 7475 a = float ## s ## _squash_input_denormal(a, status); \ 7476 b = float ## s ## _squash_input_denormal(b, status); \ 7477 if (float ## s ## _is_any_nan(a) || \ 7478 float ## s ## _is_any_nan(b)) { \ 7479 if (isieee) { \ 7480 if (float ## s ## _is_quiet_nan(a, status) && \ 7481 !float ## s ##_is_any_nan(b)) { \ 7482 return b; \ 7483 } else if (float ## s ## _is_quiet_nan(b, status) && \ 7484 !float ## s ## _is_any_nan(a)) { \ 7485 return a; \ 7486 } \ 7487 } \ 7488 return propagateFloat ## s ## NaN(a, b, status); \ 7489 } \ 7490 aSign = extractFloat ## s ## Sign(a); \ 7491 bSign = extractFloat ## s ## Sign(b); \ 7492 av = float ## s ## _val(a); \ 7493 bv = float ## s ## _val(b); \ 7494 if (ismag) { \ 7495 aav = float ## s ## _abs(av); \ 7496 abv = float ## s ## _abs(bv); \ 7497 if (aav != abv) { \ 7498 if (ismin) { \ 7499 return (aav < abv) ? a : b; \ 7500 } else { \ 7501 return (aav < abv) ? b : a; \ 7502 } \ 7503 } \ 7504 } \ 7505 if (aSign != bSign) { \ 7506 if (ismin) { \ 7507 return aSign ? a : b; \ 7508 } else { \ 7509 return aSign ? b : a; \ 7510 } \ 7511 } else { \ 7512 if (ismin) { \ 7513 return (aSign ^ (av < bv)) ? a : b; \ 7514 } else { \ 7515 return (aSign ^ (av < bv)) ? 
b : a; \ 7516 } \ 7517 } \ 7518 } \ 7519 \ 7520 float ## s float ## s ## _min(float ## s a, float ## s b, \ 7521 float_status *status) \ 7522 { \ 7523 return float ## s ## _minmax(a, b, 1, 0, 0, status); \ 7524 } \ 7525 \ 7526 float ## s float ## s ## _max(float ## s a, float ## s b, \ 7527 float_status *status) \ 7528 { \ 7529 return float ## s ## _minmax(a, b, 0, 0, 0, status); \ 7530 } \ 7531 \ 7532 float ## s float ## s ## _minnum(float ## s a, float ## s b, \ 7533 float_status *status) \ 7534 { \ 7535 return float ## s ## _minmax(a, b, 1, 1, 0, status); \ 7536 } \ 7537 \ 7538 float ## s float ## s ## _maxnum(float ## s a, float ## s b, \ 7539 float_status *status) \ 7540 { \ 7541 return float ## s ## _minmax(a, b, 0, 1, 0, status); \ 7542 } \ 7543 \ 7544 float ## s float ## s ## _minnummag(float ## s a, float ## s b, \ 7545 float_status *status) \ 7546 { \ 7547 return float ## s ## _minmax(a, b, 1, 1, 1, status); \ 7548 } \ 7549 \ 7550 float ## s float ## s ## _maxnummag(float ## s a, float ## s b, \ 7551 float_status *status) \ 7552 { \ 7553 return float ## s ## _minmax(a, b, 0, 1, 1, status); \ 7554 } 7555 7556 MINMAX(32) 7557 MINMAX(64) 7558 7559 7560 /* Multiply A by 2 raised to the power N. 
*/ 7561 float32 float32_scalbn(float32 a, int n, float_status *status) 7562 { 7563 flag aSign; 7564 int16_t aExp; 7565 uint32_t aSig; 7566 7567 a = float32_squash_input_denormal(a, status); 7568 aSig = extractFloat32Frac( a ); 7569 aExp = extractFloat32Exp( a ); 7570 aSign = extractFloat32Sign( a ); 7571 7572 if ( aExp == 0xFF ) { 7573 if ( aSig ) { 7574 return propagateFloat32NaN(a, a, status); 7575 } 7576 return a; 7577 } 7578 if (aExp != 0) { 7579 aSig |= 0x00800000; 7580 } else if (aSig == 0) { 7581 return a; 7582 } else { 7583 aExp++; 7584 } 7585 7586 if (n > 0x200) { 7587 n = 0x200; 7588 } else if (n < -0x200) { 7589 n = -0x200; 7590 } 7591 7592 aExp += n - 1; 7593 aSig <<= 7; 7594 return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status); 7595 } 7596 7597 float64 float64_scalbn(float64 a, int n, float_status *status) 7598 { 7599 flag aSign; 7600 int16_t aExp; 7601 uint64_t aSig; 7602 7603 a = float64_squash_input_denormal(a, status); 7604 aSig = extractFloat64Frac( a ); 7605 aExp = extractFloat64Exp( a ); 7606 aSign = extractFloat64Sign( a ); 7607 7608 if ( aExp == 0x7FF ) { 7609 if ( aSig ) { 7610 return propagateFloat64NaN(a, a, status); 7611 } 7612 return a; 7613 } 7614 if (aExp != 0) { 7615 aSig |= LIT64( 0x0010000000000000 ); 7616 } else if (aSig == 0) { 7617 return a; 7618 } else { 7619 aExp++; 7620 } 7621 7622 if (n > 0x1000) { 7623 n = 0x1000; 7624 } else if (n < -0x1000) { 7625 n = -0x1000; 7626 } 7627 7628 aExp += n - 1; 7629 aSig <<= 10; 7630 return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status); 7631 } 7632 7633 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7634 { 7635 flag aSign; 7636 int32_t aExp; 7637 uint64_t aSig; 7638 7639 if (floatx80_invalid_encoding(a)) { 7640 float_raise(float_flag_invalid, status); 7641 return floatx80_default_nan(status); 7642 } 7643 aSig = extractFloatx80Frac( a ); 7644 aExp = extractFloatx80Exp( a ); 7645 aSign = extractFloatx80Sign( a ); 7646 7647 if ( aExp == 0x7FFF ) { 7648 if ( 
aSig<<1 ) { 7649 return propagateFloatx80NaN(a, a, status); 7650 } 7651 return a; 7652 } 7653 7654 if (aExp == 0) { 7655 if (aSig == 0) { 7656 return a; 7657 } 7658 aExp++; 7659 } 7660 7661 if (n > 0x10000) { 7662 n = 0x10000; 7663 } else if (n < -0x10000) { 7664 n = -0x10000; 7665 } 7666 7667 aExp += n; 7668 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7669 aSign, aExp, aSig, 0, status); 7670 } 7671 7672 float128 float128_scalbn(float128 a, int n, float_status *status) 7673 { 7674 flag aSign; 7675 int32_t aExp; 7676 uint64_t aSig0, aSig1; 7677 7678 aSig1 = extractFloat128Frac1( a ); 7679 aSig0 = extractFloat128Frac0( a ); 7680 aExp = extractFloat128Exp( a ); 7681 aSign = extractFloat128Sign( a ); 7682 if ( aExp == 0x7FFF ) { 7683 if ( aSig0 | aSig1 ) { 7684 return propagateFloat128NaN(a, a, status); 7685 } 7686 return a; 7687 } 7688 if (aExp != 0) { 7689 aSig0 |= LIT64( 0x0001000000000000 ); 7690 } else if (aSig0 == 0 && aSig1 == 0) { 7691 return a; 7692 } else { 7693 aExp++; 7694 } 7695 7696 if (n > 0x10000) { 7697 n = 0x10000; 7698 } else if (n < -0x10000) { 7699 n = -0x10000; 7700 } 7701 7702 aExp += n - 1; 7703 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7704 , status); 7705 7706 } 7707