1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include "qemu/bitops.h" 87 #include "fpu/softfloat.h" 88 89 /* We only need stdlib for abort() */ 90 91 /*---------------------------------------------------------------------------- 92 | Primitive arithmetic functions, including multi-word arithmetic, and 93 | division and square root approximations. (Can be specialized to target if 94 | desired.) 95 *----------------------------------------------------------------------------*/ 96 #include "softfloat-macros.h" 97 98 /*---------------------------------------------------------------------------- 99 | Functions and definitions to determine: (1) whether tininess for underflow 100 | is detected before or after rounding by default, (2) what (if anything) 101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 103 | are propagated from function inputs to output. These details are target- 104 | specific. 
105 *----------------------------------------------------------------------------*/ 106 #include "softfloat-specialize.h" 107 108 /*---------------------------------------------------------------------------- 109 | Returns the fraction bits of the half-precision floating-point value `a'. 110 *----------------------------------------------------------------------------*/ 111 112 static inline uint32_t extractFloat16Frac(float16 a) 113 { 114 return float16_val(a) & 0x3ff; 115 } 116 117 /*---------------------------------------------------------------------------- 118 | Returns the exponent bits of the half-precision floating-point value `a'. 119 *----------------------------------------------------------------------------*/ 120 121 static inline int extractFloat16Exp(float16 a) 122 { 123 return (float16_val(a) >> 10) & 0x1f; 124 } 125 126 /*---------------------------------------------------------------------------- 127 | Returns the sign bit of the single-precision floating-point value `a'. 128 *----------------------------------------------------------------------------*/ 129 130 static inline flag extractFloat16Sign(float16 a) 131 { 132 return float16_val(a)>>15; 133 } 134 135 /*---------------------------------------------------------------------------- 136 | Returns the fraction bits of the single-precision floating-point value `a'. 137 *----------------------------------------------------------------------------*/ 138 139 static inline uint32_t extractFloat32Frac(float32 a) 140 { 141 return float32_val(a) & 0x007FFFFF; 142 } 143 144 /*---------------------------------------------------------------------------- 145 | Returns the exponent bits of the single-precision floating-point value `a'. 
146 *----------------------------------------------------------------------------*/ 147 148 static inline int extractFloat32Exp(float32 a) 149 { 150 return (float32_val(a) >> 23) & 0xFF; 151 } 152 153 /*---------------------------------------------------------------------------- 154 | Returns the sign bit of the single-precision floating-point value `a'. 155 *----------------------------------------------------------------------------*/ 156 157 static inline flag extractFloat32Sign(float32 a) 158 { 159 return float32_val(a) >> 31; 160 } 161 162 /*---------------------------------------------------------------------------- 163 | Returns the fraction bits of the double-precision floating-point value `a'. 164 *----------------------------------------------------------------------------*/ 165 166 static inline uint64_t extractFloat64Frac(float64 a) 167 { 168 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF); 169 } 170 171 /*---------------------------------------------------------------------------- 172 | Returns the exponent bits of the double-precision floating-point value `a'. 173 *----------------------------------------------------------------------------*/ 174 175 static inline int extractFloat64Exp(float64 a) 176 { 177 return (float64_val(a) >> 52) & 0x7FF; 178 } 179 180 /*---------------------------------------------------------------------------- 181 | Returns the sign bit of the double-precision floating-point value `a'. 182 *----------------------------------------------------------------------------*/ 183 184 static inline flag extractFloat64Sign(float64 a) 185 { 186 return float64_val(a) >> 63; 187 } 188 189 /* 190 * Classify a floating point number. Everything above float_class_qnan 191 * is a NaN so cls >= float_class_qnan is any NaN. 
192 */ 193 194 typedef enum __attribute__ ((__packed__)) { 195 float_class_unclassified, 196 float_class_zero, 197 float_class_normal, 198 float_class_inf, 199 float_class_qnan, /* all NaNs from here */ 200 float_class_snan, 201 float_class_dnan, 202 float_class_msnan, /* maybe silenced */ 203 } FloatClass; 204 205 /* 206 * Structure holding all of the decomposed parts of a float. The 207 * exponent is unbiased and the fraction is normalized. All 208 * calculations are done with a 64 bit fraction and then rounded as 209 * appropriate for the final format. 210 * 211 * Thanks to the packed FloatClass a decent compiler should be able to 212 * fit the whole structure into registers and avoid using the stack 213 * for parameter passing. 214 */ 215 216 typedef struct { 217 uint64_t frac; 218 int32_t exp; 219 FloatClass cls; 220 bool sign; 221 } FloatParts; 222 223 #define DECOMPOSED_BINARY_POINT (64 - 2) 224 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) 225 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1) 226 227 /* Structure holding all of the relevant parameters for a format. 
228 * exp_size: the size of the exponent field 229 * exp_bias: the offset applied to the exponent field 230 * exp_max: the maximum normalised exponent 231 * frac_size: the size of the fraction field 232 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 233 * The following are computed based the size of fraction 234 * frac_lsb: least significant bit of fraction 235 * fram_lsbm1: the bit bellow the least significant bit (for rounding) 236 * round_mask/roundeven_mask: masks used for rounding 237 */ 238 typedef struct { 239 int exp_size; 240 int exp_bias; 241 int exp_max; 242 int frac_size; 243 int frac_shift; 244 uint64_t frac_lsb; 245 uint64_t frac_lsbm1; 246 uint64_t round_mask; 247 uint64_t roundeven_mask; 248 } FloatFmt; 249 250 /* Expand fields based on the size of exponent and fraction */ 251 #define FLOAT_PARAMS(E, F) \ 252 .exp_size = E, \ 253 .exp_bias = ((1 << E) - 1) >> 1, \ 254 .exp_max = (1 << E) - 1, \ 255 .frac_size = F, \ 256 .frac_shift = DECOMPOSED_BINARY_POINT - F, \ 257 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \ 258 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \ 259 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \ 260 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1 261 262 static const FloatFmt float16_params = { 263 FLOAT_PARAMS(5, 10) 264 }; 265 266 static const FloatFmt float32_params = { 267 FLOAT_PARAMS(8, 23) 268 }; 269 270 static const FloatFmt float64_params = { 271 FLOAT_PARAMS(11, 52) 272 }; 273 274 /* Unpack a float to parts, but do not canonicalize. 
*/ 275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) 276 { 277 const int sign_pos = fmt.frac_size + fmt.exp_size; 278 279 return (FloatParts) { 280 .cls = float_class_unclassified, 281 .sign = extract64(raw, sign_pos, 1), 282 .exp = extract64(raw, fmt.frac_size, fmt.exp_size), 283 .frac = extract64(raw, 0, fmt.frac_size), 284 }; 285 } 286 287 static inline FloatParts float16_unpack_raw(float16 f) 288 { 289 return unpack_raw(float16_params, f); 290 } 291 292 static inline FloatParts float32_unpack_raw(float32 f) 293 { 294 return unpack_raw(float32_params, f); 295 } 296 297 static inline FloatParts float64_unpack_raw(float64 f) 298 { 299 return unpack_raw(float64_params, f); 300 } 301 302 /* Pack a float from parts, but do not canonicalize. */ 303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p) 304 { 305 const int sign_pos = fmt.frac_size + fmt.exp_size; 306 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp); 307 return deposit64(ret, sign_pos, 1, p.sign); 308 } 309 310 static inline float16 float16_pack_raw(FloatParts p) 311 { 312 return make_float16(pack_raw(float16_params, p)); 313 } 314 315 static inline float32 float32_pack_raw(FloatParts p) 316 { 317 return make_float32(pack_raw(float32_params, p)); 318 } 319 320 static inline float64 float64_pack_raw(FloatParts p) 321 { 322 return make_float64(pack_raw(float64_params, p)); 323 } 324 325 /* Canonicalize EXP and FRAC, setting CLS. 
*/ 326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm, 327 float_status *status) 328 { 329 if (part.exp == parm->exp_max) { 330 if (part.frac == 0) { 331 part.cls = float_class_inf; 332 } else { 333 #ifdef NO_SIGNALING_NANS 334 part.cls = float_class_qnan; 335 #else 336 int64_t msb = part.frac << (parm->frac_shift + 2); 337 if ((msb < 0) == status->snan_bit_is_one) { 338 part.cls = float_class_snan; 339 } else { 340 part.cls = float_class_qnan; 341 } 342 #endif 343 } 344 } else if (part.exp == 0) { 345 if (likely(part.frac == 0)) { 346 part.cls = float_class_zero; 347 } else if (status->flush_inputs_to_zero) { 348 float_raise(float_flag_input_denormal, status); 349 part.cls = float_class_zero; 350 part.frac = 0; 351 } else { 352 int shift = clz64(part.frac) - 1; 353 part.cls = float_class_normal; 354 part.exp = parm->frac_shift - parm->exp_bias - shift + 1; 355 part.frac <<= shift; 356 } 357 } else { 358 part.cls = float_class_normal; 359 part.exp -= parm->exp_bias; 360 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift); 361 } 362 return part; 363 } 364 365 /* Round and uncanonicalize a floating-point number by parts. There 366 * are FRAC_SHIFT bits that may require rounding at the bottom of the 367 * fraction; these bits will be removed. The exponent will be biased 368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0]. 
369 */ 370 371 static FloatParts round_canonical(FloatParts p, float_status *s, 372 const FloatFmt *parm) 373 { 374 const uint64_t frac_lsbm1 = parm->frac_lsbm1; 375 const uint64_t round_mask = parm->round_mask; 376 const uint64_t roundeven_mask = parm->roundeven_mask; 377 const int exp_max = parm->exp_max; 378 const int frac_shift = parm->frac_shift; 379 uint64_t frac, inc; 380 int exp, flags = 0; 381 bool overflow_norm; 382 383 frac = p.frac; 384 exp = p.exp; 385 386 switch (p.cls) { 387 case float_class_normal: 388 switch (s->float_rounding_mode) { 389 case float_round_nearest_even: 390 overflow_norm = false; 391 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 392 break; 393 case float_round_ties_away: 394 overflow_norm = false; 395 inc = frac_lsbm1; 396 break; 397 case float_round_to_zero: 398 overflow_norm = true; 399 inc = 0; 400 break; 401 case float_round_up: 402 inc = p.sign ? 0 : round_mask; 403 overflow_norm = p.sign; 404 break; 405 case float_round_down: 406 inc = p.sign ? round_mask : 0; 407 overflow_norm = !p.sign; 408 break; 409 default: 410 g_assert_not_reached(); 411 } 412 413 exp += parm->exp_bias; 414 if (likely(exp > 0)) { 415 if (frac & round_mask) { 416 flags |= float_flag_inexact; 417 frac += inc; 418 if (frac & DECOMPOSED_OVERFLOW_BIT) { 419 frac >>= 1; 420 exp++; 421 } 422 } 423 frac >>= frac_shift; 424 425 if (unlikely(exp >= exp_max)) { 426 flags |= float_flag_overflow | float_flag_inexact; 427 if (overflow_norm) { 428 exp = exp_max - 1; 429 frac = -1; 430 } else { 431 p.cls = float_class_inf; 432 goto do_inf; 433 } 434 } 435 } else if (s->flush_to_zero) { 436 flags |= float_flag_output_denormal; 437 p.cls = float_class_zero; 438 goto do_zero; 439 } else { 440 bool is_tiny = (s->float_detect_tininess 441 == float_tininess_before_rounding) 442 || (exp < 0) 443 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT); 444 445 shift64RightJamming(frac, 1 - exp, &frac); 446 if (frac & round_mask) { 447 /* Need to recompute round-to-even. 
*/ 448 if (s->float_rounding_mode == float_round_nearest_even) { 449 inc = ((frac & roundeven_mask) != frac_lsbm1 450 ? frac_lsbm1 : 0); 451 } 452 flags |= float_flag_inexact; 453 frac += inc; 454 } 455 456 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0); 457 frac >>= frac_shift; 458 459 if (is_tiny && (flags & float_flag_inexact)) { 460 flags |= float_flag_underflow; 461 } 462 if (exp == 0 && frac == 0) { 463 p.cls = float_class_zero; 464 } 465 } 466 break; 467 468 case float_class_zero: 469 do_zero: 470 exp = 0; 471 frac = 0; 472 break; 473 474 case float_class_inf: 475 do_inf: 476 exp = exp_max; 477 frac = 0; 478 break; 479 480 case float_class_qnan: 481 case float_class_snan: 482 exp = exp_max; 483 break; 484 485 default: 486 g_assert_not_reached(); 487 } 488 489 float_raise(flags, s); 490 p.exp = exp; 491 p.frac = frac; 492 return p; 493 } 494 495 static FloatParts float16_unpack_canonical(float16 f, float_status *s) 496 { 497 return canonicalize(float16_unpack_raw(f), &float16_params, s); 498 } 499 500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s) 501 { 502 switch (p.cls) { 503 case float_class_dnan: 504 return float16_default_nan(s); 505 case float_class_msnan: 506 return float16_maybe_silence_nan(float16_pack_raw(p), s); 507 default: 508 p = round_canonical(p, s, &float16_params); 509 return float16_pack_raw(p); 510 } 511 } 512 513 static FloatParts float32_unpack_canonical(float32 f, float_status *s) 514 { 515 return canonicalize(float32_unpack_raw(f), &float32_params, s); 516 } 517 518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s) 519 { 520 switch (p.cls) { 521 case float_class_dnan: 522 return float32_default_nan(s); 523 case float_class_msnan: 524 return float32_maybe_silence_nan(float32_pack_raw(p), s); 525 default: 526 p = round_canonical(p, s, &float32_params); 527 return float32_pack_raw(p); 528 } 529 } 530 531 static FloatParts float64_unpack_canonical(float64 f, float_status *s) 532 { 533 
return canonicalize(float64_unpack_raw(f), &float64_params, s); 534 } 535 536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s) 537 { 538 switch (p.cls) { 539 case float_class_dnan: 540 return float64_default_nan(s); 541 case float_class_msnan: 542 return float64_maybe_silence_nan(float64_pack_raw(p), s); 543 default: 544 p = round_canonical(p, s, &float64_params); 545 return float64_pack_raw(p); 546 } 547 } 548 549 /* Simple helpers for checking if what NaN we have */ 550 static bool is_nan(FloatClass c) 551 { 552 return unlikely(c >= float_class_qnan); 553 } 554 static bool is_snan(FloatClass c) 555 { 556 return c == float_class_snan; 557 } 558 static bool is_qnan(FloatClass c) 559 { 560 return c == float_class_qnan; 561 } 562 563 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s) 564 { 565 if (is_snan(a.cls) || is_snan(b.cls)) { 566 s->float_exception_flags |= float_flag_invalid; 567 } 568 569 if (s->default_nan_mode) { 570 a.cls = float_class_dnan; 571 } else { 572 if (pickNaN(is_qnan(a.cls), is_snan(a.cls), 573 is_qnan(b.cls), is_snan(b.cls), 574 a.frac > b.frac || 575 (a.frac == b.frac && a.sign < b.sign))) { 576 a = b; 577 } 578 a.cls = float_class_msnan; 579 } 580 return a; 581 } 582 583 /* 584 * Returns the result of adding or subtracting the values of the 585 * floating-point values `a' and `b'. The operation is performed 586 * according to the IEC/IEEE Standard for Binary Floating-Point 587 * Arithmetic. 
588 */ 589 590 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract, 591 float_status *s) 592 { 593 bool a_sign = a.sign; 594 bool b_sign = b.sign ^ subtract; 595 596 if (a_sign != b_sign) { 597 /* Subtraction */ 598 599 if (a.cls == float_class_normal && b.cls == float_class_normal) { 600 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) { 601 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 602 a.frac = a.frac - b.frac; 603 } else { 604 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 605 a.frac = b.frac - a.frac; 606 a.exp = b.exp; 607 a_sign ^= 1; 608 } 609 610 if (a.frac == 0) { 611 a.cls = float_class_zero; 612 a.sign = s->float_rounding_mode == float_round_down; 613 } else { 614 int shift = clz64(a.frac) - 1; 615 a.frac = a.frac << shift; 616 a.exp = a.exp - shift; 617 a.sign = a_sign; 618 } 619 return a; 620 } 621 if (is_nan(a.cls) || is_nan(b.cls)) { 622 return pick_nan(a, b, s); 623 } 624 if (a.cls == float_class_inf) { 625 if (b.cls == float_class_inf) { 626 float_raise(float_flag_invalid, s); 627 a.cls = float_class_dnan; 628 } 629 return a; 630 } 631 if (a.cls == float_class_zero && b.cls == float_class_zero) { 632 a.sign = s->float_rounding_mode == float_round_down; 633 return a; 634 } 635 if (a.cls == float_class_zero || b.cls == float_class_inf) { 636 b.sign = a_sign ^ 1; 637 return b; 638 } 639 if (b.cls == float_class_zero) { 640 return a; 641 } 642 } else { 643 /* Addition */ 644 if (a.cls == float_class_normal && b.cls == float_class_normal) { 645 if (a.exp > b.exp) { 646 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 647 } else if (a.exp < b.exp) { 648 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 649 a.exp = b.exp; 650 } 651 a.frac += b.frac; 652 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 653 a.frac >>= 1; 654 a.exp += 1; 655 } 656 return a; 657 } 658 if (is_nan(a.cls) || is_nan(b.cls)) { 659 return pick_nan(a, b, s); 660 } 661 if (a.cls == float_class_inf || b.cls == float_class_zero) { 662 
return a; 663 } 664 if (b.cls == float_class_inf || a.cls == float_class_zero) { 665 b.sign = b_sign; 666 return b; 667 } 668 } 669 g_assert_not_reached(); 670 } 671 672 /* 673 * Returns the result of adding or subtracting the floating-point 674 * values `a' and `b'. The operation is performed according to the 675 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 676 */ 677 678 float16 __attribute__((flatten)) float16_add(float16 a, float16 b, 679 float_status *status) 680 { 681 FloatParts pa = float16_unpack_canonical(a, status); 682 FloatParts pb = float16_unpack_canonical(b, status); 683 FloatParts pr = addsub_floats(pa, pb, false, status); 684 685 return float16_round_pack_canonical(pr, status); 686 } 687 688 float32 __attribute__((flatten)) float32_add(float32 a, float32 b, 689 float_status *status) 690 { 691 FloatParts pa = float32_unpack_canonical(a, status); 692 FloatParts pb = float32_unpack_canonical(b, status); 693 FloatParts pr = addsub_floats(pa, pb, false, status); 694 695 return float32_round_pack_canonical(pr, status); 696 } 697 698 float64 __attribute__((flatten)) float64_add(float64 a, float64 b, 699 float_status *status) 700 { 701 FloatParts pa = float64_unpack_canonical(a, status); 702 FloatParts pb = float64_unpack_canonical(b, status); 703 FloatParts pr = addsub_floats(pa, pb, false, status); 704 705 return float64_round_pack_canonical(pr, status); 706 } 707 708 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b, 709 float_status *status) 710 { 711 FloatParts pa = float16_unpack_canonical(a, status); 712 FloatParts pb = float16_unpack_canonical(b, status); 713 FloatParts pr = addsub_floats(pa, pb, true, status); 714 715 return float16_round_pack_canonical(pr, status); 716 } 717 718 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b, 719 float_status *status) 720 { 721 FloatParts pa = float32_unpack_canonical(a, status); 722 FloatParts pb = float32_unpack_canonical(b, status); 723 FloatParts pr = 
addsub_floats(pa, pb, true, status); 724 725 return float32_round_pack_canonical(pr, status); 726 } 727 728 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b, 729 float_status *status) 730 { 731 FloatParts pa = float64_unpack_canonical(a, status); 732 FloatParts pb = float64_unpack_canonical(b, status); 733 FloatParts pr = addsub_floats(pa, pb, true, status); 734 735 return float64_round_pack_canonical(pr, status); 736 } 737 738 /*---------------------------------------------------------------------------- 739 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 740 | and 7, and returns the properly rounded 32-bit integer corresponding to the 741 | input. If `zSign' is 1, the input is negated before being converted to an 742 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 743 | is simply rounded to an integer, with the inexact exception raised if the 744 | input cannot be represented exactly as an integer. However, if the fixed- 745 | point input is too large, the invalid exception is raised and the largest 746 | positive or negative integer is returned. 747 *----------------------------------------------------------------------------*/ 748 749 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status) 750 { 751 int8_t roundingMode; 752 flag roundNearestEven; 753 int8_t roundIncrement, roundBits; 754 int32_t z; 755 756 roundingMode = status->float_rounding_mode; 757 roundNearestEven = ( roundingMode == float_round_nearest_even ); 758 switch (roundingMode) { 759 case float_round_nearest_even: 760 case float_round_ties_away: 761 roundIncrement = 0x40; 762 break; 763 case float_round_to_zero: 764 roundIncrement = 0; 765 break; 766 case float_round_up: 767 roundIncrement = zSign ? 0 : 0x7f; 768 break; 769 case float_round_down: 770 roundIncrement = zSign ? 
0x7f : 0; 771 break; 772 default: 773 abort(); 774 } 775 roundBits = absZ & 0x7F; 776 absZ = ( absZ + roundIncrement )>>7; 777 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 778 z = absZ; 779 if ( zSign ) z = - z; 780 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 781 float_raise(float_flag_invalid, status); 782 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 783 } 784 if (roundBits) { 785 status->float_exception_flags |= float_flag_inexact; 786 } 787 return z; 788 789 } 790 791 /*---------------------------------------------------------------------------- 792 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 793 | `absZ1', with binary point between bits 63 and 64 (between the input words), 794 | and returns the properly rounded 64-bit integer corresponding to the input. 795 | If `zSign' is 1, the input is negated before being converted to an integer. 796 | Ordinarily, the fixed-point input is simply rounded to an integer, with 797 | the inexact exception raised if the input cannot be represented exactly as 798 | an integer. However, if the fixed-point input is too large, the invalid 799 | exception is raised and the largest positive or negative integer is 800 | returned. 
801 *----------------------------------------------------------------------------*/ 802 803 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 804 float_status *status) 805 { 806 int8_t roundingMode; 807 flag roundNearestEven, increment; 808 int64_t z; 809 810 roundingMode = status->float_rounding_mode; 811 roundNearestEven = ( roundingMode == float_round_nearest_even ); 812 switch (roundingMode) { 813 case float_round_nearest_even: 814 case float_round_ties_away: 815 increment = ((int64_t) absZ1 < 0); 816 break; 817 case float_round_to_zero: 818 increment = 0; 819 break; 820 case float_round_up: 821 increment = !zSign && absZ1; 822 break; 823 case float_round_down: 824 increment = zSign && absZ1; 825 break; 826 default: 827 abort(); 828 } 829 if ( increment ) { 830 ++absZ0; 831 if ( absZ0 == 0 ) goto overflow; 832 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 833 } 834 z = absZ0; 835 if ( zSign ) z = - z; 836 if ( z && ( ( z < 0 ) ^ zSign ) ) { 837 overflow: 838 float_raise(float_flag_invalid, status); 839 return 840 zSign ? (int64_t) LIT64( 0x8000000000000000 ) 841 : LIT64( 0x7FFFFFFFFFFFFFFF ); 842 } 843 if (absZ1) { 844 status->float_exception_flags |= float_flag_inexact; 845 } 846 return z; 847 848 } 849 850 /*---------------------------------------------------------------------------- 851 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 852 | `absZ1', with binary point between bits 63 and 64 (between the input words), 853 | and returns the properly rounded 64-bit unsigned integer corresponding to the 854 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 855 | with the inexact exception raised if the input cannot be represented exactly 856 | as an integer. However, if the fixed-point input is too large, the invalid 857 | exception is raised and the largest unsigned integer is returned. 
858 *----------------------------------------------------------------------------*/ 859 860 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0, 861 uint64_t absZ1, float_status *status) 862 { 863 int8_t roundingMode; 864 flag roundNearestEven, increment; 865 866 roundingMode = status->float_rounding_mode; 867 roundNearestEven = (roundingMode == float_round_nearest_even); 868 switch (roundingMode) { 869 case float_round_nearest_even: 870 case float_round_ties_away: 871 increment = ((int64_t)absZ1 < 0); 872 break; 873 case float_round_to_zero: 874 increment = 0; 875 break; 876 case float_round_up: 877 increment = !zSign && absZ1; 878 break; 879 case float_round_down: 880 increment = zSign && absZ1; 881 break; 882 default: 883 abort(); 884 } 885 if (increment) { 886 ++absZ0; 887 if (absZ0 == 0) { 888 float_raise(float_flag_invalid, status); 889 return LIT64(0xFFFFFFFFFFFFFFFF); 890 } 891 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 892 } 893 894 if (zSign && absZ0) { 895 float_raise(float_flag_invalid, status); 896 return 0; 897 } 898 899 if (absZ1) { 900 status->float_exception_flags |= float_flag_inexact; 901 } 902 return absZ0; 903 } 904 905 /*---------------------------------------------------------------------------- 906 | If `a' is denormal and we are in flush-to-zero mode then set the 907 | input-denormal exception and return zero. Otherwise just return the value. 
908 *----------------------------------------------------------------------------*/ 909 float32 float32_squash_input_denormal(float32 a, float_status *status) 910 { 911 if (status->flush_inputs_to_zero) { 912 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 913 float_raise(float_flag_input_denormal, status); 914 return make_float32(float32_val(a) & 0x80000000); 915 } 916 } 917 return a; 918 } 919 920 /*---------------------------------------------------------------------------- 921 | Normalizes the subnormal single-precision floating-point value represented 922 | by the denormalized significand `aSig'. The normalized exponent and 923 | significand are stored at the locations pointed to by `zExpPtr' and 924 | `zSigPtr', respectively. 925 *----------------------------------------------------------------------------*/ 926 927 static void 928 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 929 { 930 int8_t shiftCount; 931 932 shiftCount = countLeadingZeros32( aSig ) - 8; 933 *zSigPtr = aSig<<shiftCount; 934 *zExpPtr = 1 - shiftCount; 935 936 } 937 938 /*---------------------------------------------------------------------------- 939 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 940 | single-precision floating-point value, returning the result. After being 941 | shifted into the proper positions, the three fields are simply added 942 | together to form the result. This means that any integer portion of `zSig' 943 | will be added into the exponent. Since a properly normalized significand 944 | will have an integer portion equal to 1, the `zExp' input should be 1 less 945 | than the desired result exponent whenever `zSig' is a complete, normalized 946 | significand. 
947 *----------------------------------------------------------------------------*/ 948 949 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig) 950 { 951 952 return make_float32( 953 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig); 954 955 } 956 957 /*---------------------------------------------------------------------------- 958 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 959 | and significand `zSig', and returns the proper single-precision floating- 960 | point value corresponding to the abstract input. Ordinarily, the abstract 961 | value is simply rounded and packed into the single-precision format, with 962 | the inexact exception raised if the abstract input cannot be represented 963 | exactly. However, if the abstract value is too large, the overflow and 964 | inexact exceptions are raised and an infinity or maximal finite value is 965 | returned. If the abstract value is too small, the input value is rounded to 966 | a subnormal number, and the underflow and inexact exceptions are raised if 967 | the abstract input cannot be represented exactly as a subnormal single- 968 | precision floating-point number. 969 | The input significand `zSig' has its binary point between bits 30 970 | and 29, which is 7 bits to the left of the usual location. This shifted 971 | significand must be normalized or smaller. If `zSig' is not normalized, 972 | `zExp' must be 0; in that case, the result returned is a subnormal number, 973 | and it must not require rounding. In the usual case that `zSig' is 974 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 975 | The handling of underflow and overflow follows the IEC/IEEE Standard for 976 | Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven;
    int8_t roundIncrement, roundBits;
    flag isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * roundIncrement is added to zSig just before its low 7 bits are
     * shifted out: 0x40 is half of the discarded range (round to nearest),
     * 0x7f bumps the result whenever any discarded bit is set (directed
     * rounding away from zero for this sign), 0 truncates.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    default:
        abort();
        break;
    }
    /* The 7 bits that will be discarded by the final shift. */
    roundBits = zSig & 0x7F;
    /*
     * One unsigned compare selects both special ranges: zExp >= 0xFD, and
     * negative zExp, which converts to a large uint16_t value (assuming
     * |zExp| stays well below 2^16 -- not checked here).
     */
    if ( 0xFD <= (uint16_t) zExp ) {
        if (    ( 0xFD < zExp )
             || (    ( zExp == 0xFD )
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /*
             * Overflow.  The significand argument is 0 when roundIncrement
             * is non-zero (packs to infinity) and all-ones when it is zero;
             * since packFloat32 adds its fields, all-ones subtracts 1 from
             * the packed infinity, giving the largest finite value.
             */
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
        }
        if ( zExp < 0 ) {
            /* Result is below the normal range. */
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            /* Tininess must be decided before zSig is shifted below. */
            isTiny =
                (status->float_detect_tininess
                 == float_tininess_before_rounding)
                || ( zExp < -1 )
                || ( zSig + roundIncrement < 0x80000000 );
            /*
             * NOTE(review): helper is defined elsewhere; presumably a right
             * shift that ORs the shifted-out bits into the LSB -- confirm.
             */
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x7F;
            /* Underflow is raised only when the tiny result is inexact. */
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig = ( zSig + roundIncrement )>>7;
    /*
     * Exact tie (discarded bits == 0x40) in nearest-even mode: clear the
     * LSB so the result is even.  In every other case the mask is ~0.
     */
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    /* A zero significand is packed with a zero exponent field. */
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );

}

/*---------------------------------------------------------------------------- 1045 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1046 | and significand `zSig', and returns the proper single-precision floating- 1047 | point value corresponding to the abstract input. This routine is just like 1048 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 1049 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 1050 | floating-point exponent. 1051 *----------------------------------------------------------------------------*/ 1052 1053 static float32 1054 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 1055 float_status *status) 1056 { 1057 int8_t shiftCount; 1058 1059 shiftCount = countLeadingZeros32( zSig ) - 1; 1060 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 1061 status); 1062 1063 } 1064 1065 /*---------------------------------------------------------------------------- 1066 | If `a' is denormal and we are in flush-to-zero mode then set the 1067 | input-denormal exception and return zero. Otherwise just return the value. 1068 *----------------------------------------------------------------------------*/ 1069 float64 float64_squash_input_denormal(float64 a, float_status *status) 1070 { 1071 if (status->flush_inputs_to_zero) { 1072 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 1073 float_raise(float_flag_input_denormal, status); 1074 return make_float64(float64_val(a) & (1ULL << 63)); 1075 } 1076 } 1077 return a; 1078 } 1079 1080 /*---------------------------------------------------------------------------- 1081 | Normalizes the subnormal double-precision floating-point value represented 1082 | by the denormalized significand `aSig'. The normalized exponent and 1083 | significand are stored at the locations pointed to by `zExpPtr' and 1084 | `zSigPtr', respectively. 
1085 *----------------------------------------------------------------------------*/ 1086 1087 static void 1088 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 1089 { 1090 int8_t shiftCount; 1091 1092 shiftCount = countLeadingZeros64( aSig ) - 11; 1093 *zSigPtr = aSig<<shiftCount; 1094 *zExpPtr = 1 - shiftCount; 1095 1096 } 1097 1098 /*---------------------------------------------------------------------------- 1099 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 1100 | double-precision floating-point value, returning the result. After being 1101 | shifted into the proper positions, the three fields are simply added 1102 | together to form the result. This means that any integer portion of `zSig' 1103 | will be added into the exponent. Since a properly normalized significand 1104 | will have an integer portion equal to 1, the `zExp' input should be 1 less 1105 | than the desired result exponent whenever `zSig' is a complete, normalized 1106 | significand. 1107 *----------------------------------------------------------------------------*/ 1108 1109 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 1110 { 1111 1112 return make_float64( 1113 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 1114 1115 } 1116 1117 /*---------------------------------------------------------------------------- 1118 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1119 | and significand `zSig', and returns the proper double-precision floating- 1120 | point value corresponding to the abstract input. Ordinarily, the abstract 1121 | value is simply rounded and packed into the double-precision format, with 1122 | the inexact exception raised if the abstract input cannot be represented 1123 | exactly. However, if the abstract value is too large, the overflow and 1124 | inexact exceptions are raised and an infinity or maximal finite value is 1125 | returned. 
| If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal double-
| precision floating-point number.
|     The input significand `zSig' has its binary point between bits 62
| and 61, which is 10 bits to the left of the usual location.  This shifted
| significand must be normalized or smaller.  If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding.  In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven;
    int roundIncrement, roundBits;
    flag isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * roundIncrement is added to zSig just before its low 10 bits are
     * shifted out: 0x200 is half of the discarded range (round to nearest),
     * 0x3ff bumps the result whenever any discarded bit is set, 0 truncates.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        /*
         * Bit 0x400 becomes the LSB of the result after the >>10 below.
         * If it is already 1, truncate; otherwise add 0x3ff so that any
         * non-zero discarded bits carry into it and make the result odd.
         */
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    /* The 10 bits that will be discarded by the final shift. */
    roundBits = zSig & 0x3FF;
    /*
     * One unsigned compare selects both special ranges: zExp >= 0x7FD, and
     * negative zExp, which converts to a large uint16_t value (assuming
     * |zExp| stays well below 2^16 -- not checked here).
     */
    if ( 0x7FD <= (uint16_t) zExp ) {
        if (    ( 0x7FD < zExp )
             || (    ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /*
             * Overflow.  Infinity is returned only when the mode rounds
             * away from zero for this sign; round-to-odd and truncating
             * cases return the largest finite value instead.  Passing
             * all-ones as the significand subtracts 1 from the packed
             * infinity because packFloat64 adds its fields.
             */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            /* Result is below the normal range. */
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            /* Tininess must be decided before zSig is shifted below. */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < -1 )
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
            /*
             * NOTE(review): helper is defined elsewhere; presumably a right
             * shift that ORs the shifted-out bits into the LSB -- confirm.
             */
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
            /* Underflow is raised only when the tiny result is inexact. */
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig = ( zSig + roundIncrement )>>10;
    /*
     * Exact tie (discarded bits == 0x200) in nearest-even mode: clear the
     * LSB so the result is even.  In every other case the mask is ~0.
     */
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
    /* A zero significand is packed with a zero exponent field. */
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper double-precision floating-
| point value corresponding to the abstract input.  This routine is just like
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
1220 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 1221 | floating-point exponent. 1222 *----------------------------------------------------------------------------*/ 1223 1224 static float64 1225 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 1226 float_status *status) 1227 { 1228 int8_t shiftCount; 1229 1230 shiftCount = countLeadingZeros64( zSig ) - 1; 1231 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 1232 status); 1233 1234 } 1235 1236 /*---------------------------------------------------------------------------- 1237 | Returns the fraction bits of the extended double-precision floating-point 1238 | value `a'. 1239 *----------------------------------------------------------------------------*/ 1240 1241 static inline uint64_t extractFloatx80Frac( floatx80 a ) 1242 { 1243 1244 return a.low; 1245 1246 } 1247 1248 /*---------------------------------------------------------------------------- 1249 | Returns the exponent bits of the extended double-precision floating-point 1250 | value `a'. 1251 *----------------------------------------------------------------------------*/ 1252 1253 static inline int32_t extractFloatx80Exp( floatx80 a ) 1254 { 1255 1256 return a.high & 0x7FFF; 1257 1258 } 1259 1260 /*---------------------------------------------------------------------------- 1261 | Returns the sign bit of the extended double-precision floating-point value 1262 | `a'. 1263 *----------------------------------------------------------------------------*/ 1264 1265 static inline flag extractFloatx80Sign( floatx80 a ) 1266 { 1267 1268 return a.high>>15; 1269 1270 } 1271 1272 /*---------------------------------------------------------------------------- 1273 | Normalizes the subnormal extended double-precision floating-point value 1274 | represented by the denormalized significand `aSig'. 
The normalized exponent 1275 | and significand are stored at the locations pointed to by `zExpPtr' and 1276 | `zSigPtr', respectively. 1277 *----------------------------------------------------------------------------*/ 1278 1279 static void 1280 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr ) 1281 { 1282 int8_t shiftCount; 1283 1284 shiftCount = countLeadingZeros64( aSig ); 1285 *zSigPtr = aSig<<shiftCount; 1286 *zExpPtr = 1 - shiftCount; 1287 1288 } 1289 1290 /*---------------------------------------------------------------------------- 1291 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 1292 | extended double-precision floating-point value, returning the result. 1293 *----------------------------------------------------------------------------*/ 1294 1295 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig ) 1296 { 1297 floatx80 z; 1298 1299 z.low = zSig; 1300 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp; 1301 return z; 1302 1303 } 1304 1305 /*---------------------------------------------------------------------------- 1306 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1307 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 1308 | and returns the proper extended double-precision floating-point value 1309 | corresponding to the abstract input. Ordinarily, the abstract value is 1310 | rounded and packed into the extended double-precision format, with the 1311 | inexact exception raised if the abstract input cannot be represented 1312 | exactly. However, if the abstract value is too large, the overflow and 1313 | inexact exceptions are raised and an infinity or maximal finite value is 1314 | returned. 
| If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal extended
| double-precision floating-point number.
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
| number of bits as single or double precision, respectively.  Otherwise, the
| result is rounded to the full precision of the extended double-precision
| format.
|     The input significand must be normalized or smaller.  If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding.  The
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
                                     int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                                     float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * Two independent paths: reduced precision (32 or 64), handled first,
     * and full precision at label precision80.  Any roundingPrecision other
     * than 32 or 64 takes the full-precision path.
     */
    if ( roundingPrecision == 80 ) goto precision80;
    /*
     * roundMask covers the low bits of zSig0 that are dropped (11 bits for
     * precision 64, 40 bits for precision 32); roundIncrement starts as
     * half of that range, i.e. round to nearest.
     */
    if ( roundingPrecision == 64 ) {
        roundIncrement = LIT64( 0x0000000000000400 );
        roundMask = LIT64( 0x00000000000007FF );
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = LIT64( 0x0000008000000000 );
        roundMask = LIT64( 0x000000FFFFFFFFFF );
    }
    else {
        goto precision80;
    }
    /* Fold all of zSig1 into a sticky bit at the bottom of zSig0. */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /*
     * Unsigned compare on zExp - 1: true for zExp >= 0x7FFE and, through
     * wrap-around, for zExp <= 0.
     */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            /* Jumps into the full-precision branch with roundMask intact. */
            goto overflow;
        }
        if ( zExp <= 0 ) {
            /* Subnormal result at reduced precision. */
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            /* Tininess must be decided before zSig0 is shifted below. */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ( zSig0 <= zSig0 + roundIncrement );
            /*
             * NOTE(review): helper is defined elsewhere; presumably a right
             * shift that ORs the shifted-out bits into the LSB -- confirm.
             */
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                status->float_exception_flags |= float_flag_inexact;
            }
            zSig0 += roundIncrement;
            /* Rounding set bit 63 of the significand: exponent becomes 1. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            /*
             * roundIncrement is reused as the weight of the lowest kept
             * bit.  On an exact tie in nearest-even mode that bit is added
             * to the mask so it is cleared together with the dropped bits.
             */
            roundIncrement = roundMask + 1;
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig0 += roundIncrement;
    /* The addition wrapped: renormalize to 1.0 with the next exponent. */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = LIT64( 0x8000000000000000 );
    }
    /* Same tie-to-even masking as in the subnormal case above. */
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /*
     * Full precision: zSig1 holds the bits below the result.  Round up when
     * its top bit is set (nearest modes) or when it is non-zero and the
     * directed mode points away from zero for this sign.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
                  && increment
                )
           ) {
            /* Full precision: ~roundMask below is an all-ones significand. */
            roundMask = 0;
 overflow:
            /*
             * Shared by both paths.  When reached from the reduced-precision
             * path, roundMask still holds that precision's mask, so
             * ~roundMask is the largest significand at that precision.
             */
            float_raise(float_flag_overflow | float_flag_inexact, status);
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                /* Mode rounds toward zero for this sign: largest finite. */
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            /* Otherwise infinity: exponent 0x7FFF, only bit 63 set. */
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
        }
        if ( zExp <= 0 ) {
            /*
             * Subnormal result at full precision.
             * NOTE(review): unlike the reduced-precision path, this branch
             * does not test status->flush_to_zero -- confirm intended.
             */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ! increment
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                status->float_exception_flags |= float_flag_inexact;
            }
            /* zSig1 changed in the shift, so recompute the decision. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /*
                 * zSig1<<1 == 0 means no bits below the top one are set;
                 * in nearest-even mode that is an exact tie, so clear the
                 * LSB to make the result even.
                 */
                zSig0 &=
                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
                /* Rounding set bit 63: exponent becomes 1. */
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        status->float_exception_flags |= float_flag_inexact;
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Significand wrapped: renormalize with the next exponent. */
            ++zExp;
            zSig0 = LIT64( 0x8000000000000000 );
        }
        else {
            /* Exact tie in nearest-even mode: clear the LSB. */
            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input.  This routine is just like
| `roundAndPackFloatx80' except that the input significand does not have to be
| normalized.
*----------------------------------------------------------------------------*/

static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
                                              flag zSign, int32_t zExp,
                                              uint64_t zSig0, uint64_t zSig1,
                                              float_status *status)
{
    int8_t shiftCount;

    /* Upper word empty: move the lower word up by a whole 64 bits first. */
    if ( zSig0 == 0 ) {
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    /* Shift the 128-bit significand left by the leading zeros of zSig0. */
    shiftCount = countLeadingZeros64( zSig0 );
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    zExp -= shiftCount;
    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
                                zSig0, zSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the least-significant 64 fraction bits of the quadruple-precision
| floating-point value `a'.
1548 *----------------------------------------------------------------------------*/ 1549 1550 static inline uint64_t extractFloat128Frac1( float128 a ) 1551 { 1552 1553 return a.low; 1554 1555 } 1556 1557 /*---------------------------------------------------------------------------- 1558 | Returns the most-significant 48 fraction bits of the quadruple-precision 1559 | floating-point value `a'. 1560 *----------------------------------------------------------------------------*/ 1561 1562 static inline uint64_t extractFloat128Frac0( float128 a ) 1563 { 1564 1565 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 1566 1567 } 1568 1569 /*---------------------------------------------------------------------------- 1570 | Returns the exponent bits of the quadruple-precision floating-point value 1571 | `a'. 1572 *----------------------------------------------------------------------------*/ 1573 1574 static inline int32_t extractFloat128Exp( float128 a ) 1575 { 1576 1577 return ( a.high>>48 ) & 0x7FFF; 1578 1579 } 1580 1581 /*---------------------------------------------------------------------------- 1582 | Returns the sign bit of the quadruple-precision floating-point value `a'. 1583 *----------------------------------------------------------------------------*/ 1584 1585 static inline flag extractFloat128Sign( float128 a ) 1586 { 1587 1588 return a.high>>63; 1589 1590 } 1591 1592 /*---------------------------------------------------------------------------- 1593 | Normalizes the subnormal quadruple-precision floating-point value 1594 | represented by the denormalized significand formed by the concatenation of 1595 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 1596 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 1597 | significand are stored at the location pointed to by `zSig0Ptr', and the 1598 | least significant 64 bits of the normalized significand are stored at the 1599 | location pointed to by `zSig1Ptr'. 
1600 *----------------------------------------------------------------------------*/ 1601 1602 static void 1603 normalizeFloat128Subnormal( 1604 uint64_t aSig0, 1605 uint64_t aSig1, 1606 int32_t *zExpPtr, 1607 uint64_t *zSig0Ptr, 1608 uint64_t *zSig1Ptr 1609 ) 1610 { 1611 int8_t shiftCount; 1612 1613 if ( aSig0 == 0 ) { 1614 shiftCount = countLeadingZeros64( aSig1 ) - 15; 1615 if ( shiftCount < 0 ) { 1616 *zSig0Ptr = aSig1>>( - shiftCount ); 1617 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 1618 } 1619 else { 1620 *zSig0Ptr = aSig1<<shiftCount; 1621 *zSig1Ptr = 0; 1622 } 1623 *zExpPtr = - shiftCount - 63; 1624 } 1625 else { 1626 shiftCount = countLeadingZeros64( aSig0 ) - 15; 1627 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 1628 *zExpPtr = 1 - shiftCount; 1629 } 1630 1631 } 1632 1633 /*---------------------------------------------------------------------------- 1634 | Packs the sign `zSign', the exponent `zExp', and the significand formed 1635 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 1636 | floating-point value, returning the result. After being shifted into the 1637 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 1638 | added together to form the most significant 32 bits of the result. This 1639 | means that any integer portion of `zSig0' will be added into the exponent. 1640 | Since a properly normalized significand will have an integer portion equal 1641 | to 1, the `zExp' input should be 1 less than the desired result exponent 1642 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 1643 | significand. 
1644 *----------------------------------------------------------------------------*/ 1645 1646 static inline float128 1647 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 1648 { 1649 float128 z; 1650 1651 z.low = zSig1; 1652 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 1653 return z; 1654 1655 } 1656 1657 /*---------------------------------------------------------------------------- 1658 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1659 | and extended significand formed by the concatenation of `zSig0', `zSig1', 1660 | and `zSig2', and returns the proper quadruple-precision floating-point value 1661 | corresponding to the abstract input. Ordinarily, the abstract value is 1662 | simply rounded and packed into the quadruple-precision format, with the 1663 | inexact exception raised if the abstract input cannot be represented 1664 | exactly. However, if the abstract value is too large, the overflow and 1665 | inexact exceptions are raised and an infinity or maximal finite value is 1666 | returned. If the abstract value is too small, the input value is rounded to 1667 | a subnormal number, and the underflow and inexact exceptions are raised if 1668 | the abstract input cannot be represented exactly as a subnormal quadruple- 1669 | precision floating-point number. 1670 | The input significand must be normalized or smaller. If the input 1671 | significand is not normalized, `zExp' must be 0; in that case, the result 1672 | returned is a subnormal number, and it must not require rounding. In the 1673 | usual case that the input significand is normalized, `zExp' must be 1 less 1674 | than the ``true'' floating-point exponent. The handling of underflow and 1675 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * zSig2 holds the bits below the result.  `increment' says whether the
     * 128-bit significand zSig0:zSig1 must be bumped by one.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Top bit of zSig2 set: at least half a unit in the last place. */
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        /* Inexact and the LSB is even: bump so that the LSB becomes odd. */
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    /*
     * Unsigned compare: true for zExp >= 0x7FFD and, through wrap-around,
     * for negative zExp.
     */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        if (    ( 0x7FFD < zExp )
             || (    ( zExp == 0x7FFD )
                  && eq128(
                         LIT64( 0x0001FFFFFFFFFFFF ),
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            /*
             * Overflow: the exponent is too large, or it is the largest
             * one and the all-ones significand is about to be incremented.
             */
            float_raise(float_flag_overflow | float_flag_inexact, status);
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                /* These modes return the largest finite value. */
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        LIT64( 0x0000FFFFFFFFFFFF ),
                        LIT64( 0xFFFFFFFFFFFFFFFF )
                    );
            }
            /* All other modes return infinity. */
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            /* Result is below the normal range. */
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            /* Tininess must be decided before the significand is shifted. */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < -1 )
                || ! increment
                || lt128(
                       zSig0,
                       zSig1,
                       LIT64( 0x0001FFFFFFFFFFFF ),
                       LIT64( 0xFFFFFFFFFFFFFFFF )
                   );
            /*
             * NOTE(review): helper is defined elsewhere; presumably a
             * 192-bit right shift that keeps lost bits sticky in zSig2 --
             * confirm.
             */
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            /* Underflow is raised only when the tiny result is inexact. */
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* The significand changed, so recompute the rounding decision. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        status->float_exception_flags |= float_flag_inexact;
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /*
         * zSig2 + zSig2 == 0 means no bits below the top one are set; in
         * nearest-even mode that is an exact tie, so clear the LSB.
         */
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
    }
    else {
        /* A zero significand is packed with a zero exponent field. */
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand formed by the concatenation of `zSig0' and `zSig1', and
| returns the proper quadruple-precision floating-point value corresponding
| to the abstract input.  This routine is just like `roundAndPackFloat128'
| except that the input significand has fewer bits and does not have to be
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
| point exponent.
1801 *----------------------------------------------------------------------------*/ 1802 1803 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 1804 uint64_t zSig0, uint64_t zSig1, 1805 float_status *status) 1806 { 1807 int8_t shiftCount; 1808 uint64_t zSig2; 1809 1810 if ( zSig0 == 0 ) { 1811 zSig0 = zSig1; 1812 zSig1 = 0; 1813 zExp -= 64; 1814 } 1815 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1816 if ( 0 <= shiftCount ) { 1817 zSig2 = 0; 1818 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1819 } 1820 else { 1821 shift128ExtraRightJamming( 1822 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1823 } 1824 zExp -= shiftCount; 1825 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 1826 1827 } 1828 1829 /*---------------------------------------------------------------------------- 1830 | Returns the result of converting the 32-bit two's complement integer `a' 1831 | to the single-precision floating-point format. The conversion is performed 1832 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1833 *----------------------------------------------------------------------------*/ 1834 1835 float32 int32_to_float32(int32_t a, float_status *status) 1836 { 1837 flag zSign; 1838 1839 if ( a == 0 ) return float32_zero; 1840 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1841 zSign = ( a < 0 ); 1842 return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status); 1843 } 1844 1845 /*---------------------------------------------------------------------------- 1846 | Returns the result of converting the 32-bit two's complement integer `a' 1847 | to the double-precision floating-point format. The conversion is performed 1848 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1849 *----------------------------------------------------------------------------*/ 1850 1851 float64 int32_to_float64(int32_t a, float_status *status) 1852 { 1853 flag zSign; 1854 uint32_t absA; 1855 int8_t shiftCount; 1856 uint64_t zSig; 1857 1858 if ( a == 0 ) return float64_zero; 1859 zSign = ( a < 0 ); 1860 absA = zSign ? - a : a; 1861 shiftCount = countLeadingZeros32( absA ) + 21; 1862 zSig = absA; 1863 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1864 1865 } 1866 1867 /*---------------------------------------------------------------------------- 1868 | Returns the result of converting the 32-bit two's complement integer `a' 1869 | to the extended double-precision floating-point format. The conversion 1870 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1871 | Arithmetic. 1872 *----------------------------------------------------------------------------*/ 1873 1874 floatx80 int32_to_floatx80(int32_t a, float_status *status) 1875 { 1876 flag zSign; 1877 uint32_t absA; 1878 int8_t shiftCount; 1879 uint64_t zSig; 1880 1881 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1882 zSign = ( a < 0 ); 1883 absA = zSign ? - a : a; 1884 shiftCount = countLeadingZeros32( absA ) + 32; 1885 zSig = absA; 1886 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1887 1888 } 1889 1890 /*---------------------------------------------------------------------------- 1891 | Returns the result of converting the 32-bit two's complement integer `a' to 1892 | the quadruple-precision floating-point format. The conversion is performed 1893 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1894 *----------------------------------------------------------------------------*/ 1895 1896 float128 int32_to_float128(int32_t a, float_status *status) 1897 { 1898 flag zSign; 1899 uint32_t absA; 1900 int8_t shiftCount; 1901 uint64_t zSig0; 1902 1903 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1904 zSign = ( a < 0 ); 1905 absA = zSign ? - a : a; 1906 shiftCount = countLeadingZeros32( absA ) + 17; 1907 zSig0 = absA; 1908 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1909 1910 } 1911 1912 /*---------------------------------------------------------------------------- 1913 | Returns the result of converting the 64-bit two's complement integer `a' 1914 | to the single-precision floating-point format. The conversion is performed 1915 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1916 *----------------------------------------------------------------------------*/ 1917 1918 float32 int64_to_float32(int64_t a, float_status *status) 1919 { 1920 flag zSign; 1921 uint64_t absA; 1922 int8_t shiftCount; 1923 1924 if ( a == 0 ) return float32_zero; 1925 zSign = ( a < 0 ); 1926 absA = zSign ? - a : a; 1927 shiftCount = countLeadingZeros64( absA ) - 40; 1928 if ( 0 <= shiftCount ) { 1929 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1930 } 1931 else { 1932 shiftCount += 7; 1933 if ( shiftCount < 0 ) { 1934 shift64RightJamming( absA, - shiftCount, &absA ); 1935 } 1936 else { 1937 absA <<= shiftCount; 1938 } 1939 return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status); 1940 } 1941 1942 } 1943 1944 /*---------------------------------------------------------------------------- 1945 | Returns the result of converting the 64-bit two's complement integer `a' 1946 | to the double-precision floating-point format. The conversion is performed 1947 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1948 *----------------------------------------------------------------------------*/ 1949 1950 float64 int64_to_float64(int64_t a, float_status *status) 1951 { 1952 flag zSign; 1953 1954 if ( a == 0 ) return float64_zero; 1955 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) { 1956 return packFloat64( 1, 0x43E, 0 ); 1957 } 1958 zSign = ( a < 0 ); 1959 return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status); 1960 } 1961 1962 /*---------------------------------------------------------------------------- 1963 | Returns the result of converting the 64-bit two's complement integer `a' 1964 | to the extended double-precision floating-point format. The conversion 1965 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1966 | Arithmetic. 1967 *----------------------------------------------------------------------------*/ 1968 1969 floatx80 int64_to_floatx80(int64_t a, float_status *status) 1970 { 1971 flag zSign; 1972 uint64_t absA; 1973 int8_t shiftCount; 1974 1975 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1976 zSign = ( a < 0 ); 1977 absA = zSign ? - a : a; 1978 shiftCount = countLeadingZeros64( absA ); 1979 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1980 1981 } 1982 1983 /*---------------------------------------------------------------------------- 1984 | Returns the result of converting the 64-bit two's complement integer `a' to 1985 | the quadruple-precision floating-point format. The conversion is performed 1986 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1987 *----------------------------------------------------------------------------*/ 1988 1989 float128 int64_to_float128(int64_t a, float_status *status) 1990 { 1991 flag zSign; 1992 uint64_t absA; 1993 int8_t shiftCount; 1994 int32_t zExp; 1995 uint64_t zSig0, zSig1; 1996 1997 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1998 zSign = ( a < 0 ); 1999 absA = zSign ? 
- a : a; 2000 shiftCount = countLeadingZeros64( absA ) + 49; 2001 zExp = 0x406E - shiftCount; 2002 if ( 64 <= shiftCount ) { 2003 zSig1 = 0; 2004 zSig0 = absA; 2005 shiftCount -= 64; 2006 } 2007 else { 2008 zSig1 = absA; 2009 zSig0 = 0; 2010 } 2011 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 2012 return packFloat128( zSign, zExp, zSig0, zSig1 ); 2013 2014 } 2015 2016 /*---------------------------------------------------------------------------- 2017 | Returns the result of converting the 64-bit unsigned integer `a' 2018 | to the single-precision floating-point format. The conversion is performed 2019 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2020 *----------------------------------------------------------------------------*/ 2021 2022 float32 uint64_to_float32(uint64_t a, float_status *status) 2023 { 2024 int shiftcount; 2025 2026 if (a == 0) { 2027 return float32_zero; 2028 } 2029 2030 /* Determine (left) shift needed to put first set bit into bit posn 23 2031 * (since packFloat32() expects the binary point between bits 23 and 22); 2032 * this is the fast case for smallish numbers. 2033 */ 2034 shiftcount = countLeadingZeros64(a) - 40; 2035 if (shiftcount >= 0) { 2036 return packFloat32(0, 0x95 - shiftcount, a << shiftcount); 2037 } 2038 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32() 2039 * expects the binary point between bits 30 and 29, hence the + 7. 2040 */ 2041 shiftcount += 7; 2042 if (shiftcount < 0) { 2043 shift64RightJamming(a, -shiftcount, &a); 2044 } else { 2045 a <<= shiftcount; 2046 } 2047 2048 return roundAndPackFloat32(0, 0x9c - shiftcount, a, status); 2049 } 2050 2051 /*---------------------------------------------------------------------------- 2052 | Returns the result of converting the 64-bit unsigned integer `a' 2053 | to the double-precision floating-point format. The conversion is performed 2054 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2055 *----------------------------------------------------------------------------*/ 2056 2057 float64 uint64_to_float64(uint64_t a, float_status *status) 2058 { 2059 int exp = 0x43C; 2060 int shiftcount; 2061 2062 if (a == 0) { 2063 return float64_zero; 2064 } 2065 2066 shiftcount = countLeadingZeros64(a) - 1; 2067 if (shiftcount < 0) { 2068 shift64RightJamming(a, -shiftcount, &a); 2069 } else { 2070 a <<= shiftcount; 2071 } 2072 return roundAndPackFloat64(0, exp - shiftcount, a, status); 2073 } 2074 2075 /*---------------------------------------------------------------------------- 2076 | Returns the result of converting the 64-bit unsigned integer `a' 2077 | to the quadruple-precision floating-point format. The conversion is performed 2078 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2079 *----------------------------------------------------------------------------*/ 2080 2081 float128 uint64_to_float128(uint64_t a, float_status *status) 2082 { 2083 if (a == 0) { 2084 return float128_zero; 2085 } 2086 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status); 2087 } 2088 2089 /*---------------------------------------------------------------------------- 2090 | Returns the result of converting the single-precision floating-point value 2091 | `a' to the 32-bit two's complement integer format. The conversion is 2092 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2093 | Arithmetic---which means in particular that the conversion is rounded 2094 | according to the current rounding mode. If `a' is a NaN, the largest 2095 | positive integer is returned. Otherwise, if the conversion overflows, the 2096 | largest integer with the same sign as `a' is returned. 
2097 *----------------------------------------------------------------------------*/ 2098 2099 int32_t float32_to_int32(float32 a, float_status *status) 2100 { 2101 flag aSign; 2102 int aExp; 2103 int shiftCount; 2104 uint32_t aSig; 2105 uint64_t aSig64; 2106 2107 a = float32_squash_input_denormal(a, status); 2108 aSig = extractFloat32Frac( a ); 2109 aExp = extractFloat32Exp( a ); 2110 aSign = extractFloat32Sign( a ); 2111 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 2112 if ( aExp ) aSig |= 0x00800000; 2113 shiftCount = 0xAF - aExp; 2114 aSig64 = aSig; 2115 aSig64 <<= 32; 2116 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 2117 return roundAndPackInt32(aSign, aSig64, status); 2118 2119 } 2120 2121 /*---------------------------------------------------------------------------- 2122 | Returns the result of converting the single-precision floating-point value 2123 | `a' to the 32-bit two's complement integer format. The conversion is 2124 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2125 | Arithmetic, except that the conversion is always rounded toward zero. 2126 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2127 | the conversion overflows, the largest integer with the same sign as `a' is 2128 | returned. 2129 *----------------------------------------------------------------------------*/ 2130 2131 int32_t float32_to_int32_round_to_zero(float32 a, float_status *status) 2132 { 2133 flag aSign; 2134 int aExp; 2135 int shiftCount; 2136 uint32_t aSig; 2137 int32_t z; 2138 a = float32_squash_input_denormal(a, status); 2139 2140 aSig = extractFloat32Frac( a ); 2141 aExp = extractFloat32Exp( a ); 2142 aSign = extractFloat32Sign( a ); 2143 shiftCount = aExp - 0x9E; 2144 if ( 0 <= shiftCount ) { 2145 if ( float32_val(a) != 0xCF000000 ) { 2146 float_raise(float_flag_invalid, status); 2147 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 2148 } 2149 return (int32_t) 0x80000000; 2150 } 2151 else if ( aExp <= 0x7E ) { 2152 if (aExp | aSig) { 2153 status->float_exception_flags |= float_flag_inexact; 2154 } 2155 return 0; 2156 } 2157 aSig = ( aSig | 0x00800000 )<<8; 2158 z = aSig>>( - shiftCount ); 2159 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) { 2160 status->float_exception_flags |= float_flag_inexact; 2161 } 2162 if ( aSign ) z = - z; 2163 return z; 2164 2165 } 2166 2167 /*---------------------------------------------------------------------------- 2168 | Returns the result of converting the single-precision floating-point value 2169 | `a' to the 16-bit two's complement integer format. The conversion is 2170 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2171 | Arithmetic, except that the conversion is always rounded toward zero. 2172 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2173 | the conversion overflows, the largest integer with the same sign as `a' is 2174 | returned. 2175 *----------------------------------------------------------------------------*/ 2176 2177 int16_t float32_to_int16_round_to_zero(float32 a, float_status *status) 2178 { 2179 flag aSign; 2180 int aExp; 2181 int shiftCount; 2182 uint32_t aSig; 2183 int32_t z; 2184 2185 aSig = extractFloat32Frac( a ); 2186 aExp = extractFloat32Exp( a ); 2187 aSign = extractFloat32Sign( a ); 2188 shiftCount = aExp - 0x8E; 2189 if ( 0 <= shiftCount ) { 2190 if ( float32_val(a) != 0xC7000000 ) { 2191 float_raise(float_flag_invalid, status); 2192 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 2193 return 0x7FFF; 2194 } 2195 } 2196 return (int32_t) 0xffff8000; 2197 } 2198 else if ( aExp <= 0x7E ) { 2199 if ( aExp | aSig ) { 2200 status->float_exception_flags |= float_flag_inexact; 2201 } 2202 return 0; 2203 } 2204 shiftCount -= 0x10; 2205 aSig = ( aSig | 0x00800000 )<<8; 2206 z = aSig>>( - shiftCount ); 2207 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) { 2208 status->float_exception_flags |= float_flag_inexact; 2209 } 2210 if ( aSign ) { 2211 z = - z; 2212 } 2213 return z; 2214 2215 } 2216 2217 /*---------------------------------------------------------------------------- 2218 | Returns the result of converting the single-precision floating-point value 2219 | `a' to the 64-bit two's complement integer format. The conversion is 2220 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2221 | Arithmetic---which means in particular that the conversion is rounded 2222 | according to the current rounding mode. If `a' is a NaN, the largest 2223 | positive integer is returned. Otherwise, if the conversion overflows, the 2224 | largest integer with the same sign as `a' is returned. 2225 *----------------------------------------------------------------------------*/ 2226 2227 int64_t float32_to_int64(float32 a, float_status *status) 2228 { 2229 flag aSign; 2230 int aExp; 2231 int shiftCount; 2232 uint32_t aSig; 2233 uint64_t aSig64, aSigExtra; 2234 a = float32_squash_input_denormal(a, status); 2235 2236 aSig = extractFloat32Frac( a ); 2237 aExp = extractFloat32Exp( a ); 2238 aSign = extractFloat32Sign( a ); 2239 shiftCount = 0xBE - aExp; 2240 if ( shiftCount < 0 ) { 2241 float_raise(float_flag_invalid, status); 2242 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 2243 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2244 } 2245 return (int64_t) LIT64( 0x8000000000000000 ); 2246 } 2247 if ( aExp ) aSig |= 0x00800000; 2248 aSig64 = aSig; 2249 aSig64 <<= 40; 2250 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 2251 return roundAndPackInt64(aSign, aSig64, aSigExtra, status); 2252 2253 } 2254 2255 /*---------------------------------------------------------------------------- 2256 | Returns the result of converting the single-precision floating-point value 2257 | `a' to the 64-bit unsigned integer format. The conversion is 2258 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2259 | Arithmetic---which means in particular that the conversion is rounded 2260 | according to the current rounding mode. If `a' is a NaN, the largest 2261 | unsigned integer is returned. Otherwise, if the conversion overflows, the 2262 | largest unsigned integer is returned. If the 'a' is negative, the result 2263 | is rounded and zero is returned; values that do not round to zero will 2264 | raise the inexact exception flag. 
2265 *----------------------------------------------------------------------------*/ 2266 2267 uint64_t float32_to_uint64(float32 a, float_status *status) 2268 { 2269 flag aSign; 2270 int aExp; 2271 int shiftCount; 2272 uint32_t aSig; 2273 uint64_t aSig64, aSigExtra; 2274 a = float32_squash_input_denormal(a, status); 2275 2276 aSig = extractFloat32Frac(a); 2277 aExp = extractFloat32Exp(a); 2278 aSign = extractFloat32Sign(a); 2279 if ((aSign) && (aExp > 126)) { 2280 float_raise(float_flag_invalid, status); 2281 if (float32_is_any_nan(a)) { 2282 return LIT64(0xFFFFFFFFFFFFFFFF); 2283 } else { 2284 return 0; 2285 } 2286 } 2287 shiftCount = 0xBE - aExp; 2288 if (aExp) { 2289 aSig |= 0x00800000; 2290 } 2291 if (shiftCount < 0) { 2292 float_raise(float_flag_invalid, status); 2293 return LIT64(0xFFFFFFFFFFFFFFFF); 2294 } 2295 2296 aSig64 = aSig; 2297 aSig64 <<= 40; 2298 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra); 2299 return roundAndPackUint64(aSign, aSig64, aSigExtra, status); 2300 } 2301 2302 /*---------------------------------------------------------------------------- 2303 | Returns the result of converting the single-precision floating-point value 2304 | `a' to the 64-bit unsigned integer format. The conversion is 2305 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2306 | Arithmetic, except that the conversion is always rounded toward zero. If 2307 | `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the 2308 | conversion overflows, the largest unsigned integer is returned. If the 2309 | 'a' is negative, the result is rounded and zero is returned; values that do 2310 | not round to zero will raise the inexact flag. 
2311 *----------------------------------------------------------------------------*/ 2312 2313 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status) 2314 { 2315 signed char current_rounding_mode = status->float_rounding_mode; 2316 set_float_rounding_mode(float_round_to_zero, status); 2317 int64_t v = float32_to_uint64(a, status); 2318 set_float_rounding_mode(current_rounding_mode, status); 2319 return v; 2320 } 2321 2322 /*---------------------------------------------------------------------------- 2323 | Returns the result of converting the single-precision floating-point value 2324 | `a' to the 64-bit two's complement integer format. The conversion is 2325 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2326 | Arithmetic, except that the conversion is always rounded toward zero. If 2327 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 2328 | conversion overflows, the largest integer with the same sign as `a' is 2329 | returned. 2330 *----------------------------------------------------------------------------*/ 2331 2332 int64_t float32_to_int64_round_to_zero(float32 a, float_status *status) 2333 { 2334 flag aSign; 2335 int aExp; 2336 int shiftCount; 2337 uint32_t aSig; 2338 uint64_t aSig64; 2339 int64_t z; 2340 a = float32_squash_input_denormal(a, status); 2341 2342 aSig = extractFloat32Frac( a ); 2343 aExp = extractFloat32Exp( a ); 2344 aSign = extractFloat32Sign( a ); 2345 shiftCount = aExp - 0xBE; 2346 if ( 0 <= shiftCount ) { 2347 if ( float32_val(a) != 0xDF000000 ) { 2348 float_raise(float_flag_invalid, status); 2349 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 2350 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2351 } 2352 } 2353 return (int64_t) LIT64( 0x8000000000000000 ); 2354 } 2355 else if ( aExp <= 0x7E ) { 2356 if (aExp | aSig) { 2357 status->float_exception_flags |= float_flag_inexact; 2358 } 2359 return 0; 2360 } 2361 aSig64 = aSig | 0x00800000; 2362 aSig64 <<= 40; 2363 z = aSig64>>( - shiftCount ); 2364 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) { 2365 status->float_exception_flags |= float_flag_inexact; 2366 } 2367 if ( aSign ) z = - z; 2368 return z; 2369 2370 } 2371 2372 /*---------------------------------------------------------------------------- 2373 | Returns the result of converting the single-precision floating-point value 2374 | `a' to the double-precision floating-point format. The conversion is 2375 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2376 | Arithmetic. 2377 *----------------------------------------------------------------------------*/ 2378 2379 float64 float32_to_float64(float32 a, float_status *status) 2380 { 2381 flag aSign; 2382 int aExp; 2383 uint32_t aSig; 2384 a = float32_squash_input_denormal(a, status); 2385 2386 aSig = extractFloat32Frac( a ); 2387 aExp = extractFloat32Exp( a ); 2388 aSign = extractFloat32Sign( a ); 2389 if ( aExp == 0xFF ) { 2390 if (aSig) { 2391 return commonNaNToFloat64(float32ToCommonNaN(a, status), status); 2392 } 2393 return packFloat64( aSign, 0x7FF, 0 ); 2394 } 2395 if ( aExp == 0 ) { 2396 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 2397 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2398 --aExp; 2399 } 2400 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 ); 2401 2402 } 2403 2404 /*---------------------------------------------------------------------------- 2405 | Returns the result of converting the single-precision floating-point value 2406 | `a' to the extended double-precision floating-point format. 
The conversion 2407 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 2408 | Arithmetic. 2409 *----------------------------------------------------------------------------*/ 2410 2411 floatx80 float32_to_floatx80(float32 a, float_status *status) 2412 { 2413 flag aSign; 2414 int aExp; 2415 uint32_t aSig; 2416 2417 a = float32_squash_input_denormal(a, status); 2418 aSig = extractFloat32Frac( a ); 2419 aExp = extractFloat32Exp( a ); 2420 aSign = extractFloat32Sign( a ); 2421 if ( aExp == 0xFF ) { 2422 if (aSig) { 2423 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 2424 } 2425 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2426 } 2427 if ( aExp == 0 ) { 2428 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 2429 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2430 } 2431 aSig |= 0x00800000; 2432 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 2433 2434 } 2435 2436 /*---------------------------------------------------------------------------- 2437 | Returns the result of converting the single-precision floating-point value 2438 | `a' to the double-precision floating-point format. The conversion is 2439 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2440 | Arithmetic. 
2441 *----------------------------------------------------------------------------*/ 2442 2443 float128 float32_to_float128(float32 a, float_status *status) 2444 { 2445 flag aSign; 2446 int aExp; 2447 uint32_t aSig; 2448 2449 a = float32_squash_input_denormal(a, status); 2450 aSig = extractFloat32Frac( a ); 2451 aExp = extractFloat32Exp( a ); 2452 aSign = extractFloat32Sign( a ); 2453 if ( aExp == 0xFF ) { 2454 if (aSig) { 2455 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 2456 } 2457 return packFloat128( aSign, 0x7FFF, 0, 0 ); 2458 } 2459 if ( aExp == 0 ) { 2460 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 2461 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2462 --aExp; 2463 } 2464 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 2465 2466 } 2467 2468 /*---------------------------------------------------------------------------- 2469 | Rounds the single-precision floating-point value `a' to an integer, and 2470 | returns the result as a single-precision floating-point value. The 2471 | operation is performed according to the IEC/IEEE Standard for Binary 2472 | Floating-Point Arithmetic. 
2473 *----------------------------------------------------------------------------*/ 2474 2475 float32 float32_round_to_int(float32 a, float_status *status) 2476 { 2477 flag aSign; 2478 int aExp; 2479 uint32_t lastBitMask, roundBitsMask; 2480 uint32_t z; 2481 a = float32_squash_input_denormal(a, status); 2482 2483 aExp = extractFloat32Exp( a ); 2484 if ( 0x96 <= aExp ) { 2485 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 2486 return propagateFloat32NaN(a, a, status); 2487 } 2488 return a; 2489 } 2490 if ( aExp <= 0x7E ) { 2491 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a; 2492 status->float_exception_flags |= float_flag_inexact; 2493 aSign = extractFloat32Sign( a ); 2494 switch (status->float_rounding_mode) { 2495 case float_round_nearest_even: 2496 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 2497 return packFloat32( aSign, 0x7F, 0 ); 2498 } 2499 break; 2500 case float_round_ties_away: 2501 if (aExp == 0x7E) { 2502 return packFloat32(aSign, 0x7F, 0); 2503 } 2504 break; 2505 case float_round_down: 2506 return make_float32(aSign ? 0xBF800000 : 0); 2507 case float_round_up: 2508 return make_float32(aSign ? 
0x80000000 : 0x3F800000); 2509 } 2510 return packFloat32( aSign, 0, 0 ); 2511 } 2512 lastBitMask = 1; 2513 lastBitMask <<= 0x96 - aExp; 2514 roundBitsMask = lastBitMask - 1; 2515 z = float32_val(a); 2516 switch (status->float_rounding_mode) { 2517 case float_round_nearest_even: 2518 z += lastBitMask>>1; 2519 if ((z & roundBitsMask) == 0) { 2520 z &= ~lastBitMask; 2521 } 2522 break; 2523 case float_round_ties_away: 2524 z += lastBitMask >> 1; 2525 break; 2526 case float_round_to_zero: 2527 break; 2528 case float_round_up: 2529 if (!extractFloat32Sign(make_float32(z))) { 2530 z += roundBitsMask; 2531 } 2532 break; 2533 case float_round_down: 2534 if (extractFloat32Sign(make_float32(z))) { 2535 z += roundBitsMask; 2536 } 2537 break; 2538 default: 2539 abort(); 2540 } 2541 z &= ~ roundBitsMask; 2542 if (z != float32_val(a)) { 2543 status->float_exception_flags |= float_flag_inexact; 2544 } 2545 return make_float32(z); 2546 2547 } 2548 2549 /*---------------------------------------------------------------------------- 2550 | Returns the result of multiplying the single-precision floating-point values 2551 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2552 | for Binary Floating-Point Arithmetic. 
2553 *----------------------------------------------------------------------------*/ 2554 2555 float32 float32_mul(float32 a, float32 b, float_status *status) 2556 { 2557 flag aSign, bSign, zSign; 2558 int aExp, bExp, zExp; 2559 uint32_t aSig, bSig; 2560 uint64_t zSig64; 2561 uint32_t zSig; 2562 2563 a = float32_squash_input_denormal(a, status); 2564 b = float32_squash_input_denormal(b, status); 2565 2566 aSig = extractFloat32Frac( a ); 2567 aExp = extractFloat32Exp( a ); 2568 aSign = extractFloat32Sign( a ); 2569 bSig = extractFloat32Frac( b ); 2570 bExp = extractFloat32Exp( b ); 2571 bSign = extractFloat32Sign( b ); 2572 zSign = aSign ^ bSign; 2573 if ( aExp == 0xFF ) { 2574 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 2575 return propagateFloat32NaN(a, b, status); 2576 } 2577 if ( ( bExp | bSig ) == 0 ) { 2578 float_raise(float_flag_invalid, status); 2579 return float32_default_nan(status); 2580 } 2581 return packFloat32( zSign, 0xFF, 0 ); 2582 } 2583 if ( bExp == 0xFF ) { 2584 if (bSig) { 2585 return propagateFloat32NaN(a, b, status); 2586 } 2587 if ( ( aExp | aSig ) == 0 ) { 2588 float_raise(float_flag_invalid, status); 2589 return float32_default_nan(status); 2590 } 2591 return packFloat32( zSign, 0xFF, 0 ); 2592 } 2593 if ( aExp == 0 ) { 2594 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 2595 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2596 } 2597 if ( bExp == 0 ) { 2598 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 2599 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2600 } 2601 zExp = aExp + bExp - 0x7F; 2602 aSig = ( aSig | 0x00800000 )<<7; 2603 bSig = ( bSig | 0x00800000 )<<8; 2604 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 ); 2605 zSig = zSig64; 2606 if ( 0 <= (int32_t) ( zSig<<1 ) ) { 2607 zSig <<= 1; 2608 --zExp; 2609 } 2610 return roundAndPackFloat32(zSign, zExp, zSig, status); 2611 2612 } 2613 2614 /*---------------------------------------------------------------------------- 2615 | Returns the result of dividing 
the single-precision floating-point value `a' 2616 | by the corresponding value `b'. The operation is performed according to the 2617 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2618 *----------------------------------------------------------------------------*/ 2619 2620 float32 float32_div(float32 a, float32 b, float_status *status) 2621 { 2622 flag aSign, bSign, zSign; 2623 int aExp, bExp, zExp; 2624 uint32_t aSig, bSig, zSig; 2625 a = float32_squash_input_denormal(a, status); 2626 b = float32_squash_input_denormal(b, status); 2627 2628 aSig = extractFloat32Frac( a ); 2629 aExp = extractFloat32Exp( a ); 2630 aSign = extractFloat32Sign( a ); 2631 bSig = extractFloat32Frac( b ); 2632 bExp = extractFloat32Exp( b ); 2633 bSign = extractFloat32Sign( b ); 2634 zSign = aSign ^ bSign; 2635 if ( aExp == 0xFF ) { 2636 if (aSig) { 2637 return propagateFloat32NaN(a, b, status); 2638 } 2639 if ( bExp == 0xFF ) { 2640 if (bSig) { 2641 return propagateFloat32NaN(a, b, status); 2642 } 2643 float_raise(float_flag_invalid, status); 2644 return float32_default_nan(status); 2645 } 2646 return packFloat32( zSign, 0xFF, 0 ); 2647 } 2648 if ( bExp == 0xFF ) { 2649 if (bSig) { 2650 return propagateFloat32NaN(a, b, status); 2651 } 2652 return packFloat32( zSign, 0, 0 ); 2653 } 2654 if ( bExp == 0 ) { 2655 if ( bSig == 0 ) { 2656 if ( ( aExp | aSig ) == 0 ) { 2657 float_raise(float_flag_invalid, status); 2658 return float32_default_nan(status); 2659 } 2660 float_raise(float_flag_divbyzero, status); 2661 return packFloat32( zSign, 0xFF, 0 ); 2662 } 2663 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2664 } 2665 if ( aExp == 0 ) { 2666 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 2667 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2668 } 2669 zExp = aExp - bExp + 0x7D; 2670 aSig = ( aSig | 0x00800000 )<<7; 2671 bSig = ( bSig | 0x00800000 )<<8; 2672 if ( bSig <= ( aSig + aSig ) ) { 2673 aSig >>= 1; 2674 ++zExp; 2675 } 2676 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig; 
2677 if ( ( zSig & 0x3F ) == 0 ) { 2678 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 ); 2679 } 2680 return roundAndPackFloat32(zSign, zExp, zSig, status); 2681 2682 } 2683 2684 /*---------------------------------------------------------------------------- 2685 | Returns the remainder of the single-precision floating-point value `a' 2686 | with respect to the corresponding value `b'. The operation is performed 2687 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2688 *----------------------------------------------------------------------------*/ 2689 2690 float32 float32_rem(float32 a, float32 b, float_status *status) 2691 { 2692 flag aSign, zSign; 2693 int aExp, bExp, expDiff; 2694 uint32_t aSig, bSig; 2695 uint32_t q; 2696 uint64_t aSig64, bSig64, q64; 2697 uint32_t alternateASig; 2698 int32_t sigMean; 2699 a = float32_squash_input_denormal(a, status); 2700 b = float32_squash_input_denormal(b, status); 2701 2702 aSig = extractFloat32Frac( a ); 2703 aExp = extractFloat32Exp( a ); 2704 aSign = extractFloat32Sign( a ); 2705 bSig = extractFloat32Frac( b ); 2706 bExp = extractFloat32Exp( b ); 2707 if ( aExp == 0xFF ) { 2708 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 2709 return propagateFloat32NaN(a, b, status); 2710 } 2711 float_raise(float_flag_invalid, status); 2712 return float32_default_nan(status); 2713 } 2714 if ( bExp == 0xFF ) { 2715 if (bSig) { 2716 return propagateFloat32NaN(a, b, status); 2717 } 2718 return a; 2719 } 2720 if ( bExp == 0 ) { 2721 if ( bSig == 0 ) { 2722 float_raise(float_flag_invalid, status); 2723 return float32_default_nan(status); 2724 } 2725 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2726 } 2727 if ( aExp == 0 ) { 2728 if ( aSig == 0 ) return a; 2729 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2730 } 2731 expDiff = aExp - bExp; 2732 aSig |= 0x00800000; 2733 bSig |= 0x00800000; 2734 if ( expDiff < 32 ) { 2735 aSig <<= 8; 2736 bSig <<= 8; 2737 if ( expDiff < 0 ) { 2738 if ( expDiff < 
-1 ) return a; 2739 aSig >>= 1; 2740 } 2741 q = ( bSig <= aSig ); 2742 if ( q ) aSig -= bSig; 2743 if ( 0 < expDiff ) { 2744 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 2745 q >>= 32 - expDiff; 2746 bSig >>= 2; 2747 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2748 } 2749 else { 2750 aSig >>= 2; 2751 bSig >>= 2; 2752 } 2753 } 2754 else { 2755 if ( bSig <= aSig ) aSig -= bSig; 2756 aSig64 = ( (uint64_t) aSig )<<40; 2757 bSig64 = ( (uint64_t) bSig )<<40; 2758 expDiff -= 64; 2759 while ( 0 < expDiff ) { 2760 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2761 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2762 aSig64 = - ( ( bSig * q64 )<<38 ); 2763 expDiff -= 62; 2764 } 2765 expDiff += 64; 2766 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2767 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2768 q = q64>>( 64 - expDiff ); 2769 bSig <<= 6; 2770 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 2771 } 2772 do { 2773 alternateASig = aSig; 2774 ++q; 2775 aSig -= bSig; 2776 } while ( 0 <= (int32_t) aSig ); 2777 sigMean = aSig + alternateASig; 2778 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2779 aSig = alternateASig; 2780 } 2781 zSign = ( (int32_t) aSig < 0 ); 2782 if ( zSign ) aSig = - aSig; 2783 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 2784 } 2785 2786 /*---------------------------------------------------------------------------- 2787 | Returns the result of multiplying the single-precision floating-point values 2788 | `a' and `b' then adding 'c', with no intermediate rounding step after the 2789 | multiplication. The operation is performed according to the IEC/IEEE 2790 | Standard for Binary Floating-Point Arithmetic 754-2008. 2791 | The flags argument allows the caller to select negation of the 2792 | addend, the intermediate product, or the final result. (The difference 2793 | between this and having the caller do a separate negation is that negating 2794 | externally will flip the sign bit on NaNs.) 
*----------------------------------------------------------------------------*/

float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
                       float_status *status)
{
    flag aSign, bSign, cSign, zSign;
    int aExp, bExp, cExp, pExp, zExp, expDiff;
    uint32_t aSig, bSig, cSig;
    flag pInf, pZero, pSign;
    uint64_t pSig64, cSig64, zSig64;
    uint32_t pSig;
    int shiftcount;
    flag signflip, infzero;

    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);
    c = float32_squash_input_denormal(c, status);
    aSig = extractFloat32Frac(a);
    aExp = extractFloat32Exp(a);
    aSign = extractFloat32Sign(a);
    bSig = extractFloat32Frac(b);
    bExp = extractFloat32Exp(b);
    bSign = extractFloat32Sign(b);
    cSig = extractFloat32Frac(c);
    cExp = extractFloat32Exp(c);
    cSign = extractFloat32Sign(c);

    /* True when the product is zero times infinity (in either order). */
    infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
               (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (((aExp == 0xff) && aSig) ||
        ((bExp == 0xff) && bSig) ||
        ((cExp == 0xff) && cSig)) {
        return propagateFloat32MulAddNaN(a, b, c, infzero, status);
    }

    /* No NaN operand, but the product itself is 0 * inf: invalid. */
    if (infzero) {
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }

    if (flags & float_muladd_negate_c) {
        cSign ^= 1;
    }

    /* signflip is XORed into the sign of every result returned below. */
    signflip = (flags & float_muladd_negate_result) ? 1 : 0;

    /* Work out the sign and type of the product */
    pSign = aSign ^ bSign;
    if (flags & float_muladd_negate_product) {
        pSign ^= 1;
    }
    pInf = (aExp == 0xff) || (bExp == 0xff);
    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);

    if (cExp == 0xff) {
        if (pInf && (pSign ^ cSign)) {
            /* addition of opposite-signed infinities => InvalidOperation */
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        /* Otherwise generate an infinity of the same sign */
        return packFloat32(cSign ^ signflip, 0xff, 0);
    }

    if (pInf) {
        return packFloat32(pSign ^ signflip, 0xff, 0);
    }

    if (pZero) {
        if (cExp == 0) {
            if (cSig == 0) {
                /* Adding two exact zeroes */
                if (pSign == cSign) {
                    zSign = pSign;
                } else if (status->float_rounding_mode == float_round_down) {
                    zSign = 1;
                } else {
                    zSign = 0;
                }
                return packFloat32(zSign ^ signflip, 0, 0);
            }
            /* Exact zero plus a denorm */
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(cSign ^ signflip, 0, 0);
            }
        }
        /* Zero plus something non-zero : just return the something */
        if (flags & float_muladd_halve_result) {
            if (cExp == 0) {
                normalizeFloat32Subnormal(cSig, &cExp, &cSig);
            }
            /* Subtract one to halve, and one again because roundAndPackFloat32
             * wants one less than the true exponent.
             */
            cExp -= 2;
            cSig = (cSig | 0x00800000) << 7;
            return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
        }
        return packFloat32(cSign ^ signflip, cExp, cSig);
    }

    /* Both factors are finite and non-zero here; normalize subnormals. */
    if (aExp == 0) {
        normalizeFloat32Subnormal(aSig, &aExp, &aSig);
    }
    if (bExp == 0) {
        normalizeFloat32Subnormal(bSig, &bExp, &bSig);
    }

    /* Calculate the actual result a * b + c */

    /* Multiply first; this is easy. */
    /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
     * because we want the true exponent, not the "one-less-than"
     * flavour that roundAndPackFloat32() takes.
     */
    pExp = aExp + bExp - 0x7e;
    aSig = (aSig | 0x00800000) << 7;
    bSig = (bSig | 0x00800000) << 8;
    pSig64 = (uint64_t)aSig * bSig;
    /* If bit 62 of the product is clear, shift left by one to set it. */
    if ((int64_t)(pSig64 << 1) >= 0) {
        pSig64 <<= 1;
        pExp--;
    }

    zSign = pSign ^ signflip;

    /* Now pSig64 is the significand of the multiply, with the explicit bit in
     * position 62.
     */
    if (cExp == 0) {
        if (!cSig) {
            /* Throw out the special case of c being an exact zero now */
            shift64RightJamming(pSig64, 32, &pSig64);
            pSig = pSig64;
            if (flags & float_muladd_halve_result) {
                pExp--;
            }
            return roundAndPackFloat32(zSign, pExp - 1,
                                       pSig, status);
        }
        normalizeFloat32Subnormal(cSig, &cExp, &cSig);
    }

    /* Widen c so that its explicit bit is also in position 62. */
    cSig64 = (uint64_t)cSig << (62 - 23);
    cSig64 |= LIT64(0x4000000000000000);
    expDiff = pExp - cExp;

    if (pSign == cSign) {
        /* Addition */
        if (expDiff > 0) {
            /* scale c to match p */
            shift64RightJamming(cSig64, expDiff, &cSig64);
            zExp = pExp;
        } else if (expDiff < 0) {
            /* scale p to match c */
            shift64RightJamming(pSig64, -expDiff, &pSig64);
            zExp = cExp;
        } else {
            /* no scaling needed */
            zExp = cExp;
        }
        /* Add significands and make sure explicit bit ends up in posn 62 */
        zSig64 = pSig64 + cSig64;
        if ((int64_t)zSig64 < 0) {
            shift64RightJamming(zSig64, 1, &zSig64);
        } else {
            zExp--;
        }
    } else {
        /* Subtraction */
        if (expDiff > 0) {
            shift64RightJamming(cSig64, expDiff, &cSig64);
            zSig64 = pSig64 - cSig64;
            zExp = pExp;
        } else if (expDiff < 0) {
            /* c has the larger exponent, so the result takes the other sign */
            shift64RightJamming(pSig64, -expDiff, &pSig64);
            zSig64 = cSig64 - pSig64;
            zExp = cExp;
            zSign ^= 1;
        } else {
            zExp = pExp;
            if (cSig64 < pSig64) {
                zSig64 = pSig64 - cSig64;
            } else if (pSig64 < cSig64) {
                zSig64 = cSig64 - pSig64;
                zSign ^= 1;
            } else {
                /* Exact zero */
                zSign = signflip;
                if (status->float_rounding_mode == float_round_down) {
                    zSign ^= 1;
                }
                return packFloat32(zSign, 0, 0);
            }
        }
        --zExp;
        /* Normalize to put the explicit bit back into bit 62. */
        shiftcount = countLeadingZeros64(zSig64) - 1;
        zSig64 <<= shiftcount;
        zExp -= shiftcount;
    }
    if (flags & float_muladd_halve_result) {
        zExp--;
    }

    /* Reduce to a 32-bit significand, keeping lost bits as a sticky bit. */
    shift64RightJamming(zSig64, 32, &zSig64);
    return roundAndPackFloat32(zSign, zExp, zSig64, status);
}


/*----------------------------------------------------------------------------
| Returns the square root of the single-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| NaNs are propagated, +infinity and zeros of either sign are returned as
| they are (a positive zero as float32_zero), and any other negative operand
| raises the invalid exception and yields the default NaN.
*----------------------------------------------------------------------------*/

float32 float32_sqrt(float32 a, float_status *status)
{
    flag aSign;
    int aExp, zExp;
    uint32_t aSig, zSig;
    uint64_t rem, term;
    a = float32_squash_input_denormal(a, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        if ( ! aSign ) return a;
        /* square root of -infinity */
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( aSign ) {
        /* -0 is returned unchanged; any other negative value is invalid */
        if ( ( aExp | aSig ) == 0 ) return a;
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return float32_zero;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    /* Halve the unbiased exponent and re-bias it; 0x7E rather than 0x7F is
     * used as the bias here (NOTE(review): presumably because
     * roundAndPackFloat32() takes one less than the true exponent, as other
     * comments in this file state -- confirm).
     */
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
    aSig = ( aSig | 0x00800000 )<<8;
    zSig = estimateSqrt32( aExp, aSig ) + 2;
    /* Only when the low 7 bits of the estimate are small is the estimate
     * refined against the exact remainder below.
     */
    if ( ( zSig & 0x7F ) <= 5 ) {
        if ( zSig < 2 ) {
            /* the "+ 2" above wrapped around: saturate the significand */
            zSig = 0x7FFFFFFF;
            goto roundAndPack;
        }
        aSig >>= aExp & 1;
        term = ( (uint64_t) zSig ) * zSig;
        rem = ( ( (uint64_t) aSig )<<32 ) - term;
        /* Step zSig down until zSig * zSig no longer exceeds the operand. */
        while ( (int64_t) rem < 0 ) {
            --zSig;
            rem += ( ( (uint64_t) zSig )<<1 ) | 1;
        }
        /* A non-zero remainder is recorded in the low (sticky) bit. */
        zSig |= ( rem != 0 );
    }
    shift32RightJamming( zSig, 1, &zSig );
 roundAndPack:
    return roundAndPackFloat32(0, zExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'. The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1. -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2. -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
3085 *----------------------------------------------------------------------------*/ 3086 3087 static const float64 float32_exp2_coefficients[15] = 3088 { 3089 const_float64( 0x3ff0000000000000ll ), /* 1 */ 3090 const_float64( 0x3fe0000000000000ll ), /* 2 */ 3091 const_float64( 0x3fc5555555555555ll ), /* 3 */ 3092 const_float64( 0x3fa5555555555555ll ), /* 4 */ 3093 const_float64( 0x3f81111111111111ll ), /* 5 */ 3094 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 3095 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 3096 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 3097 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 3098 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 3099 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 3100 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 3101 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 3102 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 3103 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 3104 }; 3105 3106 float32 float32_exp2(float32 a, float_status *status) 3107 { 3108 flag aSign; 3109 int aExp; 3110 uint32_t aSig; 3111 float64 r, x, xn; 3112 int i; 3113 a = float32_squash_input_denormal(a, status); 3114 3115 aSig = extractFloat32Frac( a ); 3116 aExp = extractFloat32Exp( a ); 3117 aSign = extractFloat32Sign( a ); 3118 3119 if ( aExp == 0xFF) { 3120 if (aSig) { 3121 return propagateFloat32NaN(a, float32_zero, status); 3122 } 3123 return (aSign) ? 
float32_zero : a; 3124 } 3125 if (aExp == 0) { 3126 if (aSig == 0) return float32_one; 3127 } 3128 3129 float_raise(float_flag_inexact, status); 3130 3131 /* ******************************* */ 3132 /* using float64 for approximation */ 3133 /* ******************************* */ 3134 x = float32_to_float64(a, status); 3135 x = float64_mul(x, float64_ln2, status); 3136 3137 xn = x; 3138 r = float64_one; 3139 for (i = 0 ; i < 15 ; i++) { 3140 float64 f; 3141 3142 f = float64_mul(xn, float32_exp2_coefficients[i], status); 3143 r = float64_add(r, f, status); 3144 3145 xn = float64_mul(xn, x, status); 3146 } 3147 3148 return float64_to_float32(r, status); 3149 } 3150 3151 /*---------------------------------------------------------------------------- 3152 | Returns the binary log of the single-precision floating-point value `a'. 3153 | The operation is performed according to the IEC/IEEE Standard for Binary 3154 | Floating-Point Arithmetic. 3155 *----------------------------------------------------------------------------*/ 3156 float32 float32_log2(float32 a, float_status *status) 3157 { 3158 flag aSign, zSign; 3159 int aExp; 3160 uint32_t aSig, zSig, i; 3161 3162 a = float32_squash_input_denormal(a, status); 3163 aSig = extractFloat32Frac( a ); 3164 aExp = extractFloat32Exp( a ); 3165 aSign = extractFloat32Sign( a ); 3166 3167 if ( aExp == 0 ) { 3168 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 3169 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3170 } 3171 if ( aSign ) { 3172 float_raise(float_flag_invalid, status); 3173 return float32_default_nan(status); 3174 } 3175 if ( aExp == 0xFF ) { 3176 if (aSig) { 3177 return propagateFloat32NaN(a, float32_zero, status); 3178 } 3179 return a; 3180 } 3181 3182 aExp -= 0x7F; 3183 aSig |= 0x00800000; 3184 zSign = aExp < 0; 3185 zSig = aExp << 23; 3186 3187 for (i = 1 << 22; i > 0; i >>= 1) { 3188 aSig = ( (uint64_t)aSig * aSig ) >> 23; 3189 if ( aSig & 0x01000000 ) { 3190 aSig >>= 1; 3191 zSig |= i; 3192 } 3193 } 3194 
3195 if ( zSign ) 3196 zSig = -zSig; 3197 3198 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 3199 } 3200 3201 /*---------------------------------------------------------------------------- 3202 | Returns 1 if the single-precision floating-point value `a' is equal to 3203 | the corresponding value `b', and 0 otherwise. The invalid exception is 3204 | raised if either operand is a NaN. Otherwise, the comparison is performed 3205 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3206 *----------------------------------------------------------------------------*/ 3207 3208 int float32_eq(float32 a, float32 b, float_status *status) 3209 { 3210 uint32_t av, bv; 3211 a = float32_squash_input_denormal(a, status); 3212 b = float32_squash_input_denormal(b, status); 3213 3214 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3215 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3216 ) { 3217 float_raise(float_flag_invalid, status); 3218 return 0; 3219 } 3220 av = float32_val(a); 3221 bv = float32_val(b); 3222 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3223 } 3224 3225 /*---------------------------------------------------------------------------- 3226 | Returns 1 if the single-precision floating-point value `a' is less than 3227 | or equal to the corresponding value `b', and 0 otherwise. The invalid 3228 | exception is raised if either operand is a NaN. The comparison is performed 3229 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3230 *----------------------------------------------------------------------------*/ 3231 3232 int float32_le(float32 a, float32 b, float_status *status) 3233 { 3234 flag aSign, bSign; 3235 uint32_t av, bv; 3236 a = float32_squash_input_denormal(a, status); 3237 b = float32_squash_input_denormal(b, status); 3238 3239 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3240 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3241 ) { 3242 float_raise(float_flag_invalid, status); 3243 return 0; 3244 } 3245 aSign = extractFloat32Sign( a ); 3246 bSign = extractFloat32Sign( b ); 3247 av = float32_val(a); 3248 bv = float32_val(b); 3249 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3250 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3251 3252 } 3253 3254 /*---------------------------------------------------------------------------- 3255 | Returns 1 if the single-precision floating-point value `a' is less than 3256 | the corresponding value `b', and 0 otherwise. The invalid exception is 3257 | raised if either operand is a NaN. The comparison is performed according 3258 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3259 *----------------------------------------------------------------------------*/ 3260 3261 int float32_lt(float32 a, float32 b, float_status *status) 3262 { 3263 flag aSign, bSign; 3264 uint32_t av, bv; 3265 a = float32_squash_input_denormal(a, status); 3266 b = float32_squash_input_denormal(b, status); 3267 3268 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3269 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3270 ) { 3271 float_raise(float_flag_invalid, status); 3272 return 0; 3273 } 3274 aSign = extractFloat32Sign( a ); 3275 bSign = extractFloat32Sign( b ); 3276 av = float32_val(a); 3277 bv = float32_val(b); 3278 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3279 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3280 3281 } 3282 3283 /*---------------------------------------------------------------------------- 3284 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3285 | be compared, and 0 otherwise. The invalid exception is raised if either 3286 | operand is a NaN. The comparison is performed according to the IEC/IEEE 3287 | Standard for Binary Floating-Point Arithmetic. 3288 *----------------------------------------------------------------------------*/ 3289 3290 int float32_unordered(float32 a, float32 b, float_status *status) 3291 { 3292 a = float32_squash_input_denormal(a, status); 3293 b = float32_squash_input_denormal(b, status); 3294 3295 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3296 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3297 ) { 3298 float_raise(float_flag_invalid, status); 3299 return 1; 3300 } 3301 return 0; 3302 } 3303 3304 /*---------------------------------------------------------------------------- 3305 | Returns 1 if the single-precision floating-point value `a' is equal to 3306 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3307 | exception. 
The comparison is performed according to the IEC/IEEE Standard 3308 | for Binary Floating-Point Arithmetic. 3309 *----------------------------------------------------------------------------*/ 3310 3311 int float32_eq_quiet(float32 a, float32 b, float_status *status) 3312 { 3313 a = float32_squash_input_denormal(a, status); 3314 b = float32_squash_input_denormal(b, status); 3315 3316 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3317 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3318 ) { 3319 if (float32_is_signaling_nan(a, status) 3320 || float32_is_signaling_nan(b, status)) { 3321 float_raise(float_flag_invalid, status); 3322 } 3323 return 0; 3324 } 3325 return ( float32_val(a) == float32_val(b) ) || 3326 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 3327 } 3328 3329 /*---------------------------------------------------------------------------- 3330 | Returns 1 if the single-precision floating-point value `a' is less than or 3331 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3332 | cause an exception. Otherwise, the comparison is performed according to the 3333 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3334 *----------------------------------------------------------------------------*/ 3335 3336 int float32_le_quiet(float32 a, float32 b, float_status *status) 3337 { 3338 flag aSign, bSign; 3339 uint32_t av, bv; 3340 a = float32_squash_input_denormal(a, status); 3341 b = float32_squash_input_denormal(b, status); 3342 3343 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3344 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3345 ) { 3346 if (float32_is_signaling_nan(a, status) 3347 || float32_is_signaling_nan(b, status)) { 3348 float_raise(float_flag_invalid, status); 3349 } 3350 return 0; 3351 } 3352 aSign = extractFloat32Sign( a ); 3353 bSign = extractFloat32Sign( b ); 3354 av = float32_val(a); 3355 bv = float32_val(b); 3356 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3357 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3358 3359 } 3360 3361 /*---------------------------------------------------------------------------- 3362 | Returns 1 if the single-precision floating-point value `a' is less than 3363 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3364 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 3365 | Standard for Binary Floating-Point Arithmetic. 
3366 *----------------------------------------------------------------------------*/ 3367 3368 int float32_lt_quiet(float32 a, float32 b, float_status *status) 3369 { 3370 flag aSign, bSign; 3371 uint32_t av, bv; 3372 a = float32_squash_input_denormal(a, status); 3373 b = float32_squash_input_denormal(b, status); 3374 3375 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3376 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3377 ) { 3378 if (float32_is_signaling_nan(a, status) 3379 || float32_is_signaling_nan(b, status)) { 3380 float_raise(float_flag_invalid, status); 3381 } 3382 return 0; 3383 } 3384 aSign = extractFloat32Sign( a ); 3385 bSign = extractFloat32Sign( b ); 3386 av = float32_val(a); 3387 bv = float32_val(b); 3388 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3389 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3390 3391 } 3392 3393 /*---------------------------------------------------------------------------- 3394 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3395 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 3396 | comparison is performed according to the IEC/IEEE Standard for Binary 3397 | Floating-Point Arithmetic. 
3398 *----------------------------------------------------------------------------*/ 3399 3400 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 3401 { 3402 a = float32_squash_input_denormal(a, status); 3403 b = float32_squash_input_denormal(b, status); 3404 3405 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3406 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3407 ) { 3408 if (float32_is_signaling_nan(a, status) 3409 || float32_is_signaling_nan(b, status)) { 3410 float_raise(float_flag_invalid, status); 3411 } 3412 return 1; 3413 } 3414 return 0; 3415 } 3416 3417 /*---------------------------------------------------------------------------- 3418 | Returns the result of converting the double-precision floating-point value 3419 | `a' to the 32-bit two's complement integer format. The conversion is 3420 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3421 | Arithmetic---which means in particular that the conversion is rounded 3422 | according to the current rounding mode. If `a' is a NaN, the largest 3423 | positive integer is returned. Otherwise, if the conversion overflows, the 3424 | largest integer with the same sign as `a' is returned. 
3425 *----------------------------------------------------------------------------*/ 3426 3427 int32_t float64_to_int32(float64 a, float_status *status) 3428 { 3429 flag aSign; 3430 int aExp; 3431 int shiftCount; 3432 uint64_t aSig; 3433 a = float64_squash_input_denormal(a, status); 3434 3435 aSig = extractFloat64Frac( a ); 3436 aExp = extractFloat64Exp( a ); 3437 aSign = extractFloat64Sign( a ); 3438 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 3439 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3440 shiftCount = 0x42C - aExp; 3441 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 3442 return roundAndPackInt32(aSign, aSig, status); 3443 3444 } 3445 3446 /*---------------------------------------------------------------------------- 3447 | Returns the result of converting the double-precision floating-point value 3448 | `a' to the 32-bit two's complement integer format. The conversion is 3449 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3450 | Arithmetic, except that the conversion is always rounded toward zero. 3451 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3452 | the conversion overflows, the largest integer with the same sign as `a' is 3453 | returned. 
3454 *----------------------------------------------------------------------------*/ 3455 3456 int32_t float64_to_int32_round_to_zero(float64 a, float_status *status) 3457 { 3458 flag aSign; 3459 int aExp; 3460 int shiftCount; 3461 uint64_t aSig, savedASig; 3462 int32_t z; 3463 a = float64_squash_input_denormal(a, status); 3464 3465 aSig = extractFloat64Frac( a ); 3466 aExp = extractFloat64Exp( a ); 3467 aSign = extractFloat64Sign( a ); 3468 if ( 0x41E < aExp ) { 3469 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 3470 goto invalid; 3471 } 3472 else if ( aExp < 0x3FF ) { 3473 if (aExp || aSig) { 3474 status->float_exception_flags |= float_flag_inexact; 3475 } 3476 return 0; 3477 } 3478 aSig |= LIT64( 0x0010000000000000 ); 3479 shiftCount = 0x433 - aExp; 3480 savedASig = aSig; 3481 aSig >>= shiftCount; 3482 z = aSig; 3483 if ( aSign ) z = - z; 3484 if ( ( z < 0 ) ^ aSign ) { 3485 invalid: 3486 float_raise(float_flag_invalid, status); 3487 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 3488 } 3489 if ( ( aSig<<shiftCount ) != savedASig ) { 3490 status->float_exception_flags |= float_flag_inexact; 3491 } 3492 return z; 3493 3494 } 3495 3496 /*---------------------------------------------------------------------------- 3497 | Returns the result of converting the double-precision floating-point value 3498 | `a' to the 16-bit two's complement integer format. The conversion is 3499 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3500 | Arithmetic, except that the conversion is always rounded toward zero. 3501 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3502 | the conversion overflows, the largest integer with the same sign as `a' is 3503 | returned. 
3504 *----------------------------------------------------------------------------*/ 3505 3506 int16_t float64_to_int16_round_to_zero(float64 a, float_status *status) 3507 { 3508 flag aSign; 3509 int aExp; 3510 int shiftCount; 3511 uint64_t aSig, savedASig; 3512 int32_t z; 3513 3514 aSig = extractFloat64Frac( a ); 3515 aExp = extractFloat64Exp( a ); 3516 aSign = extractFloat64Sign( a ); 3517 if ( 0x40E < aExp ) { 3518 if ( ( aExp == 0x7FF ) && aSig ) { 3519 aSign = 0; 3520 } 3521 goto invalid; 3522 } 3523 else if ( aExp < 0x3FF ) { 3524 if ( aExp || aSig ) { 3525 status->float_exception_flags |= float_flag_inexact; 3526 } 3527 return 0; 3528 } 3529 aSig |= LIT64( 0x0010000000000000 ); 3530 shiftCount = 0x433 - aExp; 3531 savedASig = aSig; 3532 aSig >>= shiftCount; 3533 z = aSig; 3534 if ( aSign ) { 3535 z = - z; 3536 } 3537 if ( ( (int16_t)z < 0 ) ^ aSign ) { 3538 invalid: 3539 float_raise(float_flag_invalid, status); 3540 return aSign ? (int32_t) 0xffff8000 : 0x7FFF; 3541 } 3542 if ( ( aSig<<shiftCount ) != savedASig ) { 3543 status->float_exception_flags |= float_flag_inexact; 3544 } 3545 return z; 3546 } 3547 3548 /*---------------------------------------------------------------------------- 3549 | Returns the result of converting the double-precision floating-point value 3550 | `a' to the 64-bit two's complement integer format. The conversion is 3551 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3552 | Arithmetic---which means in particular that the conversion is rounded 3553 | according to the current rounding mode. If `a' is a NaN, the largest 3554 | positive integer is returned. Otherwise, if the conversion overflows, the 3555 | largest integer with the same sign as `a' is returned. 
3556 *----------------------------------------------------------------------------*/ 3557 3558 int64_t float64_to_int64(float64 a, float_status *status) 3559 { 3560 flag aSign; 3561 int aExp; 3562 int shiftCount; 3563 uint64_t aSig, aSigExtra; 3564 a = float64_squash_input_denormal(a, status); 3565 3566 aSig = extractFloat64Frac( a ); 3567 aExp = extractFloat64Exp( a ); 3568 aSign = extractFloat64Sign( a ); 3569 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3570 shiftCount = 0x433 - aExp; 3571 if ( shiftCount <= 0 ) { 3572 if ( 0x43E < aExp ) { 3573 float_raise(float_flag_invalid, status); 3574 if ( ! aSign 3575 || ( ( aExp == 0x7FF ) 3576 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 3577 ) { 3578 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3579 } 3580 return (int64_t) LIT64( 0x8000000000000000 ); 3581 } 3582 aSigExtra = 0; 3583 aSig <<= - shiftCount; 3584 } 3585 else { 3586 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3587 } 3588 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 3589 3590 } 3591 3592 /*---------------------------------------------------------------------------- 3593 | Returns the result of converting the double-precision floating-point value 3594 | `a' to the 64-bit two's complement integer format. The conversion is 3595 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3596 | Arithmetic, except that the conversion is always rounded toward zero. 3597 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3598 | the conversion overflows, the largest integer with the same sign as `a' is 3599 | returned. 
3600 *----------------------------------------------------------------------------*/ 3601 3602 int64_t float64_to_int64_round_to_zero(float64 a, float_status *status) 3603 { 3604 flag aSign; 3605 int aExp; 3606 int shiftCount; 3607 uint64_t aSig; 3608 int64_t z; 3609 a = float64_squash_input_denormal(a, status); 3610 3611 aSig = extractFloat64Frac( a ); 3612 aExp = extractFloat64Exp( a ); 3613 aSign = extractFloat64Sign( a ); 3614 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3615 shiftCount = aExp - 0x433; 3616 if ( 0 <= shiftCount ) { 3617 if ( 0x43E <= aExp ) { 3618 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) { 3619 float_raise(float_flag_invalid, status); 3620 if ( ! aSign 3621 || ( ( aExp == 0x7FF ) 3622 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 3623 ) { 3624 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3625 } 3626 } 3627 return (int64_t) LIT64( 0x8000000000000000 ); 3628 } 3629 z = aSig<<shiftCount; 3630 } 3631 else { 3632 if ( aExp < 0x3FE ) { 3633 if (aExp | aSig) { 3634 status->float_exception_flags |= float_flag_inexact; 3635 } 3636 return 0; 3637 } 3638 z = aSig>>( - shiftCount ); 3639 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 3640 status->float_exception_flags |= float_flag_inexact; 3641 } 3642 } 3643 if ( aSign ) z = - z; 3644 return z; 3645 3646 } 3647 3648 /*---------------------------------------------------------------------------- 3649 | Returns the result of converting the double-precision floating-point value 3650 | `a' to the single-precision floating-point format. The conversion is 3651 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3652 | Arithmetic. 
3653 *----------------------------------------------------------------------------*/ 3654 3655 float32 float64_to_float32(float64 a, float_status *status) 3656 { 3657 flag aSign; 3658 int aExp; 3659 uint64_t aSig; 3660 uint32_t zSig; 3661 a = float64_squash_input_denormal(a, status); 3662 3663 aSig = extractFloat64Frac( a ); 3664 aExp = extractFloat64Exp( a ); 3665 aSign = extractFloat64Sign( a ); 3666 if ( aExp == 0x7FF ) { 3667 if (aSig) { 3668 return commonNaNToFloat32(float64ToCommonNaN(a, status), status); 3669 } 3670 return packFloat32( aSign, 0xFF, 0 ); 3671 } 3672 shift64RightJamming( aSig, 22, &aSig ); 3673 zSig = aSig; 3674 if ( aExp || zSig ) { 3675 zSig |= 0x40000000; 3676 aExp -= 0x381; 3677 } 3678 return roundAndPackFloat32(aSign, aExp, zSig, status); 3679 3680 } 3681 3682 3683 /*---------------------------------------------------------------------------- 3684 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3685 | half-precision floating-point value, returning the result. After being 3686 | shifted into the proper positions, the three fields are simply added 3687 | together to form the result. This means that any integer portion of `zSig' 3688 | will be added into the exponent. Since a properly normalized significand 3689 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3690 | than the desired result exponent whenever `zSig' is a complete, normalized 3691 | significand. 
3692 *----------------------------------------------------------------------------*/ 3693 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig) 3694 { 3695 return make_float16( 3696 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig); 3697 } 3698 3699 /*---------------------------------------------------------------------------- 3700 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3701 | and significand `zSig', and returns the proper half-precision floating- 3702 | point value corresponding to the abstract input. Ordinarily, the abstract 3703 | value is simply rounded and packed into the half-precision format, with 3704 | the inexact exception raised if the abstract input cannot be represented 3705 | exactly. However, if the abstract value is too large, the overflow and 3706 | inexact exceptions are raised and an infinity or maximal finite value is 3707 | returned. If the abstract value is too small, the input value is rounded to 3708 | a subnormal number, and the underflow and inexact exceptions are raised if 3709 | the abstract input cannot be represented exactly as a subnormal half- 3710 | precision floating-point number. 3711 | The `ieee' flag indicates whether to use IEEE standard half precision, or 3712 | ARM-style "alternative representation", which omits the NaN and Inf 3713 | encodings in order to raise the maximum representable exponent by one. 3714 | The input significand `zSig' has its binary point between bits 22 3715 | and 23, which is 13 bits to the left of the usual location. This shifted 3716 | significand must be normalized or smaller. If `zSig' is not normalized, 3717 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3718 | and it must not require rounding. In the usual case that `zSig' is 3719 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 
3720 | Note the slightly odd position of the binary point in zSig compared with the 3721 | other roundAndPackFloat functions. This should probably be fixed if we 3722 | need to implement more float16 routines than just conversion. 3723 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3724 | Binary Floating-Point Arithmetic. 3725 *----------------------------------------------------------------------------*/ 3726 3727 static float16 roundAndPackFloat16(flag zSign, int zExp, 3728 uint32_t zSig, flag ieee, 3729 float_status *status) 3730 { 3731 int maxexp = ieee ? 29 : 30; 3732 uint32_t mask; 3733 uint32_t increment; 3734 bool rounding_bumps_exp; 3735 bool is_tiny = false; 3736 3737 /* Calculate the mask of bits of the mantissa which are not 3738 * representable in half-precision and will be lost. 3739 */ 3740 if (zExp < 1) { 3741 /* Will be denormal in halfprec */ 3742 mask = 0x00ffffff; 3743 if (zExp >= -11) { 3744 mask >>= 11 + zExp; 3745 } 3746 } else { 3747 /* Normal number in halfprec */ 3748 mask = 0x00001fff; 3749 } 3750 3751 switch (status->float_rounding_mode) { 3752 case float_round_nearest_even: 3753 increment = (mask + 1) >> 1; 3754 if ((zSig & mask) == increment) { 3755 increment = zSig & (increment << 1); 3756 } 3757 break; 3758 case float_round_ties_away: 3759 increment = (mask + 1) >> 1; 3760 break; 3761 case float_round_up: 3762 increment = zSign ? 0 : mask; 3763 break; 3764 case float_round_down: 3765 increment = zSign ? 
mask : 0; 3766 break; 3767 default: /* round_to_zero */ 3768 increment = 0; 3769 break; 3770 } 3771 3772 rounding_bumps_exp = (zSig + increment >= 0x01000000); 3773 3774 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) { 3775 if (ieee) { 3776 float_raise(float_flag_overflow | float_flag_inexact, status); 3777 return packFloat16(zSign, 0x1f, 0); 3778 } else { 3779 float_raise(float_flag_invalid, status); 3780 return packFloat16(zSign, 0x1f, 0x3ff); 3781 } 3782 } 3783 3784 if (zExp < 0) { 3785 /* Note that flush-to-zero does not affect half-precision results */ 3786 is_tiny = 3787 (status->float_detect_tininess == float_tininess_before_rounding) 3788 || (zExp < -1) 3789 || (!rounding_bumps_exp); 3790 } 3791 if (zSig & mask) { 3792 float_raise(float_flag_inexact, status); 3793 if (is_tiny) { 3794 float_raise(float_flag_underflow, status); 3795 } 3796 } 3797 3798 zSig += increment; 3799 if (rounding_bumps_exp) { 3800 zSig >>= 1; 3801 zExp++; 3802 } 3803 3804 if (zExp < -10) { 3805 return packFloat16(zSign, 0, 0); 3806 } 3807 if (zExp < 0) { 3808 zSig >>= -zExp; 3809 zExp = 0; 3810 } 3811 return packFloat16(zSign, zExp, zSig >> 13); 3812 } 3813 3814 /*---------------------------------------------------------------------------- 3815 | If `a' is denormal and we are in flush-to-zero mode then set the 3816 | input-denormal exception and return zero. Otherwise just return the value. 
3817 *----------------------------------------------------------------------------*/ 3818 float16 float16_squash_input_denormal(float16 a, float_status *status) 3819 { 3820 if (status->flush_inputs_to_zero) { 3821 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) { 3822 float_raise(float_flag_input_denormal, status); 3823 return make_float16(float16_val(a) & 0x8000); 3824 } 3825 } 3826 return a; 3827 } 3828 3829 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr, 3830 uint32_t *zSigPtr) 3831 { 3832 int8_t shiftCount = countLeadingZeros32(aSig) - 21; 3833 *zSigPtr = aSig << shiftCount; 3834 *zExpPtr = 1 - shiftCount; 3835 } 3836 3837 /* Half precision floats come in two formats: standard IEEE and "ARM" format. 3838 The latter gains extra exponent range by omitting the NaN/Inf encodings. */ 3839 3840 float32 float16_to_float32(float16 a, flag ieee, float_status *status) 3841 { 3842 flag aSign; 3843 int aExp; 3844 uint32_t aSig; 3845 3846 aSign = extractFloat16Sign(a); 3847 aExp = extractFloat16Exp(a); 3848 aSig = extractFloat16Frac(a); 3849 3850 if (aExp == 0x1f && ieee) { 3851 if (aSig) { 3852 return commonNaNToFloat32(float16ToCommonNaN(a, status), status); 3853 } 3854 return packFloat32(aSign, 0xff, 0); 3855 } 3856 if (aExp == 0) { 3857 if (aSig == 0) { 3858 return packFloat32(aSign, 0, 0); 3859 } 3860 3861 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3862 aExp--; 3863 } 3864 return packFloat32( aSign, aExp + 0x70, aSig << 13); 3865 } 3866 3867 float16 float32_to_float16(float32 a, flag ieee, float_status *status) 3868 { 3869 flag aSign; 3870 int aExp; 3871 uint32_t aSig; 3872 3873 a = float32_squash_input_denormal(a, status); 3874 3875 aSig = extractFloat32Frac( a ); 3876 aExp = extractFloat32Exp( a ); 3877 aSign = extractFloat32Sign( a ); 3878 if ( aExp == 0xFF ) { 3879 if (aSig) { 3880 /* Input is a NaN */ 3881 if (!ieee) { 3882 float_raise(float_flag_invalid, status); 3883 return packFloat16(aSign, 0, 0); 3884 } 3885 return 
commonNaNToFloat16( 3886 float32ToCommonNaN(a, status), status); 3887 } 3888 /* Infinity */ 3889 if (!ieee) { 3890 float_raise(float_flag_invalid, status); 3891 return packFloat16(aSign, 0x1f, 0x3ff); 3892 } 3893 return packFloat16(aSign, 0x1f, 0); 3894 } 3895 if (aExp == 0 && aSig == 0) { 3896 return packFloat16(aSign, 0, 0); 3897 } 3898 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 3899 * even if the input is denormal; however this is harmless because 3900 * the largest possible single-precision denormal is still smaller 3901 * than the smallest representable half-precision denormal, and so we 3902 * will end up ignoring aSig and returning via the "always return zero" 3903 * codepath. 3904 */ 3905 aSig |= 0x00800000; 3906 aExp -= 0x71; 3907 3908 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status); 3909 } 3910 3911 float64 float16_to_float64(float16 a, flag ieee, float_status *status) 3912 { 3913 flag aSign; 3914 int aExp; 3915 uint32_t aSig; 3916 3917 aSign = extractFloat16Sign(a); 3918 aExp = extractFloat16Exp(a); 3919 aSig = extractFloat16Frac(a); 3920 3921 if (aExp == 0x1f && ieee) { 3922 if (aSig) { 3923 return commonNaNToFloat64( 3924 float16ToCommonNaN(a, status), status); 3925 } 3926 return packFloat64(aSign, 0x7ff, 0); 3927 } 3928 if (aExp == 0) { 3929 if (aSig == 0) { 3930 return packFloat64(aSign, 0, 0); 3931 } 3932 3933 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3934 aExp--; 3935 } 3936 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42); 3937 } 3938 3939 float16 float64_to_float16(float64 a, flag ieee, float_status *status) 3940 { 3941 flag aSign; 3942 int aExp; 3943 uint64_t aSig; 3944 uint32_t zSig; 3945 3946 a = float64_squash_input_denormal(a, status); 3947 3948 aSig = extractFloat64Frac(a); 3949 aExp = extractFloat64Exp(a); 3950 aSign = extractFloat64Sign(a); 3951 if (aExp == 0x7FF) { 3952 if (aSig) { 3953 /* Input is a NaN */ 3954 if (!ieee) { 3955 float_raise(float_flag_invalid, status); 3956 return 
packFloat16(aSign, 0, 0); 3957 } 3958 return commonNaNToFloat16( 3959 float64ToCommonNaN(a, status), status); 3960 } 3961 /* Infinity */ 3962 if (!ieee) { 3963 float_raise(float_flag_invalid, status); 3964 return packFloat16(aSign, 0x1f, 0x3ff); 3965 } 3966 return packFloat16(aSign, 0x1f, 0); 3967 } 3968 shift64RightJamming(aSig, 29, &aSig); 3969 zSig = aSig; 3970 if (aExp == 0 && zSig == 0) { 3971 return packFloat16(aSign, 0, 0); 3972 } 3973 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 3974 * even if the input is denormal; however this is harmless because 3975 * the largest possible single-precision denormal is still smaller 3976 * than the smallest representable half-precision denormal, and so we 3977 * will end up ignoring aSig and returning via the "always return zero" 3978 * codepath. 3979 */ 3980 zSig |= 0x00800000; 3981 aExp -= 0x3F1; 3982 3983 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status); 3984 } 3985 3986 /*---------------------------------------------------------------------------- 3987 | Returns the result of converting the double-precision floating-point value 3988 | `a' to the extended double-precision floating-point format. The conversion 3989 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3990 | Arithmetic. 
3991 *----------------------------------------------------------------------------*/ 3992 3993 floatx80 float64_to_floatx80(float64 a, float_status *status) 3994 { 3995 flag aSign; 3996 int aExp; 3997 uint64_t aSig; 3998 3999 a = float64_squash_input_denormal(a, status); 4000 aSig = extractFloat64Frac( a ); 4001 aExp = extractFloat64Exp( a ); 4002 aSign = extractFloat64Sign( a ); 4003 if ( aExp == 0x7FF ) { 4004 if (aSig) { 4005 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 4006 } 4007 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4008 } 4009 if ( aExp == 0 ) { 4010 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4011 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4012 } 4013 return 4014 packFloatx80( 4015 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 4016 4017 } 4018 4019 /*---------------------------------------------------------------------------- 4020 | Returns the result of converting the double-precision floating-point value 4021 | `a' to the quadruple-precision floating-point format. The conversion is 4022 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4023 | Arithmetic. 
4024 *----------------------------------------------------------------------------*/ 4025 4026 float128 float64_to_float128(float64 a, float_status *status) 4027 { 4028 flag aSign; 4029 int aExp; 4030 uint64_t aSig, zSig0, zSig1; 4031 4032 a = float64_squash_input_denormal(a, status); 4033 aSig = extractFloat64Frac( a ); 4034 aExp = extractFloat64Exp( a ); 4035 aSign = extractFloat64Sign( a ); 4036 if ( aExp == 0x7FF ) { 4037 if (aSig) { 4038 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 4039 } 4040 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4041 } 4042 if ( aExp == 0 ) { 4043 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4044 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4045 --aExp; 4046 } 4047 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 4048 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 4049 4050 } 4051 4052 /*---------------------------------------------------------------------------- 4053 | Rounds the double-precision floating-point value `a' to an integer, and 4054 | returns the result as a double-precision floating-point value. The 4055 | operation is performed according to the IEC/IEEE Standard for Binary 4056 | Floating-Point Arithmetic. 
4057 *----------------------------------------------------------------------------*/ 4058 4059 float64 float64_round_to_int(float64 a, float_status *status) 4060 { 4061 flag aSign; 4062 int aExp; 4063 uint64_t lastBitMask, roundBitsMask; 4064 uint64_t z; 4065 a = float64_squash_input_denormal(a, status); 4066 4067 aExp = extractFloat64Exp( a ); 4068 if ( 0x433 <= aExp ) { 4069 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 4070 return propagateFloat64NaN(a, a, status); 4071 } 4072 return a; 4073 } 4074 if ( aExp < 0x3FF ) { 4075 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a; 4076 status->float_exception_flags |= float_flag_inexact; 4077 aSign = extractFloat64Sign( a ); 4078 switch (status->float_rounding_mode) { 4079 case float_round_nearest_even: 4080 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 4081 return packFloat64( aSign, 0x3FF, 0 ); 4082 } 4083 break; 4084 case float_round_ties_away: 4085 if (aExp == 0x3FE) { 4086 return packFloat64(aSign, 0x3ff, 0); 4087 } 4088 break; 4089 case float_round_down: 4090 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0); 4091 case float_round_up: 4092 return make_float64( 4093 aSign ? 
LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 )); 4094 } 4095 return packFloat64( aSign, 0, 0 ); 4096 } 4097 lastBitMask = 1; 4098 lastBitMask <<= 0x433 - aExp; 4099 roundBitsMask = lastBitMask - 1; 4100 z = float64_val(a); 4101 switch (status->float_rounding_mode) { 4102 case float_round_nearest_even: 4103 z += lastBitMask >> 1; 4104 if ((z & roundBitsMask) == 0) { 4105 z &= ~lastBitMask; 4106 } 4107 break; 4108 case float_round_ties_away: 4109 z += lastBitMask >> 1; 4110 break; 4111 case float_round_to_zero: 4112 break; 4113 case float_round_up: 4114 if (!extractFloat64Sign(make_float64(z))) { 4115 z += roundBitsMask; 4116 } 4117 break; 4118 case float_round_down: 4119 if (extractFloat64Sign(make_float64(z))) { 4120 z += roundBitsMask; 4121 } 4122 break; 4123 default: 4124 abort(); 4125 } 4126 z &= ~ roundBitsMask; 4127 if (z != float64_val(a)) { 4128 status->float_exception_flags |= float_flag_inexact; 4129 } 4130 return make_float64(z); 4131 4132 } 4133 4134 float64 float64_trunc_to_int(float64 a, float_status *status) 4135 { 4136 int oldmode; 4137 float64 res; 4138 oldmode = status->float_rounding_mode; 4139 status->float_rounding_mode = float_round_to_zero; 4140 res = float64_round_to_int(a, status); 4141 status->float_rounding_mode = oldmode; 4142 return res; 4143 } 4144 4145 4146 /*---------------------------------------------------------------------------- 4147 | Returns the result of multiplying the double-precision floating-point values 4148 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 4149 | for Binary Floating-Point Arithmetic. 
4150 *----------------------------------------------------------------------------*/ 4151 4152 float64 float64_mul(float64 a, float64 b, float_status *status) 4153 { 4154 flag aSign, bSign, zSign; 4155 int aExp, bExp, zExp; 4156 uint64_t aSig, bSig, zSig0, zSig1; 4157 4158 a = float64_squash_input_denormal(a, status); 4159 b = float64_squash_input_denormal(b, status); 4160 4161 aSig = extractFloat64Frac( a ); 4162 aExp = extractFloat64Exp( a ); 4163 aSign = extractFloat64Sign( a ); 4164 bSig = extractFloat64Frac( b ); 4165 bExp = extractFloat64Exp( b ); 4166 bSign = extractFloat64Sign( b ); 4167 zSign = aSign ^ bSign; 4168 if ( aExp == 0x7FF ) { 4169 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4170 return propagateFloat64NaN(a, b, status); 4171 } 4172 if ( ( bExp | bSig ) == 0 ) { 4173 float_raise(float_flag_invalid, status); 4174 return float64_default_nan(status); 4175 } 4176 return packFloat64( zSign, 0x7FF, 0 ); 4177 } 4178 if ( bExp == 0x7FF ) { 4179 if (bSig) { 4180 return propagateFloat64NaN(a, b, status); 4181 } 4182 if ( ( aExp | aSig ) == 0 ) { 4183 float_raise(float_flag_invalid, status); 4184 return float64_default_nan(status); 4185 } 4186 return packFloat64( zSign, 0x7FF, 0 ); 4187 } 4188 if ( aExp == 0 ) { 4189 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 4190 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4191 } 4192 if ( bExp == 0 ) { 4193 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 4194 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4195 } 4196 zExp = aExp + bExp - 0x3FF; 4197 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 4198 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4199 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 4200 zSig0 |= ( zSig1 != 0 ); 4201 if ( 0 <= (int64_t) ( zSig0<<1 ) ) { 4202 zSig0 <<= 1; 4203 --zExp; 4204 } 4205 return roundAndPackFloat64(zSign, zExp, zSig0, status); 4206 4207 } 4208 4209 /*---------------------------------------------------------------------------- 4210 | Returns the result of 
dividing the double-precision floating-point value `a'
| by the corresponding value `b'.  The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float64 float64_div(float64 a, float64 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig;
    uint64_t rem0, rem1;
    uint64_t term0, term1;
    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    bSign = extractFloat64Sign( b );
    /* The sign of the quotient is the XOR of the operand signs */
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FF ) {
        /* a is a NaN or an infinity */
        if (aSig) {
            return propagateFloat64NaN(a, b, status);
        }
        if ( bExp == 0x7FF ) {
            if (bSig) {
                return propagateFloat64NaN(a, b, status);
            }
            /* infinity / infinity is an invalid operation */
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        /* infinity / finite is an infinity */
        return packFloat64( zSign, 0x7FF, 0 );
    }
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        /* finite / infinity is a signed zero */
        return packFloat64( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            if ( ( aExp | aSig ) == 0 ) {
                /* 0 / 0 is an invalid operation */
                float_raise(float_flag_invalid, status);
                return float64_default_nan(status);
            }
            /* non-zero / 0 raises divide-by-zero and gives an infinity */
            float_raise(float_flag_divbyzero, status);
            return packFloat64( zSign, 0x7FF, 0 );
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FD;
    /* Add the implicit leading one; the divisor is aligned one bit higher
     * than the dividend.
     */
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
    /* NOTE(review): halving the dividend here presumably keeps the 64-bit
     * quotient estimate below from overflowing -- confirm against
     * estimateDiv128To64().
     */
    if ( bSig <= ( aSig + aSig ) ) {
        aSig >>= 1;
        ++zExp;
    }
    zSig = estimateDiv128To64( aSig, 0, bSig );
    /* Only when the low nine bits of the estimate are 0, 1 or 2 is the
     * exact remainder computed: the estimate is stepped down while the
     * remainder is negative, then a sticky bit is set if the low remainder
     * word is non-zero.
     */
    if ( ( zSig & 0x1FF ) <= 2 ) {
        mul64To128( bSig, zSig, &term0, &term1 );
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
        while ( (int64_t) rem0 < 0 ) {
            --zSig;
            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
        }
        zSig |= ( rem1 != 0 );
    }
    return roundAndPackFloat64(zSign, zExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the double-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float64 float64_rem(float64 a, float64 b, float_status *status)
{
    flag aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    if ( aExp == 0x7FF ) {
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        /* remainder of an infinity is an invalid operation */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        /* finite rem infinity returns a unchanged */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* remainder with respect to zero is an invalid operation */
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Both significands get their implicit one and are left-aligned */
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
    if ( expDiff < 0 ) {
        /* a is less than half of b in magnitude: a is the remainder */
        if ( expDiff < -1 ) return a;
        aSig >>= 1;
    }
    /* q tracks quotient bits; only its lowest bit is examined below, to
     * break an exact tie.
     */
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    expDiff -= 64;
    /* Reduce the exponent difference 62 bits per iteration using a
     * deliberately lowered quotient estimate.
     */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /* Handle the remaining (at most 64) quotient bits */
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /* Subtract bSig until the remainder goes negative, remembering the last
     * non-negative value in alternateASig.
     */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    /* Keep whichever of the two candidates is smaller in magnitude; on an
     * exact tie the parity of q decides.
     */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* A negative remainder flips the result sign relative to a */
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of multiplying the double-precision floating-point values
| `a' and `b' then adding 'c', with no intermediate rounding step after the
| multiplication.  The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic 754-2008.
| The flags argument allows the caller to select negation of the
| addend, the intermediate product, or the final result. (The difference
| between this and having the caller do a separate negation is that negating
| externally will flip the sign bit on NaNs.)
*----------------------------------------------------------------------------*/

float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
                       float_status *status)
{
    flag aSign, bSign, cSign, zSign;
    int aExp, bExp, cExp, pExp, zExp, expDiff;
    uint64_t aSig, bSig, cSig;
    flag pInf, pZero, pSign;
    uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
    int shiftcount;
    flag signflip, infzero;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    c = float64_squash_input_denormal(c, status);
    aSig = extractFloat64Frac(a);
    aExp = extractFloat64Exp(a);
    aSign = extractFloat64Sign(a);
    bSig = extractFloat64Frac(b);
    bExp = extractFloat64Exp(b);
    bSign = extractFloat64Sign(b);
    cSig = extractFloat64Frac(c);
    cExp = extractFloat64Exp(c);
    cSign = extractFloat64Sign(c);

    /* True when the product is zero times infinity, in either order */
    infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
               (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (((aExp == 0x7ff) && aSig) ||
        ((bExp == 0x7ff) && bSig) ||
        ((cExp == 0x7ff) && cSig)) {
        return propagateFloat64MulAddNaN(a, b, c, infzero, status);
    }

    /* No NaN operand: zero times infinity is simply invalid */
    if (infzero) {
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }

    if (flags & float_muladd_negate_c) {
        cSign ^= 1;
    }

    /* signflip is XOR-ed into the sign of every result returned below */
    signflip = (flags & float_muladd_negate_result) ? 1 : 0;

    /* Work out the sign and type of the product */
    pSign = aSign ^ bSign;
    if (flags & float_muladd_negate_product) {
        pSign ^= 1;
    }
    pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);

    if (cExp == 0x7ff) {
        if (pInf && (pSign ^ cSign)) {
            /* addition of opposite-signed infinities => InvalidOperation */
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        /* Otherwise generate an infinity of the same sign */
        return packFloat64(cSign ^ signflip, 0x7ff, 0);
    }

    if (pInf) {
        return packFloat64(pSign ^ signflip, 0x7ff, 0);
    }

    if (pZero) {
        if (cExp == 0) {
            if (cSig == 0) {
                /* Adding two exact zeroes */
                if (pSign == cSign) {
                    zSign = pSign;
                } else if (status->float_rounding_mode == float_round_down) {
                    zSign = 1;
                } else {
                    zSign = 0;
                }
                return packFloat64(zSign ^ signflip, 0, 0);
            }
            /* Exact zero plus a denorm */
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(cSign ^ signflip, 0, 0);
            }
        }
        /* Zero plus something non-zero : just return the something */
        if (flags & float_muladd_halve_result) {
            if (cExp == 0) {
                normalizeFloat64Subnormal(cSig, &cExp, &cSig);
            }
            /* Subtract one to halve, and one again because roundAndPackFloat64
             * wants one less than the true exponent.
             */
            cExp -= 2;
            cSig = (cSig | 0x0010000000000000ULL) << 10;
            return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
        }
        return packFloat64(cSign ^ signflip, cExp, cSig);
    }

    /* From here on the product is finite and non-zero */
    if (aExp == 0) {
        normalizeFloat64Subnormal(aSig, &aExp, &aSig);
    }
    if (bExp == 0) {
        normalizeFloat64Subnormal(bSig, &bExp, &bSig);
    }

    /* Calculate the actual result a * b + c */

    /* Multiply first; this is easy. */
    /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
     * because we want the true exponent, not the "one-less-than"
     * flavour that roundAndPackFloat64() takes.
     */
    pExp = aExp + bExp - 0x3fe;
    aSig = (aSig | LIT64(0x0010000000000000))<<10;
    bSig = (bSig | LIT64(0x0010000000000000))<<11;
    mul64To128(aSig, bSig, &pSig0, &pSig1);
    if ((int64_t)(pSig0 << 1) >= 0) {
        shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
        pExp--;
    }

    zSign = pSign ^ signflip;

    /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
     * bit in position 126.
     */
    if (cExp == 0) {
        if (!cSig) {
            /* Throw out the special case of c being an exact zero now */
            shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
            if (flags & float_muladd_halve_result) {
                pExp--;
            }
            return roundAndPackFloat64(zSign, pExp - 1,
                                       pSig1, status);
        }
        normalizeFloat64Subnormal(cSig, &cExp, &cSig);
    }

    /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
     * significand of the addend, with the explicit bit in position 126.
     */
    cSig0 = cSig << (126 - 64 - 52);
    cSig1 = 0;
    cSig0 |= LIT64(0x4000000000000000);
    expDiff = pExp - cExp;

    if (pSign == cSign) {
        /* Addition */
        if (expDiff > 0) {
            /* scale c to match p */
            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
            zExp = pExp;
        } else if (expDiff < 0) {
            /* scale p to match c */
            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
            zExp = cExp;
        } else {
            /* no scaling needed */
            zExp = cExp;
        }
        /* Add significands and make sure explicit bit ends up in posn 126 */
        add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
        if ((int64_t)zSig0 < 0) {
            shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
        } else {
            zExp--;
        }
        /* Collapse to the high 64 bits, jamming the low word into a sticky
         * bit, before the final rounding.
         */
        shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
        if (flags & float_muladd_halve_result) {
            zExp--;
        }
        return roundAndPackFloat64(zSign, zExp, zSig1, status);
    } else {
        /* Subtraction */
        /* The larger operand is the minuend; when that is c rather than
         * the product, the result sign is flipped.
         */
        if (expDiff > 0) {
            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
            sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
            zExp = pExp;
        } else if (expDiff < 0) {
            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
            sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
            zExp = cExp;
            zSign ^= 1;
        } else {
            zExp = pExp;
            if (lt128(cSig0, cSig1, pSig0, pSig1)) {
                sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
            } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
                sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
                zSign ^= 1;
            } else {
                /* Exact zero */
                zSign = signflip;
                if (status->float_rounding_mode == float_round_down) {
                    zSign ^= 1;
                }
                return packFloat64(zSign, 0, 0);
            }
        }
        --zExp;
        /* Do the equivalent of normalizeRoundAndPackFloat64() but
         * starting with the significand in a pair of uint64_t.
         */
        if (zSig0) {
            shiftcount = countLeadingZeros64(zSig0) - 1;
            shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
            if (zSig1) {
                /* Anything left in the low word becomes a sticky bit */
                zSig0 |= 1;
            }
            zExp -= shiftcount;
        } else {
            /* The high word is zero, so the result lives in the low word */
            shiftcount = countLeadingZeros64(zSig1);
            if (shiftcount == 0) {
                /* Top bit set: shift right by one, keeping a sticky bit */
                zSig0 = (zSig1 >> 1) | (zSig1 & 1);
                zExp -= 63;
            } else {
                shiftcount--;
                zSig0 = zSig1 << shiftcount;
                zExp -= (shiftcount + 64);
            }
        }
        if (flags & float_muladd_halve_result) {
            zExp--;
        }
        return roundAndPackFloat64(zSign, zExp, zSig0, status);
    }
}

/*----------------------------------------------------------------------------
| Returns the square root of the double-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float64 float64_sqrt(float64 a, float_status *status)
{
    flag aSign;
    int aExp, zExp;
    uint64_t aSig, zSig, doubleZSig;
    uint64_t rem0, rem1, term0, term1;
    a = float64_squash_input_denormal(a, status);

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return propagateFloat64NaN(a, a, status);
        }
        /* sqrt(+inf) is +inf; sqrt(-inf) is an invalid operation */
        if ( ! aSign ) return a;
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( aSign ) {
        /* -0 is returned unchanged; any other negative input is invalid */
        if ( ( aExp | aSig ) == 0 ) return a;
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return float64_zero;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    /* Halve the unbiased exponent, then rebias */
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
    aSig |= LIT64( 0x0010000000000000 );
    /* Start from a 32-bit root estimate and refine it with one division */
    zSig = estimateSqrt32( aExp, aSig>>21 );
    aSig <<= 9 - ( aExp & 1 );
    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
    /* Only when the low nine bits of the estimate are 5 or less is the
     * exact remainder a - zSig*zSig computed: the estimate is stepped down
     * while the remainder is negative, then a sticky bit is set if any
     * remainder is left.
     */
    if ( ( zSig & 0x1FF ) <= 5 ) {
        doubleZSig = zSig<<1;
        mul64To128( zSig, zSig, &term0, &term1 );
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
        while ( (int64_t) rem0 < 0 ) {
            --zSig;
            doubleZSig -= 2;
            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
        }
        zSig |= ( ( rem0 | rem1 ) != 0 );
    }
    return roundAndPackFloat64(0, zExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the binary log of the double-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
4680 *----------------------------------------------------------------------------*/ 4681 float64 float64_log2(float64 a, float_status *status) 4682 { 4683 flag aSign, zSign; 4684 int aExp; 4685 uint64_t aSig, aSig0, aSig1, zSig, i; 4686 a = float64_squash_input_denormal(a, status); 4687 4688 aSig = extractFloat64Frac( a ); 4689 aExp = extractFloat64Exp( a ); 4690 aSign = extractFloat64Sign( a ); 4691 4692 if ( aExp == 0 ) { 4693 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4694 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4695 } 4696 if ( aSign ) { 4697 float_raise(float_flag_invalid, status); 4698 return float64_default_nan(status); 4699 } 4700 if ( aExp == 0x7FF ) { 4701 if (aSig) { 4702 return propagateFloat64NaN(a, float64_zero, status); 4703 } 4704 return a; 4705 } 4706 4707 aExp -= 0x3FF; 4708 aSig |= LIT64( 0x0010000000000000 ); 4709 zSign = aExp < 0; 4710 zSig = (uint64_t)aExp << 52; 4711 for (i = 1LL << 51; i > 0; i >>= 1) { 4712 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4713 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4714 if ( aSig & LIT64( 0x0020000000000000 ) ) { 4715 aSig >>= 1; 4716 zSig |= i; 4717 } 4718 } 4719 4720 if ( zSign ) 4721 zSig = -zSig; 4722 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 4723 } 4724 4725 /*---------------------------------------------------------------------------- 4726 | Returns 1 if the double-precision floating-point value `a' is equal to the 4727 | corresponding value `b', and 0 otherwise. The invalid exception is raised 4728 | if either operand is a NaN. Otherwise, the comparison is performed 4729 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4730 *----------------------------------------------------------------------------*/ 4731 4732 int float64_eq(float64 a, float64 b, float_status *status) 4733 { 4734 uint64_t av, bv; 4735 a = float64_squash_input_denormal(a, status); 4736 b = float64_squash_input_denormal(b, status); 4737 4738 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4739 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4740 ) { 4741 float_raise(float_flag_invalid, status); 4742 return 0; 4743 } 4744 av = float64_val(a); 4745 bv = float64_val(b); 4746 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4747 4748 } 4749 4750 /*---------------------------------------------------------------------------- 4751 | Returns 1 if the double-precision floating-point value `a' is less than or 4752 | equal to the corresponding value `b', and 0 otherwise. The invalid 4753 | exception is raised if either operand is a NaN. The comparison is performed 4754 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4755 *----------------------------------------------------------------------------*/ 4756 4757 int float64_le(float64 a, float64 b, float_status *status) 4758 { 4759 flag aSign, bSign; 4760 uint64_t av, bv; 4761 a = float64_squash_input_denormal(a, status); 4762 b = float64_squash_input_denormal(b, status); 4763 4764 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4765 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4766 ) { 4767 float_raise(float_flag_invalid, status); 4768 return 0; 4769 } 4770 aSign = extractFloat64Sign( a ); 4771 bSign = extractFloat64Sign( b ); 4772 av = float64_val(a); 4773 bv = float64_val(b); 4774 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4775 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4776 4777 } 4778 4779 /*---------------------------------------------------------------------------- 4780 | Returns 1 if the double-precision floating-point value `a' is less than 4781 | the corresponding value `b', and 0 otherwise. The invalid exception is 4782 | raised if either operand is a NaN. The comparison is performed according 4783 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4784 *----------------------------------------------------------------------------*/ 4785 4786 int float64_lt(float64 a, float64 b, float_status *status) 4787 { 4788 flag aSign, bSign; 4789 uint64_t av, bv; 4790 4791 a = float64_squash_input_denormal(a, status); 4792 b = float64_squash_input_denormal(b, status); 4793 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4794 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4795 ) { 4796 float_raise(float_flag_invalid, status); 4797 return 0; 4798 } 4799 aSign = extractFloat64Sign( a ); 4800 bSign = extractFloat64Sign( b ); 4801 av = float64_val(a); 4802 bv = float64_val(b); 4803 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4804 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4805 4806 } 4807 4808 /*---------------------------------------------------------------------------- 4809 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4810 | be compared, and 0 otherwise. The invalid exception is raised if either 4811 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4812 | Standard for Binary Floating-Point Arithmetic. 4813 *----------------------------------------------------------------------------*/ 4814 4815 int float64_unordered(float64 a, float64 b, float_status *status) 4816 { 4817 a = float64_squash_input_denormal(a, status); 4818 b = float64_squash_input_denormal(b, status); 4819 4820 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4821 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4822 ) { 4823 float_raise(float_flag_invalid, status); 4824 return 1; 4825 } 4826 return 0; 4827 } 4828 4829 /*---------------------------------------------------------------------------- 4830 | Returns 1 if the double-precision floating-point value `a' is equal to the 4831 | corresponding value `b', and 0 otherwise. 
Quiet NaNs do not cause an 4832 | exception.The comparison is performed according to the IEC/IEEE Standard 4833 | for Binary Floating-Point Arithmetic. 4834 *----------------------------------------------------------------------------*/ 4835 4836 int float64_eq_quiet(float64 a, float64 b, float_status *status) 4837 { 4838 uint64_t av, bv; 4839 a = float64_squash_input_denormal(a, status); 4840 b = float64_squash_input_denormal(b, status); 4841 4842 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4843 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4844 ) { 4845 if (float64_is_signaling_nan(a, status) 4846 || float64_is_signaling_nan(b, status)) { 4847 float_raise(float_flag_invalid, status); 4848 } 4849 return 0; 4850 } 4851 av = float64_val(a); 4852 bv = float64_val(b); 4853 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4854 4855 } 4856 4857 /*---------------------------------------------------------------------------- 4858 | Returns 1 if the double-precision floating-point value `a' is less than or 4859 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4860 | cause an exception. Otherwise, the comparison is performed according to the 4861 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4862 *----------------------------------------------------------------------------*/ 4863 4864 int float64_le_quiet(float64 a, float64 b, float_status *status) 4865 { 4866 flag aSign, bSign; 4867 uint64_t av, bv; 4868 a = float64_squash_input_denormal(a, status); 4869 b = float64_squash_input_denormal(b, status); 4870 4871 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4872 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4873 ) { 4874 if (float64_is_signaling_nan(a, status) 4875 || float64_is_signaling_nan(b, status)) { 4876 float_raise(float_flag_invalid, status); 4877 } 4878 return 0; 4879 } 4880 aSign = extractFloat64Sign( a ); 4881 bSign = extractFloat64Sign( b ); 4882 av = float64_val(a); 4883 bv = float64_val(b); 4884 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4885 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4886 4887 } 4888 4889 /*---------------------------------------------------------------------------- 4890 | Returns 1 if the double-precision floating-point value `a' is less than 4891 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4892 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4893 | Standard for Binary Floating-Point Arithmetic. 
4894 *----------------------------------------------------------------------------*/ 4895 4896 int float64_lt_quiet(float64 a, float64 b, float_status *status) 4897 { 4898 flag aSign, bSign; 4899 uint64_t av, bv; 4900 a = float64_squash_input_denormal(a, status); 4901 b = float64_squash_input_denormal(b, status); 4902 4903 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4904 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4905 ) { 4906 if (float64_is_signaling_nan(a, status) 4907 || float64_is_signaling_nan(b, status)) { 4908 float_raise(float_flag_invalid, status); 4909 } 4910 return 0; 4911 } 4912 aSign = extractFloat64Sign( a ); 4913 bSign = extractFloat64Sign( b ); 4914 av = float64_val(a); 4915 bv = float64_val(b); 4916 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4917 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4918 4919 } 4920 4921 /*---------------------------------------------------------------------------- 4922 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4923 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4924 | comparison is performed according to the IEC/IEEE Standard for Binary 4925 | Floating-Point Arithmetic. 
4926 *----------------------------------------------------------------------------*/ 4927 4928 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 4929 { 4930 a = float64_squash_input_denormal(a, status); 4931 b = float64_squash_input_denormal(b, status); 4932 4933 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4934 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4935 ) { 4936 if (float64_is_signaling_nan(a, status) 4937 || float64_is_signaling_nan(b, status)) { 4938 float_raise(float_flag_invalid, status); 4939 } 4940 return 1; 4941 } 4942 return 0; 4943 } 4944 4945 /*---------------------------------------------------------------------------- 4946 | Returns the result of converting the extended double-precision floating- 4947 | point value `a' to the 32-bit two's complement integer format. The 4948 | conversion is performed according to the IEC/IEEE Standard for Binary 4949 | Floating-Point Arithmetic---which means in particular that the conversion 4950 | is rounded according to the current rounding mode. If `a' is a NaN, the 4951 | largest positive integer is returned. Otherwise, if the conversion 4952 | overflows, the largest integer with the same sign as `a' is returned. 
4953 *----------------------------------------------------------------------------*/ 4954 4955 int32_t floatx80_to_int32(floatx80 a, float_status *status) 4956 { 4957 flag aSign; 4958 int32_t aExp, shiftCount; 4959 uint64_t aSig; 4960 4961 if (floatx80_invalid_encoding(a)) { 4962 float_raise(float_flag_invalid, status); 4963 return 1 << 31; 4964 } 4965 aSig = extractFloatx80Frac( a ); 4966 aExp = extractFloatx80Exp( a ); 4967 aSign = extractFloatx80Sign( a ); 4968 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4969 shiftCount = 0x4037 - aExp; 4970 if ( shiftCount <= 0 ) shiftCount = 1; 4971 shift64RightJamming( aSig, shiftCount, &aSig ); 4972 return roundAndPackInt32(aSign, aSig, status); 4973 4974 } 4975 4976 /*---------------------------------------------------------------------------- 4977 | Returns the result of converting the extended double-precision floating- 4978 | point value `a' to the 32-bit two's complement integer format. The 4979 | conversion is performed according to the IEC/IEEE Standard for Binary 4980 | Floating-Point Arithmetic, except that the conversion is always rounded 4981 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4982 | Otherwise, if the conversion overflows, the largest integer with the same 4983 | sign as `a' is returned. 
4984 *----------------------------------------------------------------------------*/ 4985 4986 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 4987 { 4988 flag aSign; 4989 int32_t aExp, shiftCount; 4990 uint64_t aSig, savedASig; 4991 int32_t z; 4992 4993 if (floatx80_invalid_encoding(a)) { 4994 float_raise(float_flag_invalid, status); 4995 return 1 << 31; 4996 } 4997 aSig = extractFloatx80Frac( a ); 4998 aExp = extractFloatx80Exp( a ); 4999 aSign = extractFloatx80Sign( a ); 5000 if ( 0x401E < aExp ) { 5001 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5002 goto invalid; 5003 } 5004 else if ( aExp < 0x3FFF ) { 5005 if (aExp || aSig) { 5006 status->float_exception_flags |= float_flag_inexact; 5007 } 5008 return 0; 5009 } 5010 shiftCount = 0x403E - aExp; 5011 savedASig = aSig; 5012 aSig >>= shiftCount; 5013 z = aSig; 5014 if ( aSign ) z = - z; 5015 if ( ( z < 0 ) ^ aSign ) { 5016 invalid: 5017 float_raise(float_flag_invalid, status); 5018 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5019 } 5020 if ( ( aSig<<shiftCount ) != savedASig ) { 5021 status->float_exception_flags |= float_flag_inexact; 5022 } 5023 return z; 5024 5025 } 5026 5027 /*---------------------------------------------------------------------------- 5028 | Returns the result of converting the extended double-precision floating- 5029 | point value `a' to the 64-bit two's complement integer format. The 5030 | conversion is performed according to the IEC/IEEE Standard for Binary 5031 | Floating-Point Arithmetic---which means in particular that the conversion 5032 | is rounded according to the current rounding mode. If `a' is a NaN, 5033 | the largest positive integer is returned. Otherwise, if the conversion 5034 | overflows, the largest integer with the same sign as `a' is returned. 
5035 *----------------------------------------------------------------------------*/ 5036 5037 int64_t floatx80_to_int64(floatx80 a, float_status *status) 5038 { 5039 flag aSign; 5040 int32_t aExp, shiftCount; 5041 uint64_t aSig, aSigExtra; 5042 5043 if (floatx80_invalid_encoding(a)) { 5044 float_raise(float_flag_invalid, status); 5045 return 1ULL << 63; 5046 } 5047 aSig = extractFloatx80Frac( a ); 5048 aExp = extractFloatx80Exp( a ); 5049 aSign = extractFloatx80Sign( a ); 5050 shiftCount = 0x403E - aExp; 5051 if ( shiftCount <= 0 ) { 5052 if ( shiftCount ) { 5053 float_raise(float_flag_invalid, status); 5054 if ( ! aSign 5055 || ( ( aExp == 0x7FFF ) 5056 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 5057 ) { 5058 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5059 } 5060 return (int64_t) LIT64( 0x8000000000000000 ); 5061 } 5062 aSigExtra = 0; 5063 } 5064 else { 5065 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 5066 } 5067 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 5068 5069 } 5070 5071 /*---------------------------------------------------------------------------- 5072 | Returns the result of converting the extended double-precision floating- 5073 | point value `a' to the 64-bit two's complement integer format. The 5074 | conversion is performed according to the IEC/IEEE Standard for Binary 5075 | Floating-Point Arithmetic, except that the conversion is always rounded 5076 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5077 | Otherwise, if the conversion overflows, the largest integer with the same 5078 | sign as `a' is returned. 
5079 *----------------------------------------------------------------------------*/ 5080 5081 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 5082 { 5083 flag aSign; 5084 int32_t aExp, shiftCount; 5085 uint64_t aSig; 5086 int64_t z; 5087 5088 if (floatx80_invalid_encoding(a)) { 5089 float_raise(float_flag_invalid, status); 5090 return 1ULL << 63; 5091 } 5092 aSig = extractFloatx80Frac( a ); 5093 aExp = extractFloatx80Exp( a ); 5094 aSign = extractFloatx80Sign( a ); 5095 shiftCount = aExp - 0x403E; 5096 if ( 0 <= shiftCount ) { 5097 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 5098 if ( ( a.high != 0xC03E ) || aSig ) { 5099 float_raise(float_flag_invalid, status); 5100 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 5101 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5102 } 5103 } 5104 return (int64_t) LIT64( 0x8000000000000000 ); 5105 } 5106 else if ( aExp < 0x3FFF ) { 5107 if (aExp | aSig) { 5108 status->float_exception_flags |= float_flag_inexact; 5109 } 5110 return 0; 5111 } 5112 z = aSig>>( - shiftCount ); 5113 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 5114 status->float_exception_flags |= float_flag_inexact; 5115 } 5116 if ( aSign ) z = - z; 5117 return z; 5118 5119 } 5120 5121 /*---------------------------------------------------------------------------- 5122 | Returns the result of converting the extended double-precision floating- 5123 | point value `a' to the single-precision floating-point format. The 5124 | conversion is performed according to the IEC/IEEE Standard for Binary 5125 | Floating-Point Arithmetic. 
5126 *----------------------------------------------------------------------------*/ 5127 5128 float32 floatx80_to_float32(floatx80 a, float_status *status) 5129 { 5130 flag aSign; 5131 int32_t aExp; 5132 uint64_t aSig; 5133 5134 if (floatx80_invalid_encoding(a)) { 5135 float_raise(float_flag_invalid, status); 5136 return float32_default_nan(status); 5137 } 5138 aSig = extractFloatx80Frac( a ); 5139 aExp = extractFloatx80Exp( a ); 5140 aSign = extractFloatx80Sign( a ); 5141 if ( aExp == 0x7FFF ) { 5142 if ( (uint64_t) ( aSig<<1 ) ) { 5143 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 5144 } 5145 return packFloat32( aSign, 0xFF, 0 ); 5146 } 5147 shift64RightJamming( aSig, 33, &aSig ); 5148 if ( aExp || aSig ) aExp -= 0x3F81; 5149 return roundAndPackFloat32(aSign, aExp, aSig, status); 5150 5151 } 5152 5153 /*---------------------------------------------------------------------------- 5154 | Returns the result of converting the extended double-precision floating- 5155 | point value `a' to the double-precision floating-point format. The 5156 | conversion is performed according to the IEC/IEEE Standard for Binary 5157 | Floating-Point Arithmetic. 
5158 *----------------------------------------------------------------------------*/ 5159 5160 float64 floatx80_to_float64(floatx80 a, float_status *status) 5161 { 5162 flag aSign; 5163 int32_t aExp; 5164 uint64_t aSig, zSig; 5165 5166 if (floatx80_invalid_encoding(a)) { 5167 float_raise(float_flag_invalid, status); 5168 return float64_default_nan(status); 5169 } 5170 aSig = extractFloatx80Frac( a ); 5171 aExp = extractFloatx80Exp( a ); 5172 aSign = extractFloatx80Sign( a ); 5173 if ( aExp == 0x7FFF ) { 5174 if ( (uint64_t) ( aSig<<1 ) ) { 5175 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 5176 } 5177 return packFloat64( aSign, 0x7FF, 0 ); 5178 } 5179 shift64RightJamming( aSig, 1, &zSig ); 5180 if ( aExp || aSig ) aExp -= 0x3C01; 5181 return roundAndPackFloat64(aSign, aExp, zSig, status); 5182 5183 } 5184 5185 /*---------------------------------------------------------------------------- 5186 | Returns the result of converting the extended double-precision floating- 5187 | point value `a' to the quadruple-precision floating-point format. The 5188 | conversion is performed according to the IEC/IEEE Standard for Binary 5189 | Floating-Point Arithmetic. 
5190 *----------------------------------------------------------------------------*/ 5191 5192 float128 floatx80_to_float128(floatx80 a, float_status *status) 5193 { 5194 flag aSign; 5195 int aExp; 5196 uint64_t aSig, zSig0, zSig1; 5197 5198 if (floatx80_invalid_encoding(a)) { 5199 float_raise(float_flag_invalid, status); 5200 return float128_default_nan(status); 5201 } 5202 aSig = extractFloatx80Frac( a ); 5203 aExp = extractFloatx80Exp( a ); 5204 aSign = extractFloatx80Sign( a ); 5205 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5206 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 5207 } 5208 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5209 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5210 5211 } 5212 5213 /*---------------------------------------------------------------------------- 5214 | Rounds the extended double-precision floating-point value `a' 5215 | to the precision provided by floatx80_rounding_precision and returns the 5216 | result as an extended double-precision floating-point value. 5217 | The operation is performed according to the IEC/IEEE Standard for Binary 5218 | Floating-Point Arithmetic. 5219 *----------------------------------------------------------------------------*/ 5220 5221 floatx80 floatx80_round(floatx80 a, float_status *status) 5222 { 5223 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5224 extractFloatx80Sign(a), 5225 extractFloatx80Exp(a), 5226 extractFloatx80Frac(a), 0, status); 5227 } 5228 5229 /*---------------------------------------------------------------------------- 5230 | Rounds the extended double-precision floating-point value `a' to an integer, 5231 | and returns the result as an extended quadruple-precision floating-point 5232 | value. The operation is performed according to the IEC/IEEE Standard for 5233 | Binary Floating-Point Arithmetic. 
5234 *----------------------------------------------------------------------------*/ 5235 5236 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 5237 { 5238 flag aSign; 5239 int32_t aExp; 5240 uint64_t lastBitMask, roundBitsMask; 5241 floatx80 z; 5242 5243 if (floatx80_invalid_encoding(a)) { 5244 float_raise(float_flag_invalid, status); 5245 return floatx80_default_nan(status); 5246 } 5247 aExp = extractFloatx80Exp( a ); 5248 if ( 0x403E <= aExp ) { 5249 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 5250 return propagateFloatx80NaN(a, a, status); 5251 } 5252 return a; 5253 } 5254 if ( aExp < 0x3FFF ) { 5255 if ( ( aExp == 0 ) 5256 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 5257 return a; 5258 } 5259 status->float_exception_flags |= float_flag_inexact; 5260 aSign = extractFloatx80Sign( a ); 5261 switch (status->float_rounding_mode) { 5262 case float_round_nearest_even: 5263 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 5264 ) { 5265 return 5266 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5267 } 5268 break; 5269 case float_round_ties_away: 5270 if (aExp == 0x3FFE) { 5271 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 5272 } 5273 break; 5274 case float_round_down: 5275 return 5276 aSign ? 5277 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 5278 : packFloatx80( 0, 0, 0 ); 5279 case float_round_up: 5280 return 5281 aSign ? 
packFloatx80( 1, 0, 0 ) 5282 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5283 } 5284 return packFloatx80( aSign, 0, 0 ); 5285 } 5286 lastBitMask = 1; 5287 lastBitMask <<= 0x403E - aExp; 5288 roundBitsMask = lastBitMask - 1; 5289 z = a; 5290 switch (status->float_rounding_mode) { 5291 case float_round_nearest_even: 5292 z.low += lastBitMask>>1; 5293 if ((z.low & roundBitsMask) == 0) { 5294 z.low &= ~lastBitMask; 5295 } 5296 break; 5297 case float_round_ties_away: 5298 z.low += lastBitMask >> 1; 5299 break; 5300 case float_round_to_zero: 5301 break; 5302 case float_round_up: 5303 if (!extractFloatx80Sign(z)) { 5304 z.low += roundBitsMask; 5305 } 5306 break; 5307 case float_round_down: 5308 if (extractFloatx80Sign(z)) { 5309 z.low += roundBitsMask; 5310 } 5311 break; 5312 default: 5313 abort(); 5314 } 5315 z.low &= ~ roundBitsMask; 5316 if ( z.low == 0 ) { 5317 ++z.high; 5318 z.low = LIT64( 0x8000000000000000 ); 5319 } 5320 if (z.low != a.low) { 5321 status->float_exception_flags |= float_flag_inexact; 5322 } 5323 return z; 5324 5325 } 5326 5327 /*---------------------------------------------------------------------------- 5328 | Returns the result of adding the absolute values of the extended double- 5329 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 5330 | negated before being returned. `zSign' is ignored if the result is a NaN. 5331 | The addition is performed according to the IEC/IEEE Standard for Binary 5332 | Floating-Point Arithmetic. 
5333 *----------------------------------------------------------------------------*/ 5334 5335 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5336 float_status *status) 5337 { 5338 int32_t aExp, bExp, zExp; 5339 uint64_t aSig, bSig, zSig0, zSig1; 5340 int32_t expDiff; 5341 5342 aSig = extractFloatx80Frac( a ); 5343 aExp = extractFloatx80Exp( a ); 5344 bSig = extractFloatx80Frac( b ); 5345 bExp = extractFloatx80Exp( b ); 5346 expDiff = aExp - bExp; 5347 if ( 0 < expDiff ) { 5348 if ( aExp == 0x7FFF ) { 5349 if ((uint64_t)(aSig << 1)) { 5350 return propagateFloatx80NaN(a, b, status); 5351 } 5352 return a; 5353 } 5354 if ( bExp == 0 ) --expDiff; 5355 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5356 zExp = aExp; 5357 } 5358 else if ( expDiff < 0 ) { 5359 if ( bExp == 0x7FFF ) { 5360 if ((uint64_t)(bSig << 1)) { 5361 return propagateFloatx80NaN(a, b, status); 5362 } 5363 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5364 } 5365 if ( aExp == 0 ) ++expDiff; 5366 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5367 zExp = bExp; 5368 } 5369 else { 5370 if ( aExp == 0x7FFF ) { 5371 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5372 return propagateFloatx80NaN(a, b, status); 5373 } 5374 return a; 5375 } 5376 zSig1 = 0; 5377 zSig0 = aSig + bSig; 5378 if ( aExp == 0 ) { 5379 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 5380 goto roundAndPack; 5381 } 5382 zExp = aExp; 5383 goto shiftRight1; 5384 } 5385 zSig0 = aSig + bSig; 5386 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 5387 shiftRight1: 5388 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5389 zSig0 |= LIT64( 0x8000000000000000 ); 5390 ++zExp; 5391 roundAndPack: 5392 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5393 zSign, zExp, zSig0, zSig1, status); 5394 } 5395 5396 /*---------------------------------------------------------------------------- 5397 | Returns the result of subtracting the absolute values of the 
extended 5398 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 5399 | difference is negated before being returned. `zSign' is ignored if the 5400 | result is a NaN. The subtraction is performed according to the IEC/IEEE 5401 | Standard for Binary Floating-Point Arithmetic. 5402 *----------------------------------------------------------------------------*/ 5403 5404 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5405 float_status *status) 5406 { 5407 int32_t aExp, bExp, zExp; 5408 uint64_t aSig, bSig, zSig0, zSig1; 5409 int32_t expDiff; 5410 5411 aSig = extractFloatx80Frac( a ); 5412 aExp = extractFloatx80Exp( a ); 5413 bSig = extractFloatx80Frac( b ); 5414 bExp = extractFloatx80Exp( b ); 5415 expDiff = aExp - bExp; 5416 if ( 0 < expDiff ) goto aExpBigger; 5417 if ( expDiff < 0 ) goto bExpBigger; 5418 if ( aExp == 0x7FFF ) { 5419 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5420 return propagateFloatx80NaN(a, b, status); 5421 } 5422 float_raise(float_flag_invalid, status); 5423 return floatx80_default_nan(status); 5424 } 5425 if ( aExp == 0 ) { 5426 aExp = 1; 5427 bExp = 1; 5428 } 5429 zSig1 = 0; 5430 if ( bSig < aSig ) goto aBigger; 5431 if ( aSig < bSig ) goto bBigger; 5432 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 5433 bExpBigger: 5434 if ( bExp == 0x7FFF ) { 5435 if ((uint64_t)(bSig << 1)) { 5436 return propagateFloatx80NaN(a, b, status); 5437 } 5438 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5439 } 5440 if ( aExp == 0 ) ++expDiff; 5441 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5442 bBigger: 5443 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 5444 zExp = bExp; 5445 zSign ^= 1; 5446 goto normalizeRoundAndPack; 5447 aExpBigger: 5448 if ( aExp == 0x7FFF ) { 5449 if ((uint64_t)(aSig << 1)) { 5450 return propagateFloatx80NaN(a, b, status); 5451 } 5452 return a; 5453 } 5454 if ( bExp == 0 ) --expDiff; 5455 shift128RightJamming( bSig, 0, 
expDiff, &bSig, &zSig1 ); 5456 aBigger: 5457 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 5458 zExp = aExp; 5459 normalizeRoundAndPack: 5460 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 5461 zSign, zExp, zSig0, zSig1, status); 5462 } 5463 5464 /*---------------------------------------------------------------------------- 5465 | Returns the result of adding the extended double-precision floating-point 5466 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5467 | Standard for Binary Floating-Point Arithmetic. 5468 *----------------------------------------------------------------------------*/ 5469 5470 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 5471 { 5472 flag aSign, bSign; 5473 5474 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5475 float_raise(float_flag_invalid, status); 5476 return floatx80_default_nan(status); 5477 } 5478 aSign = extractFloatx80Sign( a ); 5479 bSign = extractFloatx80Sign( b ); 5480 if ( aSign == bSign ) { 5481 return addFloatx80Sigs(a, b, aSign, status); 5482 } 5483 else { 5484 return subFloatx80Sigs(a, b, aSign, status); 5485 } 5486 5487 } 5488 5489 /*---------------------------------------------------------------------------- 5490 | Returns the result of subtracting the extended double-precision floating- 5491 | point values `a' and `b'. The operation is performed according to the 5492 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5493 *----------------------------------------------------------------------------*/ 5494 5495 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5496 { 5497 flag aSign, bSign; 5498 5499 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5500 float_raise(float_flag_invalid, status); 5501 return floatx80_default_nan(status); 5502 } 5503 aSign = extractFloatx80Sign( a ); 5504 bSign = extractFloatx80Sign( b ); 5505 if ( aSign == bSign ) { 5506 return subFloatx80Sigs(a, b, aSign, status); 5507 } 5508 else { 5509 return addFloatx80Sigs(a, b, aSign, status); 5510 } 5511 5512 } 5513 5514 /*---------------------------------------------------------------------------- 5515 | Returns the result of multiplying the extended double-precision floating- 5516 | point values `a' and `b'. The operation is performed according to the 5517 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5518 *----------------------------------------------------------------------------*/ 5519 5520 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5521 { 5522 flag aSign, bSign, zSign; 5523 int32_t aExp, bExp, zExp; 5524 uint64_t aSig, bSig, zSig0, zSig1; 5525 5526 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5527 float_raise(float_flag_invalid, status); 5528 return floatx80_default_nan(status); 5529 } 5530 aSig = extractFloatx80Frac( a ); 5531 aExp = extractFloatx80Exp( a ); 5532 aSign = extractFloatx80Sign( a ); 5533 bSig = extractFloatx80Frac( b ); 5534 bExp = extractFloatx80Exp( b ); 5535 bSign = extractFloatx80Sign( b ); 5536 zSign = aSign ^ bSign; 5537 if ( aExp == 0x7FFF ) { 5538 if ( (uint64_t) ( aSig<<1 ) 5539 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5540 return propagateFloatx80NaN(a, b, status); 5541 } 5542 if ( ( bExp | bSig ) == 0 ) goto invalid; 5543 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5544 } 5545 if ( bExp == 0x7FFF ) { 5546 if ((uint64_t)(bSig << 1)) { 5547 
return propagateFloatx80NaN(a, b, status); 5548 } 5549 if ( ( aExp | aSig ) == 0 ) { 5550 invalid: 5551 float_raise(float_flag_invalid, status); 5552 return floatx80_default_nan(status); 5553 } 5554 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5555 } 5556 if ( aExp == 0 ) { 5557 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5558 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5559 } 5560 if ( bExp == 0 ) { 5561 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5562 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5563 } 5564 zExp = aExp + bExp - 0x3FFE; 5565 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 5566 if ( 0 < (int64_t) zSig0 ) { 5567 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5568 --zExp; 5569 } 5570 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5571 zSign, zExp, zSig0, zSig1, status); 5572 } 5573 5574 /*---------------------------------------------------------------------------- 5575 | Returns the result of dividing the extended double-precision floating-point 5576 | value `a' by the corresponding value `b'. The operation is performed 5577 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5578 *----------------------------------------------------------------------------*/ 5579 5580 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 5581 { 5582 flag aSign, bSign, zSign; 5583 int32_t aExp, bExp, zExp; 5584 uint64_t aSig, bSig, zSig0, zSig1; 5585 uint64_t rem0, rem1, rem2, term0, term1, term2; 5586 5587 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5588 float_raise(float_flag_invalid, status); 5589 return floatx80_default_nan(status); 5590 } 5591 aSig = extractFloatx80Frac( a ); 5592 aExp = extractFloatx80Exp( a ); 5593 aSign = extractFloatx80Sign( a ); 5594 bSig = extractFloatx80Frac( b ); 5595 bExp = extractFloatx80Exp( b ); 5596 bSign = extractFloatx80Sign( b ); 5597 zSign = aSign ^ bSign; 5598 if ( aExp == 0x7FFF ) { 5599 if ((uint64_t)(aSig << 1)) { 5600 return propagateFloatx80NaN(a, b, status); 5601 } 5602 if ( bExp == 0x7FFF ) { 5603 if ((uint64_t)(bSig << 1)) { 5604 return propagateFloatx80NaN(a, b, status); 5605 } 5606 goto invalid; 5607 } 5608 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5609 } 5610 if ( bExp == 0x7FFF ) { 5611 if ((uint64_t)(bSig << 1)) { 5612 return propagateFloatx80NaN(a, b, status); 5613 } 5614 return packFloatx80( zSign, 0, 0 ); 5615 } 5616 if ( bExp == 0 ) { 5617 if ( bSig == 0 ) { 5618 if ( ( aExp | aSig ) == 0 ) { 5619 invalid: 5620 float_raise(float_flag_invalid, status); 5621 return floatx80_default_nan(status); 5622 } 5623 float_raise(float_flag_divbyzero, status); 5624 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5625 } 5626 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5627 } 5628 if ( aExp == 0 ) { 5629 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5630 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5631 } 5632 zExp = aExp - bExp + 0x3FFE; 5633 rem1 = 0; 5634 if ( bSig <= aSig ) { 5635 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 5636 ++zExp; 5637 } 5638 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 5639 mul64To128( 
bSig, zSig0, &term0, &term1 ); 5640 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 5641 while ( (int64_t) rem0 < 0 ) { 5642 --zSig0; 5643 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 5644 } 5645 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 5646 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 5647 mul64To128( bSig, zSig1, &term1, &term2 ); 5648 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5649 while ( (int64_t) rem1 < 0 ) { 5650 --zSig1; 5651 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5652 } 5653 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5654 } 5655 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5656 zSign, zExp, zSig0, zSig1, status); 5657 } 5658 5659 /*---------------------------------------------------------------------------- 5660 | Returns the remainder of the extended double-precision floating-point value 5661 | `a' with respect to the corresponding value `b'. The operation is performed 5662 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5663 *----------------------------------------------------------------------------*/ 5664 5665 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 5666 { 5667 flag aSign, zSign; 5668 int32_t aExp, bExp, expDiff; 5669 uint64_t aSig0, aSig1, bSig; 5670 uint64_t q, term0, term1, alternateASig0, alternateASig1; 5671 5672 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5673 float_raise(float_flag_invalid, status); 5674 return floatx80_default_nan(status); 5675 } 5676 aSig0 = extractFloatx80Frac( a ); 5677 aExp = extractFloatx80Exp( a ); 5678 aSign = extractFloatx80Sign( a ); 5679 bSig = extractFloatx80Frac( b ); 5680 bExp = extractFloatx80Exp( b ); 5681 if ( aExp == 0x7FFF ) { 5682 if ( (uint64_t) ( aSig0<<1 ) 5683 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5684 return propagateFloatx80NaN(a, b, status); 5685 } 5686 goto invalid; 5687 } 5688 if ( bExp == 0x7FFF ) { 5689 if ((uint64_t)(bSig << 1)) { 5690 return propagateFloatx80NaN(a, b, status); 
5691 } 5692 return a; 5693 } 5694 if ( bExp == 0 ) { 5695 if ( bSig == 0 ) { 5696 invalid: 5697 float_raise(float_flag_invalid, status); 5698 return floatx80_default_nan(status); 5699 } 5700 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5701 } 5702 if ( aExp == 0 ) { 5703 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5704 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5705 } 5706 bSig |= LIT64( 0x8000000000000000 ); 5707 zSign = aSign; 5708 expDiff = aExp - bExp; 5709 aSig1 = 0; 5710 if ( expDiff < 0 ) { 5711 if ( expDiff < -1 ) return a; 5712 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 5713 expDiff = 0; 5714 } 5715 q = ( bSig <= aSig0 ); 5716 if ( q ) aSig0 -= bSig; 5717 expDiff -= 64; 5718 while ( 0 < expDiff ) { 5719 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5720 q = ( 2 < q ) ? q - 2 : 0; 5721 mul64To128( bSig, q, &term0, &term1 ); 5722 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5723 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 5724 expDiff -= 62; 5725 } 5726 expDiff += 64; 5727 if ( 0 < expDiff ) { 5728 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5729 q = ( 2 < q ) ? q - 2 : 0; 5730 q >>= 64 - expDiff; 5731 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 5732 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5733 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 5734 while ( le128( term0, term1, aSig0, aSig1 ) ) { 5735 ++q; 5736 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5737 } 5738 } 5739 else { 5740 term1 = 0; 5741 term0 = bSig; 5742 } 5743 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 5744 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5745 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5746 && ( q & 1 ) ) 5747 ) { 5748 aSig0 = alternateASig0; 5749 aSig1 = alternateASig1; 5750 zSign = ! 
zSign; 5751 } 5752 return 5753 normalizeRoundAndPackFloatx80( 5754 80, zSign, bExp + expDiff, aSig0, aSig1, status); 5755 5756 } 5757 5758 /*---------------------------------------------------------------------------- 5759 | Returns the square root of the extended double-precision floating-point 5760 | value `a'. The operation is performed according to the IEC/IEEE Standard 5761 | for Binary Floating-Point Arithmetic. 5762 *----------------------------------------------------------------------------*/ 5763 5764 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 5765 { 5766 flag aSign; 5767 int32_t aExp, zExp; 5768 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 5769 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5770 5771 if (floatx80_invalid_encoding(a)) { 5772 float_raise(float_flag_invalid, status); 5773 return floatx80_default_nan(status); 5774 } 5775 aSig0 = extractFloatx80Frac( a ); 5776 aExp = extractFloatx80Exp( a ); 5777 aSign = extractFloatx80Sign( a ); 5778 if ( aExp == 0x7FFF ) { 5779 if ((uint64_t)(aSig0 << 1)) { 5780 return propagateFloatx80NaN(a, a, status); 5781 } 5782 if ( ! 
aSign ) return a; 5783 goto invalid; 5784 } 5785 if ( aSign ) { 5786 if ( ( aExp | aSig0 ) == 0 ) return a; 5787 invalid: 5788 float_raise(float_flag_invalid, status); 5789 return floatx80_default_nan(status); 5790 } 5791 if ( aExp == 0 ) { 5792 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 5793 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5794 } 5795 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 5796 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 5797 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 5798 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5799 doubleZSig0 = zSig0<<1; 5800 mul64To128( zSig0, zSig0, &term0, &term1 ); 5801 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5802 while ( (int64_t) rem0 < 0 ) { 5803 --zSig0; 5804 doubleZSig0 -= 2; 5805 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5806 } 5807 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5808 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 5809 if ( zSig1 == 0 ) zSig1 = 1; 5810 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5811 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5812 mul64To128( zSig1, zSig1, &term2, &term3 ); 5813 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5814 while ( (int64_t) rem1 < 0 ) { 5815 --zSig1; 5816 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5817 term3 |= 1; 5818 term2 |= doubleZSig0; 5819 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5820 } 5821 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5822 } 5823 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 5824 zSig0 |= doubleZSig0; 5825 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5826 0, zExp, zSig0, zSig1, status); 5827 } 5828 5829 /*---------------------------------------------------------------------------- 5830 | Returns 1 if the extended double-precision floating-point value `a' is equal 5831 | to the corresponding value `b', and 0 otherwise. 
The invalid exception is 5832 | raised if either operand is a NaN. Otherwise, the comparison is performed 5833 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5834 *----------------------------------------------------------------------------*/ 5835 5836 int floatx80_eq(floatx80 a, floatx80 b, float_status *status) 5837 { 5838 5839 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5840 || (extractFloatx80Exp(a) == 0x7FFF 5841 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5842 || (extractFloatx80Exp(b) == 0x7FFF 5843 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5844 ) { 5845 float_raise(float_flag_invalid, status); 5846 return 0; 5847 } 5848 return 5849 ( a.low == b.low ) 5850 && ( ( a.high == b.high ) 5851 || ( ( a.low == 0 ) 5852 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5853 ); 5854 5855 } 5856 5857 /*---------------------------------------------------------------------------- 5858 | Returns 1 if the extended double-precision floating-point value `a' is 5859 | less than or equal to the corresponding value `b', and 0 otherwise. The 5860 | invalid exception is raised if either operand is a NaN. The comparison is 5861 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5862 | Arithmetic. 
5863 *----------------------------------------------------------------------------*/ 5864 5865 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 5866 { 5867 flag aSign, bSign; 5868 5869 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5870 || (extractFloatx80Exp(a) == 0x7FFF 5871 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5872 || (extractFloatx80Exp(b) == 0x7FFF 5873 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5874 ) { 5875 float_raise(float_flag_invalid, status); 5876 return 0; 5877 } 5878 aSign = extractFloatx80Sign( a ); 5879 bSign = extractFloatx80Sign( b ); 5880 if ( aSign != bSign ) { 5881 return 5882 aSign 5883 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5884 == 0 ); 5885 } 5886 return 5887 aSign ? le128( b.high, b.low, a.high, a.low ) 5888 : le128( a.high, a.low, b.high, b.low ); 5889 5890 } 5891 5892 /*---------------------------------------------------------------------------- 5893 | Returns 1 if the extended double-precision floating-point value `a' is 5894 | less than the corresponding value `b', and 0 otherwise. The invalid 5895 | exception is raised if either operand is a NaN. The comparison is performed 5896 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5897 *----------------------------------------------------------------------------*/ 5898 5899 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 5900 { 5901 flag aSign, bSign; 5902 5903 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5904 || (extractFloatx80Exp(a) == 0x7FFF 5905 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5906 || (extractFloatx80Exp(b) == 0x7FFF 5907 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5908 ) { 5909 float_raise(float_flag_invalid, status); 5910 return 0; 5911 } 5912 aSign = extractFloatx80Sign( a ); 5913 bSign = extractFloatx80Sign( b ); 5914 if ( aSign != bSign ) { 5915 return 5916 aSign 5917 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5918 != 0 ); 5919 } 5920 return 5921 aSign ? lt128( b.high, b.low, a.high, a.low ) 5922 : lt128( a.high, a.low, b.high, b.low ); 5923 5924 } 5925 5926 /*---------------------------------------------------------------------------- 5927 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5928 | cannot be compared, and 0 otherwise. The invalid exception is raised if 5929 | either operand is a NaN. The comparison is performed according to the 5930 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5931 *----------------------------------------------------------------------------*/ 5932 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 5933 { 5934 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5935 || (extractFloatx80Exp(a) == 0x7FFF 5936 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5937 || (extractFloatx80Exp(b) == 0x7FFF 5938 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5939 ) { 5940 float_raise(float_flag_invalid, status); 5941 return 1; 5942 } 5943 return 0; 5944 } 5945 5946 /*---------------------------------------------------------------------------- 5947 | Returns 1 if the extended double-precision floating-point value `a' is 5948 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5949 | cause an exception. The comparison is performed according to the IEC/IEEE 5950 | Standard for Binary Floating-Point Arithmetic. 5951 *----------------------------------------------------------------------------*/ 5952 5953 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 5954 { 5955 5956 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5957 float_raise(float_flag_invalid, status); 5958 return 0; 5959 } 5960 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5961 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5962 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5963 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5964 ) { 5965 if (floatx80_is_signaling_nan(a, status) 5966 || floatx80_is_signaling_nan(b, status)) { 5967 float_raise(float_flag_invalid, status); 5968 } 5969 return 0; 5970 } 5971 return 5972 ( a.low == b.low ) 5973 && ( ( a.high == b.high ) 5974 || ( ( a.low == 0 ) 5975 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5976 ); 5977 5978 } 5979 5980 /*---------------------------------------------------------------------------- 5981 | Returns 1 if the extended double-precision floating-point value `a' is less 5982 | than or equal to the corresponding value `b', and 0 
otherwise. Quiet NaNs 5983 | do not cause an exception. Otherwise, the comparison is performed according 5984 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5985 *----------------------------------------------------------------------------*/ 5986 5987 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status) 5988 { 5989 flag aSign, bSign; 5990 5991 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5992 float_raise(float_flag_invalid, status); 5993 return 0; 5994 } 5995 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5996 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5997 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5998 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5999 ) { 6000 if (floatx80_is_signaling_nan(a, status) 6001 || floatx80_is_signaling_nan(b, status)) { 6002 float_raise(float_flag_invalid, status); 6003 } 6004 return 0; 6005 } 6006 aSign = extractFloatx80Sign( a ); 6007 bSign = extractFloatx80Sign( b ); 6008 if ( aSign != bSign ) { 6009 return 6010 aSign 6011 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6012 == 0 ); 6013 } 6014 return 6015 aSign ? le128( b.high, b.low, a.high, a.low ) 6016 : le128( a.high, a.low, b.high, b.low ); 6017 6018 } 6019 6020 /*---------------------------------------------------------------------------- 6021 | Returns 1 if the extended double-precision floating-point value `a' is less 6022 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 6023 | an exception. Otherwise, the comparison is performed according to the 6024 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6025 *----------------------------------------------------------------------------*/ 6026 6027 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 6028 { 6029 flag aSign, bSign; 6030 6031 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6032 float_raise(float_flag_invalid, status); 6033 return 0; 6034 } 6035 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6036 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6037 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6038 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6039 ) { 6040 if (floatx80_is_signaling_nan(a, status) 6041 || floatx80_is_signaling_nan(b, status)) { 6042 float_raise(float_flag_invalid, status); 6043 } 6044 return 0; 6045 } 6046 aSign = extractFloatx80Sign( a ); 6047 bSign = extractFloatx80Sign( b ); 6048 if ( aSign != bSign ) { 6049 return 6050 aSign 6051 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6052 != 0 ); 6053 } 6054 return 6055 aSign ? lt128( b.high, b.low, a.high, a.low ) 6056 : lt128( a.high, a.low, b.high, b.low ); 6057 6058 } 6059 6060 /*---------------------------------------------------------------------------- 6061 | Returns 1 if the extended double-precision floating-point values `a' and `b' 6062 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 6063 | The comparison is performed according to the IEC/IEEE Standard for Binary 6064 | Floating-Point Arithmetic. 
6065 *----------------------------------------------------------------------------*/ 6066 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 6067 { 6068 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6069 float_raise(float_flag_invalid, status); 6070 return 1; 6071 } 6072 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6073 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6074 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6075 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6076 ) { 6077 if (floatx80_is_signaling_nan(a, status) 6078 || floatx80_is_signaling_nan(b, status)) { 6079 float_raise(float_flag_invalid, status); 6080 } 6081 return 1; 6082 } 6083 return 0; 6084 } 6085 6086 /*---------------------------------------------------------------------------- 6087 | Returns the result of converting the quadruple-precision floating-point 6088 | value `a' to the 32-bit two's complement integer format. The conversion 6089 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6090 | Arithmetic---which means in particular that the conversion is rounded 6091 | according to the current rounding mode. If `a' is a NaN, the largest 6092 | positive integer is returned. Otherwise, if the conversion overflows, the 6093 | largest integer with the same sign as `a' is returned. 
6094 *----------------------------------------------------------------------------*/ 6095 6096 int32_t float128_to_int32(float128 a, float_status *status) 6097 { 6098 flag aSign; 6099 int32_t aExp, shiftCount; 6100 uint64_t aSig0, aSig1; 6101 6102 aSig1 = extractFloat128Frac1( a ); 6103 aSig0 = extractFloat128Frac0( a ); 6104 aExp = extractFloat128Exp( a ); 6105 aSign = extractFloat128Sign( a ); 6106 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 6107 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6108 aSig0 |= ( aSig1 != 0 ); 6109 shiftCount = 0x4028 - aExp; 6110 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 6111 return roundAndPackInt32(aSign, aSig0, status); 6112 6113 } 6114 6115 /*---------------------------------------------------------------------------- 6116 | Returns the result of converting the quadruple-precision floating-point 6117 | value `a' to the 32-bit two's complement integer format. The conversion 6118 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6119 | Arithmetic, except that the conversion is always rounded toward zero. If 6120 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 6121 | conversion overflows, the largest integer with the same sign as `a' is 6122 | returned. 
6123 *----------------------------------------------------------------------------*/ 6124 6125 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 6126 { 6127 flag aSign; 6128 int32_t aExp, shiftCount; 6129 uint64_t aSig0, aSig1, savedASig; 6130 int32_t z; 6131 6132 aSig1 = extractFloat128Frac1( a ); 6133 aSig0 = extractFloat128Frac0( a ); 6134 aExp = extractFloat128Exp( a ); 6135 aSign = extractFloat128Sign( a ); 6136 aSig0 |= ( aSig1 != 0 ); 6137 if ( 0x401E < aExp ) { 6138 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 6139 goto invalid; 6140 } 6141 else if ( aExp < 0x3FFF ) { 6142 if (aExp || aSig0) { 6143 status->float_exception_flags |= float_flag_inexact; 6144 } 6145 return 0; 6146 } 6147 aSig0 |= LIT64( 0x0001000000000000 ); 6148 shiftCount = 0x402F - aExp; 6149 savedASig = aSig0; 6150 aSig0 >>= shiftCount; 6151 z = aSig0; 6152 if ( aSign ) z = - z; 6153 if ( ( z < 0 ) ^ aSign ) { 6154 invalid: 6155 float_raise(float_flag_invalid, status); 6156 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 6157 } 6158 if ( ( aSig0<<shiftCount ) != savedASig ) { 6159 status->float_exception_flags |= float_flag_inexact; 6160 } 6161 return z; 6162 6163 } 6164 6165 /*---------------------------------------------------------------------------- 6166 | Returns the result of converting the quadruple-precision floating-point 6167 | value `a' to the 64-bit two's complement integer format. The conversion 6168 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6169 | Arithmetic---which means in particular that the conversion is rounded 6170 | according to the current rounding mode. If `a' is a NaN, the largest 6171 | positive integer is returned. Otherwise, if the conversion overflows, the 6172 | largest integer with the same sign as `a' is returned. 
6173 *----------------------------------------------------------------------------*/ 6174 6175 int64_t float128_to_int64(float128 a, float_status *status) 6176 { 6177 flag aSign; 6178 int32_t aExp, shiftCount; 6179 uint64_t aSig0, aSig1; 6180 6181 aSig1 = extractFloat128Frac1( a ); 6182 aSig0 = extractFloat128Frac0( a ); 6183 aExp = extractFloat128Exp( a ); 6184 aSign = extractFloat128Sign( a ); 6185 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6186 shiftCount = 0x402F - aExp; 6187 if ( shiftCount <= 0 ) { 6188 if ( 0x403E < aExp ) { 6189 float_raise(float_flag_invalid, status); 6190 if ( ! aSign 6191 || ( ( aExp == 0x7FFF ) 6192 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 6193 ) 6194 ) { 6195 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6196 } 6197 return (int64_t) LIT64( 0x8000000000000000 ); 6198 } 6199 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 6200 } 6201 else { 6202 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 6203 } 6204 return roundAndPackInt64(aSign, aSig0, aSig1, status); 6205 6206 } 6207 6208 /*---------------------------------------------------------------------------- 6209 | Returns the result of converting the quadruple-precision floating-point 6210 | value `a' to the 64-bit two's complement integer format. The conversion 6211 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6212 | Arithmetic, except that the conversion is always rounded toward zero. 6213 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 6214 | the conversion overflows, the largest integer with the same sign as `a' is 6215 | returned. 
*----------------------------------------------------------------------------*/

int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;
    int64_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Make the implicit leading bit explicit for a nonzero exponent field. */
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
    /* Positive shiftCount: the integer part extends into the low word. */
    shiftCount = aExp - 0x402F;
    if ( 0 < shiftCount ) {
        if ( 0x403E <= aExp ) {
            /* Drop the leading bit again so aSig0 holds fraction bits only. */
            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
            /* Special case: sign set, exponent 0x403E, high fraction zero and
             * low fraction below 2^49.  This still truncates to the most
             * negative integer, so it is not an overflow; it is inexact only
             * if low fraction bits are set. */
            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
                if (aSig1) {
                    status->float_exception_flags |= float_flag_inexact;
                }
            }
            else {
                /* Overflow, infinity or NaN.  Positive inputs and NaNs return
                 * the largest positive integer; the rest fall through. */
                float_raise(float_flag_invalid, status);
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
                }
            }
            return (int64_t) LIT64( 0x8000000000000000 );
        }
        /* Here 0 < shiftCount < 64: take the integer bits from both words. */
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
        /* Low-word bits left below the integer point were truncated. */
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
            status->float_exception_flags |= float_flag_inexact;
        }
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Truncates to zero; inexact unless the input was exactly zero. */
            if ( aExp | aSig0 | aSig1 ) {
                status->float_exception_flags |= float_flag_inexact;
            }
            return 0;
        }
        /* The integer part lies entirely within the high word. */
        z = aSig0>>( - shiftCount );
        /* Inexact if the low word or any shifted-out high bits are nonzero. */
        if (    aSig1
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
            status->float_exception_flags |= float_flag_inexact;
        }
    }
    if ( aSign ) z = - z;
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point value
| `a' to the 64-bit unsigned integer format.
The conversion is 6274 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6275 | Arithmetic---which means in particular that the conversion is rounded 6276 | according to the current rounding mode. If `a' is a NaN, the largest 6277 | positive integer is returned. If the conversion overflows, the 6278 | largest unsigned integer is returned. If 'a' is negative, the value is 6279 | rounded and zero is returned; negative values that do not round to zero 6280 | will raise the inexact exception. 6281 *----------------------------------------------------------------------------*/ 6282 6283 uint64_t float128_to_uint64(float128 a, float_status *status) 6284 { 6285 flag aSign; 6286 int aExp; 6287 int shiftCount; 6288 uint64_t aSig0, aSig1; 6289 6290 aSig0 = extractFloat128Frac0(a); 6291 aSig1 = extractFloat128Frac1(a); 6292 aExp = extractFloat128Exp(a); 6293 aSign = extractFloat128Sign(a); 6294 if (aSign && (aExp > 0x3FFE)) { 6295 float_raise(float_flag_invalid, status); 6296 if (float128_is_any_nan(a)) { 6297 return LIT64(0xFFFFFFFFFFFFFFFF); 6298 } else { 6299 return 0; 6300 } 6301 } 6302 if (aExp) { 6303 aSig0 |= LIT64(0x0001000000000000); 6304 } 6305 shiftCount = 0x402F - aExp; 6306 if (shiftCount <= 0) { 6307 if (0x403E < aExp) { 6308 float_raise(float_flag_invalid, status); 6309 return LIT64(0xFFFFFFFFFFFFFFFF); 6310 } 6311 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 6312 } else { 6313 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 6314 } 6315 return roundAndPackUint64(aSign, aSig0, aSig1, status); 6316 } 6317 6318 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 6319 { 6320 uint64_t v; 6321 signed char current_rounding_mode = status->float_rounding_mode; 6322 6323 set_float_rounding_mode(float_round_to_zero, status); 6324 v = float128_to_uint64(a, status); 6325 set_float_rounding_mode(current_rounding_mode, status); 6326 6327 return v; 6328 } 6329 6330 
/*---------------------------------------------------------------------------- 6331 | Returns the result of converting the quadruple-precision floating-point 6332 | value `a' to the 32-bit unsigned integer format. The conversion 6333 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6334 | Arithmetic except that the conversion is always rounded toward zero. 6335 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6336 | if the conversion overflows, the largest unsigned integer is returned. 6337 | If 'a' is negative, the value is rounded and zero is returned; negative 6338 | values that do not round to zero will raise the inexact exception. 6339 *----------------------------------------------------------------------------*/ 6340 6341 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6342 { 6343 uint64_t v; 6344 uint32_t res; 6345 int old_exc_flags = get_float_exception_flags(status); 6346 6347 v = float128_to_uint64_round_to_zero(a, status); 6348 if (v > 0xffffffff) { 6349 res = 0xffffffff; 6350 } else { 6351 return v; 6352 } 6353 set_float_exception_flags(old_exc_flags, status); 6354 float_raise(float_flag_invalid, status); 6355 return res; 6356 } 6357 6358 /*---------------------------------------------------------------------------- 6359 | Returns the result of converting the quadruple-precision floating-point 6360 | value `a' to the single-precision floating-point format. The conversion 6361 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6362 | Arithmetic. 
6363 *----------------------------------------------------------------------------*/ 6364 6365 float32 float128_to_float32(float128 a, float_status *status) 6366 { 6367 flag aSign; 6368 int32_t aExp; 6369 uint64_t aSig0, aSig1; 6370 uint32_t zSig; 6371 6372 aSig1 = extractFloat128Frac1( a ); 6373 aSig0 = extractFloat128Frac0( a ); 6374 aExp = extractFloat128Exp( a ); 6375 aSign = extractFloat128Sign( a ); 6376 if ( aExp == 0x7FFF ) { 6377 if ( aSig0 | aSig1 ) { 6378 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6379 } 6380 return packFloat32( aSign, 0xFF, 0 ); 6381 } 6382 aSig0 |= ( aSig1 != 0 ); 6383 shift64RightJamming( aSig0, 18, &aSig0 ); 6384 zSig = aSig0; 6385 if ( aExp || zSig ) { 6386 zSig |= 0x40000000; 6387 aExp -= 0x3F81; 6388 } 6389 return roundAndPackFloat32(aSign, aExp, zSig, status); 6390 6391 } 6392 6393 /*---------------------------------------------------------------------------- 6394 | Returns the result of converting the quadruple-precision floating-point 6395 | value `a' to the double-precision floating-point format. The conversion 6396 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6397 | Arithmetic. 
6398 *----------------------------------------------------------------------------*/ 6399 6400 float64 float128_to_float64(float128 a, float_status *status) 6401 { 6402 flag aSign; 6403 int32_t aExp; 6404 uint64_t aSig0, aSig1; 6405 6406 aSig1 = extractFloat128Frac1( a ); 6407 aSig0 = extractFloat128Frac0( a ); 6408 aExp = extractFloat128Exp( a ); 6409 aSign = extractFloat128Sign( a ); 6410 if ( aExp == 0x7FFF ) { 6411 if ( aSig0 | aSig1 ) { 6412 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6413 } 6414 return packFloat64( aSign, 0x7FF, 0 ); 6415 } 6416 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6417 aSig0 |= ( aSig1 != 0 ); 6418 if ( aExp || aSig0 ) { 6419 aSig0 |= LIT64( 0x4000000000000000 ); 6420 aExp -= 0x3C01; 6421 } 6422 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6423 6424 } 6425 6426 /*---------------------------------------------------------------------------- 6427 | Returns the result of converting the quadruple-precision floating-point 6428 | value `a' to the extended double-precision floating-point format. The 6429 | conversion is performed according to the IEC/IEEE Standard for Binary 6430 | Floating-Point Arithmetic. 
6431 *----------------------------------------------------------------------------*/ 6432 6433 floatx80 float128_to_floatx80(float128 a, float_status *status) 6434 { 6435 flag aSign; 6436 int32_t aExp; 6437 uint64_t aSig0, aSig1; 6438 6439 aSig1 = extractFloat128Frac1( a ); 6440 aSig0 = extractFloat128Frac0( a ); 6441 aExp = extractFloat128Exp( a ); 6442 aSign = extractFloat128Sign( a ); 6443 if ( aExp == 0x7FFF ) { 6444 if ( aSig0 | aSig1 ) { 6445 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 6446 } 6447 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 6448 } 6449 if ( aExp == 0 ) { 6450 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6451 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6452 } 6453 else { 6454 aSig0 |= LIT64( 0x0001000000000000 ); 6455 } 6456 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6457 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6458 6459 } 6460 6461 /*---------------------------------------------------------------------------- 6462 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6463 | returns the result as a quadruple-precision floating-point value. The 6464 | operation is performed according to the IEC/IEEE Standard for Binary 6465 | Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float128 float128_round_to_int(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    aExp = extractFloat128Exp( a );
    /* Case 1: exponent field >= 0x402F.  Any bits to be rounded away are in
     * the low 64-bit word (z.low is what gets masked below). */
    if ( 0x402F <= aExp ) {
        if ( 0x406F <= aExp ) {
            /* Nothing to round: NaNs are propagated, everything else is
             * returned unchanged. */
            if (    ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            return a;
        }
        /* lastBitMask marks the lowest bit kept in z.low.  The shift is done
         * in two steps so that aExp == 0x402F (a total shift of 64) yields 0
         * instead of an out-of-range shift; 0 means the lowest kept bit is
         * in z.high. */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                /* Add half of the last kept bit; on an exact tie clear that
                 * bit so the result is even. */
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /* All of z.low is discarded: round up when its top bit is
                 * set, and force an even z.high on an exact tie. */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            /* As above, but without the tie-to-even correction. */
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            /* Only non-negative values are bumped before truncation. */
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            /* Only negative values are bumped before truncation. */
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        z.low &= ~ roundBitsMask;
    }
    else {
        /* Case 2: exponent field below 0x3FFF.  The result is a signed zero
         * or a value packed with exponent 0x3FFF and a zero fraction. */
        if ( aExp < 0x3FFF ) {
            /* Zeros of either sign are returned unchanged, without flags. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            status->float_exception_flags |= float_flag_inexact;
            aSign = extractFloat128Sign( a );
            /* No default case: modes not listed here (for example
             * float_round_to_zero) fall through to the signed zero below. */
            switch (status->float_rounding_mode) {
             case float_round_nearest_even:
                /* Exponent 0x3FFE with a nonzero fraction rounds away from
                 * zero; a zero fraction (the tie) falls through to zero. */
                if (    ( aExp == 0x3FFE )
                     && (   extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                /* Any value with exponent 0x3FFE rounds away from zero. */
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
             case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
             case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );
            }
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* Case 3: 0x3FFF <= exponent field < 0x402F.  The lowest kept bit is
         * in the high word; the whole low word is discarded (z.low = 0). */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            /* Exact tie (no discarded bits left, low word zero): make even. */
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                /* A nonzero low word counts as a discarded fraction bit. */
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                /* A nonzero low word counts as a discarded fraction bit. */
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        z.high &= ~ roundBitsMask;
    }
    /* Any change to the value means the rounding was inexact. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the quadruple-precision
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
| before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
6605 *----------------------------------------------------------------------------*/ 6606 6607 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign, 6608 float_status *status) 6609 { 6610 int32_t aExp, bExp, zExp; 6611 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6612 int32_t expDiff; 6613 6614 aSig1 = extractFloat128Frac1( a ); 6615 aSig0 = extractFloat128Frac0( a ); 6616 aExp = extractFloat128Exp( a ); 6617 bSig1 = extractFloat128Frac1( b ); 6618 bSig0 = extractFloat128Frac0( b ); 6619 bExp = extractFloat128Exp( b ); 6620 expDiff = aExp - bExp; 6621 if ( 0 < expDiff ) { 6622 if ( aExp == 0x7FFF ) { 6623 if (aSig0 | aSig1) { 6624 return propagateFloat128NaN(a, b, status); 6625 } 6626 return a; 6627 } 6628 if ( bExp == 0 ) { 6629 --expDiff; 6630 } 6631 else { 6632 bSig0 |= LIT64( 0x0001000000000000 ); 6633 } 6634 shift128ExtraRightJamming( 6635 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 6636 zExp = aExp; 6637 } 6638 else if ( expDiff < 0 ) { 6639 if ( bExp == 0x7FFF ) { 6640 if (bSig0 | bSig1) { 6641 return propagateFloat128NaN(a, b, status); 6642 } 6643 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6644 } 6645 if ( aExp == 0 ) { 6646 ++expDiff; 6647 } 6648 else { 6649 aSig0 |= LIT64( 0x0001000000000000 ); 6650 } 6651 shift128ExtraRightJamming( 6652 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 6653 zExp = bExp; 6654 } 6655 else { 6656 if ( aExp == 0x7FFF ) { 6657 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6658 return propagateFloat128NaN(a, b, status); 6659 } 6660 return a; 6661 } 6662 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6663 if ( aExp == 0 ) { 6664 if (status->flush_to_zero) { 6665 if (zSig0 | zSig1) { 6666 float_raise(float_flag_output_denormal, status); 6667 } 6668 return packFloat128(zSign, 0, 0, 0); 6669 } 6670 return packFloat128( zSign, 0, zSig0, zSig1 ); 6671 } 6672 zSig2 = 0; 6673 zSig0 |= LIT64( 0x0002000000000000 ); 6674 zExp = aExp; 6675 goto shiftRight1; 6676 } 6677 aSig0 |= LIT64( 
0x0001000000000000 ); 6678 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6679 --zExp; 6680 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 6681 ++zExp; 6682 shiftRight1: 6683 shift128ExtraRightJamming( 6684 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6685 roundAndPack: 6686 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6687 6688 } 6689 6690 /*---------------------------------------------------------------------------- 6691 | Returns the result of subtracting the absolute values of the quadruple- 6692 | precision floating-point values `a' and `b'. If `zSign' is 1, the 6693 | difference is negated before being returned. `zSign' is ignored if the 6694 | result is a NaN. The subtraction is performed according to the IEC/IEEE 6695 | Standard for Binary Floating-Point Arithmetic. 6696 *----------------------------------------------------------------------------*/ 6697 6698 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign, 6699 float_status *status) 6700 { 6701 int32_t aExp, bExp, zExp; 6702 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 6703 int32_t expDiff; 6704 6705 aSig1 = extractFloat128Frac1( a ); 6706 aSig0 = extractFloat128Frac0( a ); 6707 aExp = extractFloat128Exp( a ); 6708 bSig1 = extractFloat128Frac1( b ); 6709 bSig0 = extractFloat128Frac0( b ); 6710 bExp = extractFloat128Exp( b ); 6711 expDiff = aExp - bExp; 6712 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6713 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 6714 if ( 0 < expDiff ) goto aExpBigger; 6715 if ( expDiff < 0 ) goto bExpBigger; 6716 if ( aExp == 0x7FFF ) { 6717 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6718 return propagateFloat128NaN(a, b, status); 6719 } 6720 float_raise(float_flag_invalid, status); 6721 return float128_default_nan(status); 6722 } 6723 if ( aExp == 0 ) { 6724 aExp = 1; 6725 bExp = 1; 6726 } 6727 if ( bSig0 < aSig0 ) goto aBigger; 6728 if ( aSig0 < bSig0 ) goto bBigger; 6729 if ( bSig1 < 
aSig1 ) goto aBigger; 6730 if ( aSig1 < bSig1 ) goto bBigger; 6731 return packFloat128(status->float_rounding_mode == float_round_down, 6732 0, 0, 0); 6733 bExpBigger: 6734 if ( bExp == 0x7FFF ) { 6735 if (bSig0 | bSig1) { 6736 return propagateFloat128NaN(a, b, status); 6737 } 6738 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 6739 } 6740 if ( aExp == 0 ) { 6741 ++expDiff; 6742 } 6743 else { 6744 aSig0 |= LIT64( 0x4000000000000000 ); 6745 } 6746 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6747 bSig0 |= LIT64( 0x4000000000000000 ); 6748 bBigger: 6749 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6750 zExp = bExp; 6751 zSign ^= 1; 6752 goto normalizeRoundAndPack; 6753 aExpBigger: 6754 if ( aExp == 0x7FFF ) { 6755 if (aSig0 | aSig1) { 6756 return propagateFloat128NaN(a, b, status); 6757 } 6758 return a; 6759 } 6760 if ( bExp == 0 ) { 6761 --expDiff; 6762 } 6763 else { 6764 bSig0 |= LIT64( 0x4000000000000000 ); 6765 } 6766 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 6767 aSig0 |= LIT64( 0x4000000000000000 ); 6768 aBigger: 6769 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6770 zExp = aExp; 6771 normalizeRoundAndPack: 6772 --zExp; 6773 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, 6774 status); 6775 6776 } 6777 6778 /*---------------------------------------------------------------------------- 6779 | Returns the result of adding the quadruple-precision floating-point values 6780 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 6781 | for Binary Floating-Point Arithmetic. 
6782 *----------------------------------------------------------------------------*/ 6783 6784 float128 float128_add(float128 a, float128 b, float_status *status) 6785 { 6786 flag aSign, bSign; 6787 6788 aSign = extractFloat128Sign( a ); 6789 bSign = extractFloat128Sign( b ); 6790 if ( aSign == bSign ) { 6791 return addFloat128Sigs(a, b, aSign, status); 6792 } 6793 else { 6794 return subFloat128Sigs(a, b, aSign, status); 6795 } 6796 6797 } 6798 6799 /*---------------------------------------------------------------------------- 6800 | Returns the result of subtracting the quadruple-precision floating-point 6801 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6802 | Standard for Binary Floating-Point Arithmetic. 6803 *----------------------------------------------------------------------------*/ 6804 6805 float128 float128_sub(float128 a, float128 b, float_status *status) 6806 { 6807 flag aSign, bSign; 6808 6809 aSign = extractFloat128Sign( a ); 6810 bSign = extractFloat128Sign( b ); 6811 if ( aSign == bSign ) { 6812 return subFloat128Sigs(a, b, aSign, status); 6813 } 6814 else { 6815 return addFloat128Sigs(a, b, aSign, status); 6816 } 6817 6818 } 6819 6820 /*---------------------------------------------------------------------------- 6821 | Returns the result of multiplying the quadruple-precision floating-point 6822 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6823 | Standard for Binary Floating-Point Arithmetic. 
6824 *----------------------------------------------------------------------------*/ 6825 6826 float128 float128_mul(float128 a, float128 b, float_status *status) 6827 { 6828 flag aSign, bSign, zSign; 6829 int32_t aExp, bExp, zExp; 6830 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 6831 6832 aSig1 = extractFloat128Frac1( a ); 6833 aSig0 = extractFloat128Frac0( a ); 6834 aExp = extractFloat128Exp( a ); 6835 aSign = extractFloat128Sign( a ); 6836 bSig1 = extractFloat128Frac1( b ); 6837 bSig0 = extractFloat128Frac0( b ); 6838 bExp = extractFloat128Exp( b ); 6839 bSign = extractFloat128Sign( b ); 6840 zSign = aSign ^ bSign; 6841 if ( aExp == 0x7FFF ) { 6842 if ( ( aSig0 | aSig1 ) 6843 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6844 return propagateFloat128NaN(a, b, status); 6845 } 6846 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 6847 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6848 } 6849 if ( bExp == 0x7FFF ) { 6850 if (bSig0 | bSig1) { 6851 return propagateFloat128NaN(a, b, status); 6852 } 6853 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6854 invalid: 6855 float_raise(float_flag_invalid, status); 6856 return float128_default_nan(status); 6857 } 6858 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6859 } 6860 if ( aExp == 0 ) { 6861 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6862 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6863 } 6864 if ( bExp == 0 ) { 6865 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6866 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6867 } 6868 zExp = aExp + bExp - 0x4000; 6869 aSig0 |= LIT64( 0x0001000000000000 ); 6870 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 6871 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 6872 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6873 zSig2 |= ( zSig3 != 0 ); 6874 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 6875 shift128ExtraRightJamming( 6876 
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6877 ++zExp; 6878 } 6879 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6880 6881 } 6882 6883 /*---------------------------------------------------------------------------- 6884 | Returns the result of dividing the quadruple-precision floating-point value 6885 | `a' by the corresponding value `b'. The operation is performed according to 6886 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6887 *----------------------------------------------------------------------------*/ 6888 6889 float128 float128_div(float128 a, float128 b, float_status *status) 6890 { 6891 flag aSign, bSign, zSign; 6892 int32_t aExp, bExp, zExp; 6893 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6894 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6895 6896 aSig1 = extractFloat128Frac1( a ); 6897 aSig0 = extractFloat128Frac0( a ); 6898 aExp = extractFloat128Exp( a ); 6899 aSign = extractFloat128Sign( a ); 6900 bSig1 = extractFloat128Frac1( b ); 6901 bSig0 = extractFloat128Frac0( b ); 6902 bExp = extractFloat128Exp( b ); 6903 bSign = extractFloat128Sign( b ); 6904 zSign = aSign ^ bSign; 6905 if ( aExp == 0x7FFF ) { 6906 if (aSig0 | aSig1) { 6907 return propagateFloat128NaN(a, b, status); 6908 } 6909 if ( bExp == 0x7FFF ) { 6910 if (bSig0 | bSig1) { 6911 return propagateFloat128NaN(a, b, status); 6912 } 6913 goto invalid; 6914 } 6915 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6916 } 6917 if ( bExp == 0x7FFF ) { 6918 if (bSig0 | bSig1) { 6919 return propagateFloat128NaN(a, b, status); 6920 } 6921 return packFloat128( zSign, 0, 0, 0 ); 6922 } 6923 if ( bExp == 0 ) { 6924 if ( ( bSig0 | bSig1 ) == 0 ) { 6925 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6926 invalid: 6927 float_raise(float_flag_invalid, status); 6928 return float128_default_nan(status); 6929 } 6930 float_raise(float_flag_divbyzero, status); 6931 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6932 } 6933 normalizeFloat128Subnormal( 
bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6934 } 6935 if ( aExp == 0 ) { 6936 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6937 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6938 } 6939 zExp = aExp - bExp + 0x3FFD; 6940 shortShift128Left( 6941 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 6942 shortShift128Left( 6943 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6944 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 6945 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 6946 ++zExp; 6947 } 6948 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6949 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 6950 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 6951 while ( (int64_t) rem0 < 0 ) { 6952 --zSig0; 6953 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 6954 } 6955 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 6956 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 6957 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 6958 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 6959 while ( (int64_t) rem1 < 0 ) { 6960 --zSig1; 6961 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 6962 } 6963 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6964 } 6965 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 6966 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6967 6968 } 6969 6970 /*---------------------------------------------------------------------------- 6971 | Returns the remainder of the quadruple-precision floating-point value `a' 6972 | with respect to the corresponding value `b'. The operation is performed 6973 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6974 *----------------------------------------------------------------------------*/ 6975 6976 float128 float128_rem(float128 a, float128 b, float_status *status) 6977 { 6978 flag aSign, zSign; 6979 int32_t aExp, bExp, expDiff; 6980 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 6981 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 6982 int64_t sigMean0; 6983 6984 aSig1 = extractFloat128Frac1( a ); 6985 aSig0 = extractFloat128Frac0( a ); 6986 aExp = extractFloat128Exp( a ); 6987 aSign = extractFloat128Sign( a ); 6988 bSig1 = extractFloat128Frac1( b ); 6989 bSig0 = extractFloat128Frac0( b ); 6990 bExp = extractFloat128Exp( b ); 6991 if ( aExp == 0x7FFF ) { 6992 if ( ( aSig0 | aSig1 ) 6993 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6994 return propagateFloat128NaN(a, b, status); 6995 } 6996 goto invalid; 6997 } 6998 if ( bExp == 0x7FFF ) { 6999 if (bSig0 | bSig1) { 7000 return propagateFloat128NaN(a, b, status); 7001 } 7002 return a; 7003 } 7004 if ( bExp == 0 ) { 7005 if ( ( bSig0 | bSig1 ) == 0 ) { 7006 invalid: 7007 float_raise(float_flag_invalid, status); 7008 return float128_default_nan(status); 7009 } 7010 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 7011 } 7012 if ( aExp == 0 ) { 7013 if ( ( aSig0 | aSig1 ) == 0 ) return a; 7014 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7015 } 7016 expDiff = aExp - bExp; 7017 if ( expDiff < -1 ) return a; 7018 shortShift128Left( 7019 aSig0 | LIT64( 0x0001000000000000 ), 7020 aSig1, 7021 15 - ( expDiff < 0 ), 7022 &aSig0, 7023 &aSig1 7024 ); 7025 shortShift128Left( 7026 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 7027 q = le128( bSig0, bSig1, aSig0, aSig1 ); 7028 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 7029 expDiff -= 64; 7030 while ( 0 < expDiff ) { 7031 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7032 q = ( 4 < q ) ? 
q - 4 : 0; 7033 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 7034 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 7035 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 7036 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 7037 expDiff -= 61; 7038 } 7039 if ( -64 < expDiff ) { 7040 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7041 q = ( 4 < q ) ? q - 4 : 0; 7042 q >>= - expDiff; 7043 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 7044 expDiff += 52; 7045 if ( expDiff < 0 ) { 7046 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 7047 } 7048 else { 7049 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 7050 } 7051 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 7052 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 7053 } 7054 else { 7055 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 7056 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 7057 } 7058 do { 7059 alternateASig0 = aSig0; 7060 alternateASig1 = aSig1; 7061 ++q; 7062 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 7063 } while ( 0 <= (int64_t) aSig0 ); 7064 add128( 7065 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 7066 if ( ( sigMean0 < 0 ) 7067 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 7068 aSig0 = alternateASig0; 7069 aSig1 = alternateASig1; 7070 } 7071 zSign = ( (int64_t) aSig0 < 0 ); 7072 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 7073 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 7074 status); 7075 } 7076 7077 /*---------------------------------------------------------------------------- 7078 | Returns the square root of the quadruple-precision floating-point value `a'. 7079 | The operation is performed according to the IEC/IEEE Standard for Binary 7080 | Floating-Point Arithmetic. 
7081 *----------------------------------------------------------------------------*/ 7082 7083 float128 float128_sqrt(float128 a, float_status *status) 7084 { 7085 flag aSign; 7086 int32_t aExp, zExp; 7087 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 7088 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 7089 7090 aSig1 = extractFloat128Frac1( a ); 7091 aSig0 = extractFloat128Frac0( a ); 7092 aExp = extractFloat128Exp( a ); 7093 aSign = extractFloat128Sign( a ); 7094 if ( aExp == 0x7FFF ) { 7095 if (aSig0 | aSig1) { 7096 return propagateFloat128NaN(a, a, status); 7097 } 7098 if ( ! aSign ) return a; 7099 goto invalid; 7100 } 7101 if ( aSign ) { 7102 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 7103 invalid: 7104 float_raise(float_flag_invalid, status); 7105 return float128_default_nan(status); 7106 } 7107 if ( aExp == 0 ) { 7108 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 7109 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7110 } 7111 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 7112 aSig0 |= LIT64( 0x0001000000000000 ); 7113 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 7114 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 7115 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 7116 doubleZSig0 = zSig0<<1; 7117 mul64To128( zSig0, zSig0, &term0, &term1 ); 7118 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 7119 while ( (int64_t) rem0 < 0 ) { 7120 --zSig0; 7121 doubleZSig0 -= 2; 7122 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 7123 } 7124 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 7125 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 7126 if ( zSig1 == 0 ) zSig1 = 1; 7127 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 7128 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 7129 mul64To128( zSig1, zSig1, &term2, &term3 ); 7130 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 7131 while ( (int64_t) rem1 < 0 ) { 7132 --zSig1; 7133 
shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 7134 term3 |= 1; 7135 term2 |= doubleZSig0; 7136 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 7137 } 7138 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 7139 } 7140 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 7141 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 7142 7143 } 7144 7145 /*---------------------------------------------------------------------------- 7146 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 7147 | the corresponding value `b', and 0 otherwise. The invalid exception is 7148 | raised if either operand is a NaN. Otherwise, the comparison is performed 7149 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7150 *----------------------------------------------------------------------------*/ 7151 7152 int float128_eq(float128 a, float128 b, float_status *status) 7153 { 7154 7155 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7156 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7157 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7158 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7159 ) { 7160 float_raise(float_flag_invalid, status); 7161 return 0; 7162 } 7163 return 7164 ( a.low == b.low ) 7165 && ( ( a.high == b.high ) 7166 || ( ( a.low == 0 ) 7167 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 7168 ); 7169 7170 } 7171 7172 /*---------------------------------------------------------------------------- 7173 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7174 | or equal to the corresponding value `b', and 0 otherwise. The invalid 7175 | exception is raised if either operand is a NaN. The comparison is performed 7176 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7177 *----------------------------------------------------------------------------*/ 7178 7179 int float128_le(float128 a, float128 b, float_status *status) 7180 { 7181 flag aSign, bSign; 7182 7183 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7184 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7185 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7186 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7187 ) { 7188 float_raise(float_flag_invalid, status); 7189 return 0; 7190 } 7191 aSign = extractFloat128Sign( a ); 7192 bSign = extractFloat128Sign( b ); 7193 if ( aSign != bSign ) { 7194 return 7195 aSign 7196 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7197 == 0 ); 7198 } 7199 return 7200 aSign ? le128( b.high, b.low, a.high, a.low ) 7201 : le128( a.high, a.low, b.high, b.low ); 7202 7203 } 7204 7205 /*---------------------------------------------------------------------------- 7206 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7207 | the corresponding value `b', and 0 otherwise. The invalid exception is 7208 | raised if either operand is a NaN. The comparison is performed according 7209 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7210 *----------------------------------------------------------------------------*/ 7211 7212 int float128_lt(float128 a, float128 b, float_status *status) 7213 { 7214 flag aSign, bSign; 7215 7216 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7217 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7218 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7219 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7220 ) { 7221 float_raise(float_flag_invalid, status); 7222 return 0; 7223 } 7224 aSign = extractFloat128Sign( a ); 7225 bSign = extractFloat128Sign( b ); 7226 if ( aSign != bSign ) { 7227 return 7228 aSign 7229 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7230 != 0 ); 7231 } 7232 return 7233 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 7234 : lt128( a.high, a.low, b.high, b.low ); 7235 7236 } 7237 7238 /*---------------------------------------------------------------------------- 7239 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7240 | be compared, and 0 otherwise. The invalid exception is raised if either 7241 | operand is a NaN. The comparison is performed according to the IEC/IEEE 7242 | Standard for Binary Floating-Point Arithmetic. 7243 *----------------------------------------------------------------------------*/ 7244 7245 int float128_unordered(float128 a, float128 b, float_status *status) 7246 { 7247 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7248 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7249 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7250 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7251 ) { 7252 float_raise(float_flag_invalid, status); 7253 return 1; 7254 } 7255 return 0; 7256 } 7257 7258 /*---------------------------------------------------------------------------- 7259 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 7260 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7261 | exception. The comparison is performed according to the IEC/IEEE Standard 7262 | for Binary Floating-Point Arithmetic. 
7263 *----------------------------------------------------------------------------*/ 7264 7265 int float128_eq_quiet(float128 a, float128 b, float_status *status) 7266 { 7267 7268 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7269 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7270 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7271 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7272 ) { 7273 if (float128_is_signaling_nan(a, status) 7274 || float128_is_signaling_nan(b, status)) { 7275 float_raise(float_flag_invalid, status); 7276 } 7277 return 0; 7278 } 7279 return 7280 ( a.low == b.low ) 7281 && ( ( a.high == b.high ) 7282 || ( ( a.low == 0 ) 7283 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 7284 ); 7285 7286 } 7287 7288 /*---------------------------------------------------------------------------- 7289 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7290 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 7291 | cause an exception. Otherwise, the comparison is performed according to the 7292 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7293 *----------------------------------------------------------------------------*/ 7294 7295 int float128_le_quiet(float128 a, float128 b, float_status *status) 7296 { 7297 flag aSign, bSign; 7298 7299 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7300 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7301 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7302 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7303 ) { 7304 if (float128_is_signaling_nan(a, status) 7305 || float128_is_signaling_nan(b, status)) { 7306 float_raise(float_flag_invalid, status); 7307 } 7308 return 0; 7309 } 7310 aSign = extractFloat128Sign( a ); 7311 bSign = extractFloat128Sign( b ); 7312 if ( aSign != bSign ) { 7313 return 7314 aSign 7315 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7316 == 0 ); 7317 } 7318 return 7319 aSign ? le128( b.high, b.low, a.high, a.low ) 7320 : le128( a.high, a.low, b.high, b.low ); 7321 7322 } 7323 7324 /*---------------------------------------------------------------------------- 7325 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7326 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7327 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 7328 | Standard for Binary Floating-Point Arithmetic. 
7329 *----------------------------------------------------------------------------*/ 7330 7331 int float128_lt_quiet(float128 a, float128 b, float_status *status) 7332 { 7333 flag aSign, bSign; 7334 7335 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7336 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7337 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7338 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7339 ) { 7340 if (float128_is_signaling_nan(a, status) 7341 || float128_is_signaling_nan(b, status)) { 7342 float_raise(float_flag_invalid, status); 7343 } 7344 return 0; 7345 } 7346 aSign = extractFloat128Sign( a ); 7347 bSign = extractFloat128Sign( b ); 7348 if ( aSign != bSign ) { 7349 return 7350 aSign 7351 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7352 != 0 ); 7353 } 7354 return 7355 aSign ? lt128( b.high, b.low, a.high, a.low ) 7356 : lt128( a.high, a.low, b.high, b.low ); 7357 7358 } 7359 7360 /*---------------------------------------------------------------------------- 7361 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7362 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 7363 | comparison is performed according to the IEC/IEEE Standard for Binary 7364 | Floating-Point Arithmetic. 
7365 *----------------------------------------------------------------------------*/ 7366 7367 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 7368 { 7369 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7370 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7371 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7372 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7373 ) { 7374 if (float128_is_signaling_nan(a, status) 7375 || float128_is_signaling_nan(b, status)) { 7376 float_raise(float_flag_invalid, status); 7377 } 7378 return 1; 7379 } 7380 return 0; 7381 } 7382 7383 /* misc functions */ 7384 float32 uint32_to_float32(uint32_t a, float_status *status) 7385 { 7386 return int64_to_float32(a, status); 7387 } 7388 7389 float64 uint32_to_float64(uint32_t a, float_status *status) 7390 { 7391 return int64_to_float64(a, status); 7392 } 7393 7394 uint32_t float32_to_uint32(float32 a, float_status *status) 7395 { 7396 int64_t v; 7397 uint32_t res; 7398 int old_exc_flags = get_float_exception_flags(status); 7399 7400 v = float32_to_int64(a, status); 7401 if (v < 0) { 7402 res = 0; 7403 } else if (v > 0xffffffff) { 7404 res = 0xffffffff; 7405 } else { 7406 return v; 7407 } 7408 set_float_exception_flags(old_exc_flags, status); 7409 float_raise(float_flag_invalid, status); 7410 return res; 7411 } 7412 7413 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status) 7414 { 7415 int64_t v; 7416 uint32_t res; 7417 int old_exc_flags = get_float_exception_flags(status); 7418 7419 v = float32_to_int64_round_to_zero(a, status); 7420 if (v < 0) { 7421 res = 0; 7422 } else if (v > 0xffffffff) { 7423 res = 0xffffffff; 7424 } else { 7425 return v; 7426 } 7427 set_float_exception_flags(old_exc_flags, status); 7428 float_raise(float_flag_invalid, status); 7429 return res; 7430 } 7431 7432 int16_t float32_to_int16(float32 a, float_status *status) 7433 { 7434 int32_t v; 7435 int16_t res; 7436 int old_exc_flags = 
get_float_exception_flags(status); 7437 7438 v = float32_to_int32(a, status); 7439 if (v < -0x8000) { 7440 res = -0x8000; 7441 } else if (v > 0x7fff) { 7442 res = 0x7fff; 7443 } else { 7444 return v; 7445 } 7446 7447 set_float_exception_flags(old_exc_flags, status); 7448 float_raise(float_flag_invalid, status); 7449 return res; 7450 } 7451 7452 uint16_t float32_to_uint16(float32 a, float_status *status) 7453 { 7454 int32_t v; 7455 uint16_t res; 7456 int old_exc_flags = get_float_exception_flags(status); 7457 7458 v = float32_to_int32(a, status); 7459 if (v < 0) { 7460 res = 0; 7461 } else if (v > 0xffff) { 7462 res = 0xffff; 7463 } else { 7464 return v; 7465 } 7466 7467 set_float_exception_flags(old_exc_flags, status); 7468 float_raise(float_flag_invalid, status); 7469 return res; 7470 } 7471 7472 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *status) 7473 { 7474 int64_t v; 7475 uint16_t res; 7476 int old_exc_flags = get_float_exception_flags(status); 7477 7478 v = float32_to_int64_round_to_zero(a, status); 7479 if (v < 0) { 7480 res = 0; 7481 } else if (v > 0xffff) { 7482 res = 0xffff; 7483 } else { 7484 return v; 7485 } 7486 set_float_exception_flags(old_exc_flags, status); 7487 float_raise(float_flag_invalid, status); 7488 return res; 7489 } 7490 7491 uint32_t float64_to_uint32(float64 a, float_status *status) 7492 { 7493 uint64_t v; 7494 uint32_t res; 7495 int old_exc_flags = get_float_exception_flags(status); 7496 7497 v = float64_to_uint64(a, status); 7498 if (v > 0xffffffff) { 7499 res = 0xffffffff; 7500 } else { 7501 return v; 7502 } 7503 set_float_exception_flags(old_exc_flags, status); 7504 float_raise(float_flag_invalid, status); 7505 return res; 7506 } 7507 7508 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status) 7509 { 7510 uint64_t v; 7511 uint32_t res; 7512 int old_exc_flags = get_float_exception_flags(status); 7513 7514 v = float64_to_uint64_round_to_zero(a, status); 7515 if (v > 0xffffffff) { 7516 res = 
0xffffffff; 7517 } else { 7518 return v; 7519 } 7520 set_float_exception_flags(old_exc_flags, status); 7521 float_raise(float_flag_invalid, status); 7522 return res; 7523 } 7524 7525 int16_t float64_to_int16(float64 a, float_status *status) 7526 { 7527 int64_t v; 7528 int16_t res; 7529 int old_exc_flags = get_float_exception_flags(status); 7530 7531 v = float64_to_int32(a, status); 7532 if (v < -0x8000) { 7533 res = -0x8000; 7534 } else if (v > 0x7fff) { 7535 res = 0x7fff; 7536 } else { 7537 return v; 7538 } 7539 7540 set_float_exception_flags(old_exc_flags, status); 7541 float_raise(float_flag_invalid, status); 7542 return res; 7543 } 7544 7545 uint16_t float64_to_uint16(float64 a, float_status *status) 7546 { 7547 int64_t v; 7548 uint16_t res; 7549 int old_exc_flags = get_float_exception_flags(status); 7550 7551 v = float64_to_int32(a, status); 7552 if (v < 0) { 7553 res = 0; 7554 } else if (v > 0xffff) { 7555 res = 0xffff; 7556 } else { 7557 return v; 7558 } 7559 7560 set_float_exception_flags(old_exc_flags, status); 7561 float_raise(float_flag_invalid, status); 7562 return res; 7563 } 7564 7565 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *status) 7566 { 7567 int64_t v; 7568 uint16_t res; 7569 int old_exc_flags = get_float_exception_flags(status); 7570 7571 v = float64_to_int64_round_to_zero(a, status); 7572 if (v < 0) { 7573 res = 0; 7574 } else if (v > 0xffff) { 7575 res = 0xffff; 7576 } else { 7577 return v; 7578 } 7579 set_float_exception_flags(old_exc_flags, status); 7580 float_raise(float_flag_invalid, status); 7581 return res; 7582 } 7583 7584 /*---------------------------------------------------------------------------- 7585 | Returns the result of converting the double-precision floating-point value 7586 | `a' to the 64-bit unsigned integer format. 
The conversion is 7587 | performed according to the IEC/IEEE Standard for Binary Floating-Point 7588 | Arithmetic---which means in particular that the conversion is rounded 7589 | according to the current rounding mode. If `a' is a NaN, the largest 7590 | positive integer is returned. If the conversion overflows, the 7591 | largest unsigned integer is returned. If 'a' is negative, the value is 7592 | rounded and zero is returned; negative values that do not round to zero 7593 | will raise the inexact exception. 7594 *----------------------------------------------------------------------------*/ 7595 7596 uint64_t float64_to_uint64(float64 a, float_status *status) 7597 { 7598 flag aSign; 7599 int aExp; 7600 int shiftCount; 7601 uint64_t aSig, aSigExtra; 7602 a = float64_squash_input_denormal(a, status); 7603 7604 aSig = extractFloat64Frac(a); 7605 aExp = extractFloat64Exp(a); 7606 aSign = extractFloat64Sign(a); 7607 if (aSign && (aExp > 1022)) { 7608 float_raise(float_flag_invalid, status); 7609 if (float64_is_any_nan(a)) { 7610 return LIT64(0xFFFFFFFFFFFFFFFF); 7611 } else { 7612 return 0; 7613 } 7614 } 7615 if (aExp) { 7616 aSig |= LIT64(0x0010000000000000); 7617 } 7618 shiftCount = 0x433 - aExp; 7619 if (shiftCount <= 0) { 7620 if (0x43E < aExp) { 7621 float_raise(float_flag_invalid, status); 7622 return LIT64(0xFFFFFFFFFFFFFFFF); 7623 } 7624 aSigExtra = 0; 7625 aSig <<= -shiftCount; 7626 } else { 7627 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra); 7628 } 7629 return roundAndPackUint64(aSign, aSig, aSigExtra, status); 7630 } 7631 7632 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status) 7633 { 7634 signed char current_rounding_mode = status->float_rounding_mode; 7635 set_float_rounding_mode(float_round_to_zero, status); 7636 uint64_t v = float64_to_uint64(a, status); 7637 set_float_rounding_mode(current_rounding_mode, status); 7638 return v; 7639 } 7640 7641 #define COMPARE(s, nan_exp) \ 7642 static inline int float ## s ## 
_compare_internal(float ## s a, float ## s b,\ 7643 int is_quiet, float_status *status) \ 7644 { \ 7645 flag aSign, bSign; \ 7646 uint ## s ## _t av, bv; \ 7647 a = float ## s ## _squash_input_denormal(a, status); \ 7648 b = float ## s ## _squash_input_denormal(b, status); \ 7649 \ 7650 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \ 7651 extractFloat ## s ## Frac( a ) ) || \ 7652 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \ 7653 extractFloat ## s ## Frac( b ) )) { \ 7654 if (!is_quiet || \ 7655 float ## s ## _is_signaling_nan(a, status) || \ 7656 float ## s ## _is_signaling_nan(b, status)) { \ 7657 float_raise(float_flag_invalid, status); \ 7658 } \ 7659 return float_relation_unordered; \ 7660 } \ 7661 aSign = extractFloat ## s ## Sign( a ); \ 7662 bSign = extractFloat ## s ## Sign( b ); \ 7663 av = float ## s ## _val(a); \ 7664 bv = float ## s ## _val(b); \ 7665 if ( aSign != bSign ) { \ 7666 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \ 7667 /* zero case */ \ 7668 return float_relation_equal; \ 7669 } else { \ 7670 return 1 - (2 * aSign); \ 7671 } \ 7672 } else { \ 7673 if (av == bv) { \ 7674 return float_relation_equal; \ 7675 } else { \ 7676 return 1 - 2 * (aSign ^ ( av < bv )); \ 7677 } \ 7678 } \ 7679 } \ 7680 \ 7681 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \ 7682 { \ 7683 return float ## s ## _compare_internal(a, b, 0, status); \ 7684 } \ 7685 \ 7686 int float ## s ## _compare_quiet(float ## s a, float ## s b, \ 7687 float_status *status) \ 7688 { \ 7689 return float ## s ## _compare_internal(a, b, 1, status); \ 7690 } 7691 7692 COMPARE(32, 0xff) 7693 COMPARE(64, 0x7ff) 7694 7695 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 7696 int is_quiet, float_status *status) 7697 { 7698 flag aSign, bSign; 7699 7700 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 7701 float_raise(float_flag_invalid, status); 7702 return float_relation_unordered; 7703 } 7704 if (( ( 
extractFloatx80Exp( a ) == 0x7fff ) && 7705 ( extractFloatx80Frac( a )<<1 ) ) || 7706 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 7707 ( extractFloatx80Frac( b )<<1 ) )) { 7708 if (!is_quiet || 7709 floatx80_is_signaling_nan(a, status) || 7710 floatx80_is_signaling_nan(b, status)) { 7711 float_raise(float_flag_invalid, status); 7712 } 7713 return float_relation_unordered; 7714 } 7715 aSign = extractFloatx80Sign( a ); 7716 bSign = extractFloatx80Sign( b ); 7717 if ( aSign != bSign ) { 7718 7719 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 7720 ( ( a.low | b.low ) == 0 ) ) { 7721 /* zero case */ 7722 return float_relation_equal; 7723 } else { 7724 return 1 - (2 * aSign); 7725 } 7726 } else { 7727 if (a.low == b.low && a.high == b.high) { 7728 return float_relation_equal; 7729 } else { 7730 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7731 } 7732 } 7733 } 7734 7735 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7736 { 7737 return floatx80_compare_internal(a, b, 0, status); 7738 } 7739 7740 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 7741 { 7742 return floatx80_compare_internal(a, b, 1, status); 7743 } 7744 7745 static inline int float128_compare_internal(float128 a, float128 b, 7746 int is_quiet, float_status *status) 7747 { 7748 flag aSign, bSign; 7749 7750 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7751 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7752 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7753 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7754 if (!is_quiet || 7755 float128_is_signaling_nan(a, status) || 7756 float128_is_signaling_nan(b, status)) { 7757 float_raise(float_flag_invalid, status); 7758 } 7759 return float_relation_unordered; 7760 } 7761 aSign = extractFloat128Sign( a ); 7762 bSign = extractFloat128Sign( b ); 7763 if ( aSign != bSign ) { 7764 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7765 /* zero case */ 
7766 return float_relation_equal; 7767 } else { 7768 return 1 - (2 * aSign); 7769 } 7770 } else { 7771 if (a.low == b.low && a.high == b.high) { 7772 return float_relation_equal; 7773 } else { 7774 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7775 } 7776 } 7777 } 7778 7779 int float128_compare(float128 a, float128 b, float_status *status) 7780 { 7781 return float128_compare_internal(a, b, 0, status); 7782 } 7783 7784 int float128_compare_quiet(float128 a, float128 b, float_status *status) 7785 { 7786 return float128_compare_internal(a, b, 1, status); 7787 } 7788 7789 /* min() and max() functions. These can't be implemented as 7790 * 'compare and pick one input' because that would mishandle 7791 * NaNs and +0 vs -0. 7792 * 7793 * minnum() and maxnum() functions. These are similar to the min() 7794 * and max() functions but if one of the arguments is a QNaN and 7795 * the other is numerical then the numerical argument is returned. 7796 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 7797 * and maxNum() operations. min() and max() are the typical min/max 7798 * semantics provided by many CPUs which predate that specification. 7799 * 7800 * minnummag() and maxnummag() functions correspond to minNumMag() 7801 * and minNumMag() from the IEEE-754 2008. 
7802 */ 7803 #define MINMAX(s) \ 7804 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \ 7805 int ismin, int isieee, \ 7806 int ismag, \ 7807 float_status *status) \ 7808 { \ 7809 flag aSign, bSign; \ 7810 uint ## s ## _t av, bv, aav, abv; \ 7811 a = float ## s ## _squash_input_denormal(a, status); \ 7812 b = float ## s ## _squash_input_denormal(b, status); \ 7813 if (float ## s ## _is_any_nan(a) || \ 7814 float ## s ## _is_any_nan(b)) { \ 7815 if (isieee) { \ 7816 if (float ## s ## _is_quiet_nan(a, status) && \ 7817 !float ## s ##_is_any_nan(b)) { \ 7818 return b; \ 7819 } else if (float ## s ## _is_quiet_nan(b, status) && \ 7820 !float ## s ## _is_any_nan(a)) { \ 7821 return a; \ 7822 } \ 7823 } \ 7824 return propagateFloat ## s ## NaN(a, b, status); \ 7825 } \ 7826 aSign = extractFloat ## s ## Sign(a); \ 7827 bSign = extractFloat ## s ## Sign(b); \ 7828 av = float ## s ## _val(a); \ 7829 bv = float ## s ## _val(b); \ 7830 if (ismag) { \ 7831 aav = float ## s ## _abs(av); \ 7832 abv = float ## s ## _abs(bv); \ 7833 if (aav != abv) { \ 7834 if (ismin) { \ 7835 return (aav < abv) ? a : b; \ 7836 } else { \ 7837 return (aav < abv) ? b : a; \ 7838 } \ 7839 } \ 7840 } \ 7841 if (aSign != bSign) { \ 7842 if (ismin) { \ 7843 return aSign ? a : b; \ 7844 } else { \ 7845 return aSign ? b : a; \ 7846 } \ 7847 } else { \ 7848 if (ismin) { \ 7849 return (aSign ^ (av < bv)) ? a : b; \ 7850 } else { \ 7851 return (aSign ^ (av < bv)) ? 
b : a; \ 7852 } \ 7853 } \ 7854 } \ 7855 \ 7856 float ## s float ## s ## _min(float ## s a, float ## s b, \ 7857 float_status *status) \ 7858 { \ 7859 return float ## s ## _minmax(a, b, 1, 0, 0, status); \ 7860 } \ 7861 \ 7862 float ## s float ## s ## _max(float ## s a, float ## s b, \ 7863 float_status *status) \ 7864 { \ 7865 return float ## s ## _minmax(a, b, 0, 0, 0, status); \ 7866 } \ 7867 \ 7868 float ## s float ## s ## _minnum(float ## s a, float ## s b, \ 7869 float_status *status) \ 7870 { \ 7871 return float ## s ## _minmax(a, b, 1, 1, 0, status); \ 7872 } \ 7873 \ 7874 float ## s float ## s ## _maxnum(float ## s a, float ## s b, \ 7875 float_status *status) \ 7876 { \ 7877 return float ## s ## _minmax(a, b, 0, 1, 0, status); \ 7878 } \ 7879 \ 7880 float ## s float ## s ## _minnummag(float ## s a, float ## s b, \ 7881 float_status *status) \ 7882 { \ 7883 return float ## s ## _minmax(a, b, 1, 1, 1, status); \ 7884 } \ 7885 \ 7886 float ## s float ## s ## _maxnummag(float ## s a, float ## s b, \ 7887 float_status *status) \ 7888 { \ 7889 return float ## s ## _minmax(a, b, 0, 1, 1, status); \ 7890 } 7891 7892 MINMAX(32) 7893 MINMAX(64) 7894 7895 7896 /* Multiply A by 2 raised to the power N. 
*/ 7897 float32 float32_scalbn(float32 a, int n, float_status *status) 7898 { 7899 flag aSign; 7900 int16_t aExp; 7901 uint32_t aSig; 7902 7903 a = float32_squash_input_denormal(a, status); 7904 aSig = extractFloat32Frac( a ); 7905 aExp = extractFloat32Exp( a ); 7906 aSign = extractFloat32Sign( a ); 7907 7908 if ( aExp == 0xFF ) { 7909 if ( aSig ) { 7910 return propagateFloat32NaN(a, a, status); 7911 } 7912 return a; 7913 } 7914 if (aExp != 0) { 7915 aSig |= 0x00800000; 7916 } else if (aSig == 0) { 7917 return a; 7918 } else { 7919 aExp++; 7920 } 7921 7922 if (n > 0x200) { 7923 n = 0x200; 7924 } else if (n < -0x200) { 7925 n = -0x200; 7926 } 7927 7928 aExp += n - 1; 7929 aSig <<= 7; 7930 return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status); 7931 } 7932 7933 float64 float64_scalbn(float64 a, int n, float_status *status) 7934 { 7935 flag aSign; 7936 int16_t aExp; 7937 uint64_t aSig; 7938 7939 a = float64_squash_input_denormal(a, status); 7940 aSig = extractFloat64Frac( a ); 7941 aExp = extractFloat64Exp( a ); 7942 aSign = extractFloat64Sign( a ); 7943 7944 if ( aExp == 0x7FF ) { 7945 if ( aSig ) { 7946 return propagateFloat64NaN(a, a, status); 7947 } 7948 return a; 7949 } 7950 if (aExp != 0) { 7951 aSig |= LIT64( 0x0010000000000000 ); 7952 } else if (aSig == 0) { 7953 return a; 7954 } else { 7955 aExp++; 7956 } 7957 7958 if (n > 0x1000) { 7959 n = 0x1000; 7960 } else if (n < -0x1000) { 7961 n = -0x1000; 7962 } 7963 7964 aExp += n - 1; 7965 aSig <<= 10; 7966 return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status); 7967 } 7968 7969 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7970 { 7971 flag aSign; 7972 int32_t aExp; 7973 uint64_t aSig; 7974 7975 if (floatx80_invalid_encoding(a)) { 7976 float_raise(float_flag_invalid, status); 7977 return floatx80_default_nan(status); 7978 } 7979 aSig = extractFloatx80Frac( a ); 7980 aExp = extractFloatx80Exp( a ); 7981 aSign = extractFloatx80Sign( a ); 7982 7983 if ( aExp == 0x7FFF ) { 7984 if ( 
aSig<<1 ) { 7985 return propagateFloatx80NaN(a, a, status); 7986 } 7987 return a; 7988 } 7989 7990 if (aExp == 0) { 7991 if (aSig == 0) { 7992 return a; 7993 } 7994 aExp++; 7995 } 7996 7997 if (n > 0x10000) { 7998 n = 0x10000; 7999 } else if (n < -0x10000) { 8000 n = -0x10000; 8001 } 8002 8003 aExp += n; 8004 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 8005 aSign, aExp, aSig, 0, status); 8006 } 8007 8008 float128 float128_scalbn(float128 a, int n, float_status *status) 8009 { 8010 flag aSign; 8011 int32_t aExp; 8012 uint64_t aSig0, aSig1; 8013 8014 aSig1 = extractFloat128Frac1( a ); 8015 aSig0 = extractFloat128Frac0( a ); 8016 aExp = extractFloat128Exp( a ); 8017 aSign = extractFloat128Sign( a ); 8018 if ( aExp == 0x7FFF ) { 8019 if ( aSig0 | aSig1 ) { 8020 return propagateFloat128NaN(a, a, status); 8021 } 8022 return a; 8023 } 8024 if (aExp != 0) { 8025 aSig0 |= LIT64( 0x0001000000000000 ); 8026 } else if (aSig0 == 0 && aSig1 == 0) { 8027 return a; 8028 } else { 8029 aExp++; 8030 } 8031 8032 if (n > 0x10000) { 8033 n = 0x10000; 8034 } else if (n < -0x10000) { 8035 n = -0x10000; 8036 } 8037 8038 aExp += n - 1; 8039 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 8040 , status); 8041 8042 } 8043