1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include "qemu/bitops.h" 87 #include "fpu/softfloat.h" 88 89 /* We only need stdlib for abort() */ 90 91 /*---------------------------------------------------------------------------- 92 | Primitive arithmetic functions, including multi-word arithmetic, and 93 | division and square root approximations. (Can be specialized to target if 94 | desired.) 95 *----------------------------------------------------------------------------*/ 96 #include "softfloat-macros.h" 97 98 /*---------------------------------------------------------------------------- 99 | Functions and definitions to determine: (1) whether tininess for underflow 100 | is detected before or after rounding by default, (2) what (if anything) 101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 103 | are propagated from function inputs to output. These details are target- 104 | specific. 
105 *----------------------------------------------------------------------------*/ 106 #include "softfloat-specialize.h" 107 108 /*---------------------------------------------------------------------------- 109 | Returns the fraction bits of the half-precision floating-point value `a'. 110 *----------------------------------------------------------------------------*/ 111 112 static inline uint32_t extractFloat16Frac(float16 a) 113 { 114 return float16_val(a) & 0x3ff; 115 } 116 117 /*---------------------------------------------------------------------------- 118 | Returns the exponent bits of the half-precision floating-point value `a'. 119 *----------------------------------------------------------------------------*/ 120 121 static inline int extractFloat16Exp(float16 a) 122 { 123 return (float16_val(a) >> 10) & 0x1f; 124 } 125 126 /*---------------------------------------------------------------------------- 127 | Returns the sign bit of the single-precision floating-point value `a'. 128 *----------------------------------------------------------------------------*/ 129 130 static inline flag extractFloat16Sign(float16 a) 131 { 132 return float16_val(a)>>15; 133 } 134 135 /*---------------------------------------------------------------------------- 136 | Returns the fraction bits of the single-precision floating-point value `a'. 137 *----------------------------------------------------------------------------*/ 138 139 static inline uint32_t extractFloat32Frac(float32 a) 140 { 141 return float32_val(a) & 0x007FFFFF; 142 } 143 144 /*---------------------------------------------------------------------------- 145 | Returns the exponent bits of the single-precision floating-point value `a'. 
146 *----------------------------------------------------------------------------*/ 147 148 static inline int extractFloat32Exp(float32 a) 149 { 150 return (float32_val(a) >> 23) & 0xFF; 151 } 152 153 /*---------------------------------------------------------------------------- 154 | Returns the sign bit of the single-precision floating-point value `a'. 155 *----------------------------------------------------------------------------*/ 156 157 static inline flag extractFloat32Sign(float32 a) 158 { 159 return float32_val(a) >> 31; 160 } 161 162 /*---------------------------------------------------------------------------- 163 | Returns the fraction bits of the double-precision floating-point value `a'. 164 *----------------------------------------------------------------------------*/ 165 166 static inline uint64_t extractFloat64Frac(float64 a) 167 { 168 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF); 169 } 170 171 /*---------------------------------------------------------------------------- 172 | Returns the exponent bits of the double-precision floating-point value `a'. 173 *----------------------------------------------------------------------------*/ 174 175 static inline int extractFloat64Exp(float64 a) 176 { 177 return (float64_val(a) >> 52) & 0x7FF; 178 } 179 180 /*---------------------------------------------------------------------------- 181 | Returns the sign bit of the double-precision floating-point value `a'. 182 *----------------------------------------------------------------------------*/ 183 184 static inline flag extractFloat64Sign(float64 a) 185 { 186 return float64_val(a) >> 63; 187 } 188 189 /* 190 * Classify a floating point number. Everything above float_class_qnan 191 * is a NaN so cls >= float_class_qnan is any NaN. 
192 */ 193 194 typedef enum __attribute__ ((__packed__)) { 195 float_class_unclassified, 196 float_class_zero, 197 float_class_normal, 198 float_class_inf, 199 float_class_qnan, /* all NaNs from here */ 200 float_class_snan, 201 float_class_dnan, 202 float_class_msnan, /* maybe silenced */ 203 } FloatClass; 204 205 /* 206 * Structure holding all of the decomposed parts of a float. The 207 * exponent is unbiased and the fraction is normalized. All 208 * calculations are done with a 64 bit fraction and then rounded as 209 * appropriate for the final format. 210 * 211 * Thanks to the packed FloatClass a decent compiler should be able to 212 * fit the whole structure into registers and avoid using the stack 213 * for parameter passing. 214 */ 215 216 typedef struct { 217 uint64_t frac; 218 int32_t exp; 219 FloatClass cls; 220 bool sign; 221 } FloatParts; 222 223 #define DECOMPOSED_BINARY_POINT (64 - 2) 224 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) 225 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1) 226 227 /* Structure holding all of the relevant parameters for a format. 
228 * exp_size: the size of the exponent field 229 * exp_bias: the offset applied to the exponent field 230 * exp_max: the maximum normalised exponent 231 * frac_size: the size of the fraction field 232 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 233 * The following are computed based the size of fraction 234 * frac_lsb: least significant bit of fraction 235 * fram_lsbm1: the bit bellow the least significant bit (for rounding) 236 * round_mask/roundeven_mask: masks used for rounding 237 */ 238 typedef struct { 239 int exp_size; 240 int exp_bias; 241 int exp_max; 242 int frac_size; 243 int frac_shift; 244 uint64_t frac_lsb; 245 uint64_t frac_lsbm1; 246 uint64_t round_mask; 247 uint64_t roundeven_mask; 248 } FloatFmt; 249 250 /* Expand fields based on the size of exponent and fraction */ 251 #define FLOAT_PARAMS(E, F) \ 252 .exp_size = E, \ 253 .exp_bias = ((1 << E) - 1) >> 1, \ 254 .exp_max = (1 << E) - 1, \ 255 .frac_size = F, \ 256 .frac_shift = DECOMPOSED_BINARY_POINT - F, \ 257 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \ 258 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \ 259 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \ 260 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1 261 262 static const FloatFmt float16_params = { 263 FLOAT_PARAMS(5, 10) 264 }; 265 266 static const FloatFmt float32_params = { 267 FLOAT_PARAMS(8, 23) 268 }; 269 270 static const FloatFmt float64_params = { 271 FLOAT_PARAMS(11, 52) 272 }; 273 274 /* Unpack a float to parts, but do not canonicalize. 
*/ 275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) 276 { 277 const int sign_pos = fmt.frac_size + fmt.exp_size; 278 279 return (FloatParts) { 280 .cls = float_class_unclassified, 281 .sign = extract64(raw, sign_pos, 1), 282 .exp = extract64(raw, fmt.frac_size, fmt.exp_size), 283 .frac = extract64(raw, 0, fmt.frac_size), 284 }; 285 } 286 287 static inline FloatParts float16_unpack_raw(float16 f) 288 { 289 return unpack_raw(float16_params, f); 290 } 291 292 static inline FloatParts float32_unpack_raw(float32 f) 293 { 294 return unpack_raw(float32_params, f); 295 } 296 297 static inline FloatParts float64_unpack_raw(float64 f) 298 { 299 return unpack_raw(float64_params, f); 300 } 301 302 /* Pack a float from parts, but do not canonicalize. */ 303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p) 304 { 305 const int sign_pos = fmt.frac_size + fmt.exp_size; 306 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp); 307 return deposit64(ret, sign_pos, 1, p.sign); 308 } 309 310 static inline float16 float16_pack_raw(FloatParts p) 311 { 312 return make_float16(pack_raw(float16_params, p)); 313 } 314 315 static inline float32 float32_pack_raw(FloatParts p) 316 { 317 return make_float32(pack_raw(float32_params, p)); 318 } 319 320 static inline float64 float64_pack_raw(FloatParts p) 321 { 322 return make_float64(pack_raw(float64_params, p)); 323 } 324 325 /* Canonicalize EXP and FRAC, setting CLS. 
*/ 326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm, 327 float_status *status) 328 { 329 if (part.exp == parm->exp_max) { 330 if (part.frac == 0) { 331 part.cls = float_class_inf; 332 } else { 333 #ifdef NO_SIGNALING_NANS 334 part.cls = float_class_qnan; 335 #else 336 int64_t msb = part.frac << (parm->frac_shift + 2); 337 if ((msb < 0) == status->snan_bit_is_one) { 338 part.cls = float_class_snan; 339 } else { 340 part.cls = float_class_qnan; 341 } 342 #endif 343 } 344 } else if (part.exp == 0) { 345 if (likely(part.frac == 0)) { 346 part.cls = float_class_zero; 347 } else if (status->flush_inputs_to_zero) { 348 float_raise(float_flag_input_denormal, status); 349 part.cls = float_class_zero; 350 part.frac = 0; 351 } else { 352 int shift = clz64(part.frac) - 1; 353 part.cls = float_class_normal; 354 part.exp = parm->frac_shift - parm->exp_bias - shift + 1; 355 part.frac <<= shift; 356 } 357 } else { 358 part.cls = float_class_normal; 359 part.exp -= parm->exp_bias; 360 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift); 361 } 362 return part; 363 } 364 365 /* Round and uncanonicalize a floating-point number by parts. There 366 * are FRAC_SHIFT bits that may require rounding at the bottom of the 367 * fraction; these bits will be removed. The exponent will be biased 368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0]. 
369 */ 370 371 static FloatParts round_canonical(FloatParts p, float_status *s, 372 const FloatFmt *parm) 373 { 374 const uint64_t frac_lsbm1 = parm->frac_lsbm1; 375 const uint64_t round_mask = parm->round_mask; 376 const uint64_t roundeven_mask = parm->roundeven_mask; 377 const int exp_max = parm->exp_max; 378 const int frac_shift = parm->frac_shift; 379 uint64_t frac, inc; 380 int exp, flags = 0; 381 bool overflow_norm; 382 383 frac = p.frac; 384 exp = p.exp; 385 386 switch (p.cls) { 387 case float_class_normal: 388 switch (s->float_rounding_mode) { 389 case float_round_nearest_even: 390 overflow_norm = false; 391 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 392 break; 393 case float_round_ties_away: 394 overflow_norm = false; 395 inc = frac_lsbm1; 396 break; 397 case float_round_to_zero: 398 overflow_norm = true; 399 inc = 0; 400 break; 401 case float_round_up: 402 inc = p.sign ? 0 : round_mask; 403 overflow_norm = p.sign; 404 break; 405 case float_round_down: 406 inc = p.sign ? round_mask : 0; 407 overflow_norm = !p.sign; 408 break; 409 default: 410 g_assert_not_reached(); 411 } 412 413 exp += parm->exp_bias; 414 if (likely(exp > 0)) { 415 if (frac & round_mask) { 416 flags |= float_flag_inexact; 417 frac += inc; 418 if (frac & DECOMPOSED_OVERFLOW_BIT) { 419 frac >>= 1; 420 exp++; 421 } 422 } 423 frac >>= frac_shift; 424 425 if (unlikely(exp >= exp_max)) { 426 flags |= float_flag_overflow | float_flag_inexact; 427 if (overflow_norm) { 428 exp = exp_max - 1; 429 frac = -1; 430 } else { 431 p.cls = float_class_inf; 432 goto do_inf; 433 } 434 } 435 } else if (s->flush_to_zero) { 436 flags |= float_flag_output_denormal; 437 p.cls = float_class_zero; 438 goto do_zero; 439 } else { 440 bool is_tiny = (s->float_detect_tininess 441 == float_tininess_before_rounding) 442 || (exp < 0) 443 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT); 444 445 shift64RightJamming(frac, 1 - exp, &frac); 446 if (frac & round_mask) { 447 /* Need to recompute round-to-even. 
*/ 448 if (s->float_rounding_mode == float_round_nearest_even) { 449 inc = ((frac & roundeven_mask) != frac_lsbm1 450 ? frac_lsbm1 : 0); 451 } 452 flags |= float_flag_inexact; 453 frac += inc; 454 } 455 456 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0); 457 frac >>= frac_shift; 458 459 if (is_tiny && (flags & float_flag_inexact)) { 460 flags |= float_flag_underflow; 461 } 462 if (exp == 0 && frac == 0) { 463 p.cls = float_class_zero; 464 } 465 } 466 break; 467 468 case float_class_zero: 469 do_zero: 470 exp = 0; 471 frac = 0; 472 break; 473 474 case float_class_inf: 475 do_inf: 476 exp = exp_max; 477 frac = 0; 478 break; 479 480 case float_class_qnan: 481 case float_class_snan: 482 exp = exp_max; 483 break; 484 485 default: 486 g_assert_not_reached(); 487 } 488 489 float_raise(flags, s); 490 p.exp = exp; 491 p.frac = frac; 492 return p; 493 } 494 495 static FloatParts float16_unpack_canonical(float16 f, float_status *s) 496 { 497 return canonicalize(float16_unpack_raw(f), &float16_params, s); 498 } 499 500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s) 501 { 502 switch (p.cls) { 503 case float_class_dnan: 504 return float16_default_nan(s); 505 case float_class_msnan: 506 return float16_maybe_silence_nan(float16_pack_raw(p), s); 507 default: 508 p = round_canonical(p, s, &float16_params); 509 return float16_pack_raw(p); 510 } 511 } 512 513 static FloatParts float32_unpack_canonical(float32 f, float_status *s) 514 { 515 return canonicalize(float32_unpack_raw(f), &float32_params, s); 516 } 517 518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s) 519 { 520 switch (p.cls) { 521 case float_class_dnan: 522 return float32_default_nan(s); 523 case float_class_msnan: 524 return float32_maybe_silence_nan(float32_pack_raw(p), s); 525 default: 526 p = round_canonical(p, s, &float32_params); 527 return float32_pack_raw(p); 528 } 529 } 530 531 static FloatParts float64_unpack_canonical(float64 f, float_status *s) 532 { 533 
return canonicalize(float64_unpack_raw(f), &float64_params, s); 534 } 535 536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s) 537 { 538 switch (p.cls) { 539 case float_class_dnan: 540 return float64_default_nan(s); 541 case float_class_msnan: 542 return float64_maybe_silence_nan(float64_pack_raw(p), s); 543 default: 544 p = round_canonical(p, s, &float64_params); 545 return float64_pack_raw(p); 546 } 547 } 548 549 /* Simple helpers for checking if what NaN we have */ 550 static bool is_nan(FloatClass c) 551 { 552 return unlikely(c >= float_class_qnan); 553 } 554 static bool is_snan(FloatClass c) 555 { 556 return c == float_class_snan; 557 } 558 static bool is_qnan(FloatClass c) 559 { 560 return c == float_class_qnan; 561 } 562 563 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s) 564 { 565 if (is_snan(a.cls) || is_snan(b.cls)) { 566 s->float_exception_flags |= float_flag_invalid; 567 } 568 569 if (s->default_nan_mode) { 570 a.cls = float_class_dnan; 571 } else { 572 if (pickNaN(is_qnan(a.cls), is_snan(a.cls), 573 is_qnan(b.cls), is_snan(b.cls), 574 a.frac > b.frac || 575 (a.frac == b.frac && a.sign < b.sign))) { 576 a = b; 577 } 578 a.cls = float_class_msnan; 579 } 580 return a; 581 } 582 583 /* 584 * Returns the result of adding or subtracting the values of the 585 * floating-point values `a' and `b'. The operation is performed 586 * according to the IEC/IEEE Standard for Binary Floating-Point 587 * Arithmetic. 
588 */ 589 590 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract, 591 float_status *s) 592 { 593 bool a_sign = a.sign; 594 bool b_sign = b.sign ^ subtract; 595 596 if (a_sign != b_sign) { 597 /* Subtraction */ 598 599 if (a.cls == float_class_normal && b.cls == float_class_normal) { 600 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) { 601 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 602 a.frac = a.frac - b.frac; 603 } else { 604 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 605 a.frac = b.frac - a.frac; 606 a.exp = b.exp; 607 a_sign ^= 1; 608 } 609 610 if (a.frac == 0) { 611 a.cls = float_class_zero; 612 a.sign = s->float_rounding_mode == float_round_down; 613 } else { 614 int shift = clz64(a.frac) - 1; 615 a.frac = a.frac << shift; 616 a.exp = a.exp - shift; 617 a.sign = a_sign; 618 } 619 return a; 620 } 621 if (is_nan(a.cls) || is_nan(b.cls)) { 622 return pick_nan(a, b, s); 623 } 624 if (a.cls == float_class_inf) { 625 if (b.cls == float_class_inf) { 626 float_raise(float_flag_invalid, s); 627 a.cls = float_class_dnan; 628 } 629 return a; 630 } 631 if (a.cls == float_class_zero && b.cls == float_class_zero) { 632 a.sign = s->float_rounding_mode == float_round_down; 633 return a; 634 } 635 if (a.cls == float_class_zero || b.cls == float_class_inf) { 636 b.sign = a_sign ^ 1; 637 return b; 638 } 639 if (b.cls == float_class_zero) { 640 return a; 641 } 642 } else { 643 /* Addition */ 644 if (a.cls == float_class_normal && b.cls == float_class_normal) { 645 if (a.exp > b.exp) { 646 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 647 } else if (a.exp < b.exp) { 648 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 649 a.exp = b.exp; 650 } 651 a.frac += b.frac; 652 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 653 a.frac >>= 1; 654 a.exp += 1; 655 } 656 return a; 657 } 658 if (is_nan(a.cls) || is_nan(b.cls)) { 659 return pick_nan(a, b, s); 660 } 661 if (a.cls == float_class_inf || b.cls == float_class_zero) { 662 
return a; 663 } 664 if (b.cls == float_class_inf || a.cls == float_class_zero) { 665 b.sign = b_sign; 666 return b; 667 } 668 } 669 g_assert_not_reached(); 670 } 671 672 /* 673 * Returns the result of adding or subtracting the floating-point 674 * values `a' and `b'. The operation is performed according to the 675 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 676 */ 677 678 float16 __attribute__((flatten)) float16_add(float16 a, float16 b, 679 float_status *status) 680 { 681 FloatParts pa = float16_unpack_canonical(a, status); 682 FloatParts pb = float16_unpack_canonical(b, status); 683 FloatParts pr = addsub_floats(pa, pb, false, status); 684 685 return float16_round_pack_canonical(pr, status); 686 } 687 688 float32 __attribute__((flatten)) float32_add(float32 a, float32 b, 689 float_status *status) 690 { 691 FloatParts pa = float32_unpack_canonical(a, status); 692 FloatParts pb = float32_unpack_canonical(b, status); 693 FloatParts pr = addsub_floats(pa, pb, false, status); 694 695 return float32_round_pack_canonical(pr, status); 696 } 697 698 float64 __attribute__((flatten)) float64_add(float64 a, float64 b, 699 float_status *status) 700 { 701 FloatParts pa = float64_unpack_canonical(a, status); 702 FloatParts pb = float64_unpack_canonical(b, status); 703 FloatParts pr = addsub_floats(pa, pb, false, status); 704 705 return float64_round_pack_canonical(pr, status); 706 } 707 708 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b, 709 float_status *status) 710 { 711 FloatParts pa = float16_unpack_canonical(a, status); 712 FloatParts pb = float16_unpack_canonical(b, status); 713 FloatParts pr = addsub_floats(pa, pb, true, status); 714 715 return float16_round_pack_canonical(pr, status); 716 } 717 718 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b, 719 float_status *status) 720 { 721 FloatParts pa = float32_unpack_canonical(a, status); 722 FloatParts pb = float32_unpack_canonical(b, status); 723 FloatParts pr = 
addsub_floats(pa, pb, true, status); 724 725 return float32_round_pack_canonical(pr, status); 726 } 727 728 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b, 729 float_status *status) 730 { 731 FloatParts pa = float64_unpack_canonical(a, status); 732 FloatParts pb = float64_unpack_canonical(b, status); 733 FloatParts pr = addsub_floats(pa, pb, true, status); 734 735 return float64_round_pack_canonical(pr, status); 736 } 737 738 /* 739 * Returns the result of multiplying the floating-point values `a' and 740 * `b'. The operation is performed according to the IEC/IEEE Standard 741 * for Binary Floating-Point Arithmetic. 742 */ 743 744 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s) 745 { 746 bool sign = a.sign ^ b.sign; 747 748 if (a.cls == float_class_normal && b.cls == float_class_normal) { 749 uint64_t hi, lo; 750 int exp = a.exp + b.exp; 751 752 mul64To128(a.frac, b.frac, &hi, &lo); 753 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 754 if (lo & DECOMPOSED_OVERFLOW_BIT) { 755 shift64RightJamming(lo, 1, &lo); 756 exp += 1; 757 } 758 759 /* Re-use a */ 760 a.exp = exp; 761 a.sign = sign; 762 a.frac = lo; 763 return a; 764 } 765 /* handle all the NaN cases */ 766 if (is_nan(a.cls) || is_nan(b.cls)) { 767 return pick_nan(a, b, s); 768 } 769 /* Inf * Zero == NaN */ 770 if ((a.cls == float_class_inf && b.cls == float_class_zero) || 771 (a.cls == float_class_zero && b.cls == float_class_inf)) { 772 s->float_exception_flags |= float_flag_invalid; 773 a.cls = float_class_dnan; 774 a.sign = sign; 775 return a; 776 } 777 /* Multiply by 0 or Inf */ 778 if (a.cls == float_class_inf || a.cls == float_class_zero) { 779 a.sign = sign; 780 return a; 781 } 782 if (b.cls == float_class_inf || b.cls == float_class_zero) { 783 b.sign = sign; 784 return b; 785 } 786 g_assert_not_reached(); 787 } 788 789 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b, 790 float_status *status) 791 { 792 FloatParts pa = 
float16_unpack_canonical(a, status); 793 FloatParts pb = float16_unpack_canonical(b, status); 794 FloatParts pr = mul_floats(pa, pb, status); 795 796 return float16_round_pack_canonical(pr, status); 797 } 798 799 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b, 800 float_status *status) 801 { 802 FloatParts pa = float32_unpack_canonical(a, status); 803 FloatParts pb = float32_unpack_canonical(b, status); 804 FloatParts pr = mul_floats(pa, pb, status); 805 806 return float32_round_pack_canonical(pr, status); 807 } 808 809 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b, 810 float_status *status) 811 { 812 FloatParts pa = float64_unpack_canonical(a, status); 813 FloatParts pb = float64_unpack_canonical(b, status); 814 FloatParts pr = mul_floats(pa, pb, status); 815 816 return float64_round_pack_canonical(pr, status); 817 } 818 819 /* 820 * Returns the result of dividing the floating-point value `a' by the 821 * corresponding value `b'. The operation is performed according to 822 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 823 */ 824 825 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s) 826 { 827 bool sign = a.sign ^ b.sign; 828 829 if (a.cls == float_class_normal && b.cls == float_class_normal) { 830 uint64_t temp_lo, temp_hi; 831 int exp = a.exp - b.exp; 832 if (a.frac < b.frac) { 833 exp -= 1; 834 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, 835 &temp_hi, &temp_lo); 836 } else { 837 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, 838 &temp_hi, &temp_lo); 839 } 840 /* LSB of quot is set if inexact which roundandpack will use 841 * to set flags. 
Yet again we re-use a for the result */ 842 a.frac = div128To64(temp_lo, temp_hi, b.frac); 843 a.sign = sign; 844 a.exp = exp; 845 return a; 846 } 847 /* handle all the NaN cases */ 848 if (is_nan(a.cls) || is_nan(b.cls)) { 849 return pick_nan(a, b, s); 850 } 851 /* 0/0 or Inf/Inf */ 852 if (a.cls == b.cls 853 && 854 (a.cls == float_class_inf || a.cls == float_class_zero)) { 855 s->float_exception_flags |= float_flag_invalid; 856 a.cls = float_class_dnan; 857 return a; 858 } 859 /* Div 0 => Inf */ 860 if (b.cls == float_class_zero) { 861 s->float_exception_flags |= float_flag_divbyzero; 862 a.cls = float_class_inf; 863 a.sign = sign; 864 return a; 865 } 866 /* Inf / x or 0 / x */ 867 if (a.cls == float_class_inf || a.cls == float_class_zero) { 868 a.sign = sign; 869 return a; 870 } 871 /* Div by Inf */ 872 if (b.cls == float_class_inf) { 873 a.cls = float_class_zero; 874 a.sign = sign; 875 return a; 876 } 877 g_assert_not_reached(); 878 } 879 880 float16 float16_div(float16 a, float16 b, float_status *status) 881 { 882 FloatParts pa = float16_unpack_canonical(a, status); 883 FloatParts pb = float16_unpack_canonical(b, status); 884 FloatParts pr = div_floats(pa, pb, status); 885 886 return float16_round_pack_canonical(pr, status); 887 } 888 889 float32 float32_div(float32 a, float32 b, float_status *status) 890 { 891 FloatParts pa = float32_unpack_canonical(a, status); 892 FloatParts pb = float32_unpack_canonical(b, status); 893 FloatParts pr = div_floats(pa, pb, status); 894 895 return float32_round_pack_canonical(pr, status); 896 } 897 898 float64 float64_div(float64 a, float64 b, float_status *status) 899 { 900 FloatParts pa = float64_unpack_canonical(a, status); 901 FloatParts pb = float64_unpack_canonical(b, status); 902 FloatParts pr = div_floats(pa, pb, status); 903 904 return float64_round_pack_canonical(pr, status); 905 } 906 907 /*---------------------------------------------------------------------------- 908 | Takes a 64-bit fixed-point value `absZ' with 
binary point between bits 6 909 | and 7, and returns the properly rounded 32-bit integer corresponding to the 910 | input. If `zSign' is 1, the input is negated before being converted to an 911 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 912 | is simply rounded to an integer, with the inexact exception raised if the 913 | input cannot be represented exactly as an integer. However, if the fixed- 914 | point input is too large, the invalid exception is raised and the largest 915 | positive or negative integer is returned. 916 *----------------------------------------------------------------------------*/ 917 918 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status) 919 { 920 int8_t roundingMode; 921 flag roundNearestEven; 922 int8_t roundIncrement, roundBits; 923 int32_t z; 924 925 roundingMode = status->float_rounding_mode; 926 roundNearestEven = ( roundingMode == float_round_nearest_even ); 927 switch (roundingMode) { 928 case float_round_nearest_even: 929 case float_round_ties_away: 930 roundIncrement = 0x40; 931 break; 932 case float_round_to_zero: 933 roundIncrement = 0; 934 break; 935 case float_round_up: 936 roundIncrement = zSign ? 0 : 0x7f; 937 break; 938 case float_round_down: 939 roundIncrement = zSign ? 0x7f : 0; 940 break; 941 default: 942 abort(); 943 } 944 roundBits = absZ & 0x7F; 945 absZ = ( absZ + roundIncrement )>>7; 946 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 947 z = absZ; 948 if ( zSign ) z = - z; 949 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 950 float_raise(float_flag_invalid, status); 951 return zSign ? 
(int32_t) 0x80000000 : 0x7FFFFFFF; 952 } 953 if (roundBits) { 954 status->float_exception_flags |= float_flag_inexact; 955 } 956 return z; 957 958 } 959 960 /*---------------------------------------------------------------------------- 961 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 962 | `absZ1', with binary point between bits 63 and 64 (between the input words), 963 | and returns the properly rounded 64-bit integer corresponding to the input. 964 | If `zSign' is 1, the input is negated before being converted to an integer. 965 | Ordinarily, the fixed-point input is simply rounded to an integer, with 966 | the inexact exception raised if the input cannot be represented exactly as 967 | an integer. However, if the fixed-point input is too large, the invalid 968 | exception is raised and the largest positive or negative integer is 969 | returned. 970 *----------------------------------------------------------------------------*/ 971 972 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 973 float_status *status) 974 { 975 int8_t roundingMode; 976 flag roundNearestEven, increment; 977 int64_t z; 978 979 roundingMode = status->float_rounding_mode; 980 roundNearestEven = ( roundingMode == float_round_nearest_even ); 981 switch (roundingMode) { 982 case float_round_nearest_even: 983 case float_round_ties_away: 984 increment = ((int64_t) absZ1 < 0); 985 break; 986 case float_round_to_zero: 987 increment = 0; 988 break; 989 case float_round_up: 990 increment = !zSign && absZ1; 991 break; 992 case float_round_down: 993 increment = zSign && absZ1; 994 break; 995 default: 996 abort(); 997 } 998 if ( increment ) { 999 ++absZ0; 1000 if ( absZ0 == 0 ) goto overflow; 1001 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 1002 } 1003 z = absZ0; 1004 if ( zSign ) z = - z; 1005 if ( z && ( ( z < 0 ) ^ zSign ) ) { 1006 overflow: 1007 float_raise(float_flag_invalid, status); 1008 return 1009 zSign ? 
(int64_t) LIT64( 0x8000000000000000 ) 1010 : LIT64( 0x7FFFFFFFFFFFFFFF ); 1011 } 1012 if (absZ1) { 1013 status->float_exception_flags |= float_flag_inexact; 1014 } 1015 return z; 1016 1017 } 1018 1019 /*---------------------------------------------------------------------------- 1020 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 1021 | `absZ1', with binary point between bits 63 and 64 (between the input words), 1022 | and returns the properly rounded 64-bit unsigned integer corresponding to the 1023 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 1024 | with the inexact exception raised if the input cannot be represented exactly 1025 | as an integer. However, if the fixed-point input is too large, the invalid 1026 | exception is raised and the largest unsigned integer is returned. 1027 *----------------------------------------------------------------------------*/ 1028 1029 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0, 1030 uint64_t absZ1, float_status *status) 1031 { 1032 int8_t roundingMode; 1033 flag roundNearestEven, increment; 1034 1035 roundingMode = status->float_rounding_mode; 1036 roundNearestEven = (roundingMode == float_round_nearest_even); 1037 switch (roundingMode) { 1038 case float_round_nearest_even: 1039 case float_round_ties_away: 1040 increment = ((int64_t)absZ1 < 0); 1041 break; 1042 case float_round_to_zero: 1043 increment = 0; 1044 break; 1045 case float_round_up: 1046 increment = !zSign && absZ1; 1047 break; 1048 case float_round_down: 1049 increment = zSign && absZ1; 1050 break; 1051 default: 1052 abort(); 1053 } 1054 if (increment) { 1055 ++absZ0; 1056 if (absZ0 == 0) { 1057 float_raise(float_flag_invalid, status); 1058 return LIT64(0xFFFFFFFFFFFFFFFF); 1059 } 1060 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 1061 } 1062 1063 if (zSign && absZ0) { 1064 float_raise(float_flag_invalid, status); 1065 return 0; 1066 } 1067 1068 if (absZ1) { 1069 
status->float_exception_flags |= float_flag_inexact; 1070 } 1071 return absZ0; 1072 } 1073 1074 /*---------------------------------------------------------------------------- 1075 | If `a' is denormal and we are in flush-to-zero mode then set the 1076 | input-denormal exception and return zero. Otherwise just return the value. 1077 *----------------------------------------------------------------------------*/ 1078 float32 float32_squash_input_denormal(float32 a, float_status *status) 1079 { 1080 if (status->flush_inputs_to_zero) { 1081 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 1082 float_raise(float_flag_input_denormal, status); 1083 return make_float32(float32_val(a) & 0x80000000); 1084 } 1085 } 1086 return a; 1087 } 1088 1089 /*---------------------------------------------------------------------------- 1090 | Normalizes the subnormal single-precision floating-point value represented 1091 | by the denormalized significand `aSig'. The normalized exponent and 1092 | significand are stored at the locations pointed to by `zExpPtr' and 1093 | `zSigPtr', respectively. 1094 *----------------------------------------------------------------------------*/ 1095 1096 static void 1097 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 1098 { 1099 int8_t shiftCount; 1100 1101 shiftCount = countLeadingZeros32( aSig ) - 8; 1102 *zSigPtr = aSig<<shiftCount; 1103 *zExpPtr = 1 - shiftCount; 1104 1105 } 1106 1107 /*---------------------------------------------------------------------------- 1108 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 1109 | single-precision floating-point value, returning the result. After being 1110 | shifted into the proper positions, the three fields are simply added 1111 | together to form the result. This means that any integer portion of `zSig' 1112 | will be added into the exponent. 
Since a properly normalized significand 1113 | will have an integer portion equal to 1, the `zExp' input should be 1 less 1114 | than the desired result exponent whenever `zSig' is a complete, normalized 1115 | significand. 1116 *----------------------------------------------------------------------------*/ 1117 1118 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig) 1119 { 1120 1121 return make_float32( 1122 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig); 1123 1124 } 1125 1126 /*---------------------------------------------------------------------------- 1127 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1128 | and significand `zSig', and returns the proper single-precision floating- 1129 | point value corresponding to the abstract input. Ordinarily, the abstract 1130 | value is simply rounded and packed into the single-precision format, with 1131 | the inexact exception raised if the abstract input cannot be represented 1132 | exactly. However, if the abstract value is too large, the overflow and 1133 | inexact exceptions are raised and an infinity or maximal finite value is 1134 | returned. If the abstract value is too small, the input value is rounded to 1135 | a subnormal number, and the underflow and inexact exceptions are raised if 1136 | the abstract input cannot be represented exactly as a subnormal single- 1137 | precision floating-point number. 1138 | The input significand `zSig' has its binary point between bits 30 1139 | and 29, which is 7 bits to the left of the usual location. This shifted 1140 | significand must be normalized or smaller. If `zSig' is not normalized, 1141 | `zExp' must be 0; in that case, the result returned is a subnormal number, 1142 | and it must not require rounding. In the usual case that `zSig' is 1143 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 
1144 | The handling of underflow and overflow follows the IEC/IEEE Standard for 1145 | Binary Floating-Point Arithmetic. 1146 *----------------------------------------------------------------------------*/ 1147 1148 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 1149 float_status *status) 1150 { 1151 int8_t roundingMode; 1152 flag roundNearestEven; 1153 int8_t roundIncrement, roundBits; 1154 flag isTiny; 1155 1156 roundingMode = status->float_rounding_mode; 1157 roundNearestEven = ( roundingMode == float_round_nearest_even ); 1158 switch (roundingMode) { 1159 case float_round_nearest_even: 1160 case float_round_ties_away: 1161 roundIncrement = 0x40; 1162 break; 1163 case float_round_to_zero: 1164 roundIncrement = 0; 1165 break; 1166 case float_round_up: 1167 roundIncrement = zSign ? 0 : 0x7f; 1168 break; 1169 case float_round_down: 1170 roundIncrement = zSign ? 0x7f : 0; 1171 break; 1172 default: 1173 abort(); 1174 break; 1175 } 1176 roundBits = zSig & 0x7F; 1177 if ( 0xFD <= (uint16_t) zExp ) { 1178 if ( ( 0xFD < zExp ) 1179 || ( ( zExp == 0xFD ) 1180 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 1181 ) { 1182 float_raise(float_flag_overflow | float_flag_inexact, status); 1183 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 1184 } 1185 if ( zExp < 0 ) { 1186 if (status->flush_to_zero) { 1187 float_raise(float_flag_output_denormal, status); 1188 return packFloat32(zSign, 0, 0); 1189 } 1190 isTiny = 1191 (status->float_detect_tininess 1192 == float_tininess_before_rounding) 1193 || ( zExp < -1 ) 1194 || ( zSig + roundIncrement < 0x80000000 ); 1195 shift32RightJamming( zSig, - zExp, &zSig ); 1196 zExp = 0; 1197 roundBits = zSig & 0x7F; 1198 if (isTiny && roundBits) { 1199 float_raise(float_flag_underflow, status); 1200 } 1201 } 1202 } 1203 if (roundBits) { 1204 status->float_exception_flags |= float_flag_inexact; 1205 } 1206 zSig = ( zSig + roundIncrement )>>7; 1207 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven 
); 1208 if ( zSig == 0 ) zExp = 0; 1209 return packFloat32( zSign, zExp, zSig ); 1210 1211 } 1212 1213 /*---------------------------------------------------------------------------- 1214 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1215 | and significand `zSig', and returns the proper single-precision floating- 1216 | point value corresponding to the abstract input. This routine is just like 1217 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 1218 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 1219 | floating-point exponent. 1220 *----------------------------------------------------------------------------*/ 1221 1222 static float32 1223 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 1224 float_status *status) 1225 { 1226 int8_t shiftCount; 1227 1228 shiftCount = countLeadingZeros32( zSig ) - 1; 1229 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 1230 status); 1231 1232 } 1233 1234 /*---------------------------------------------------------------------------- 1235 | If `a' is denormal and we are in flush-to-zero mode then set the 1236 | input-denormal exception and return zero. Otherwise just return the value. 1237 *----------------------------------------------------------------------------*/ 1238 float64 float64_squash_input_denormal(float64 a, float_status *status) 1239 { 1240 if (status->flush_inputs_to_zero) { 1241 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 1242 float_raise(float_flag_input_denormal, status); 1243 return make_float64(float64_val(a) & (1ULL << 63)); 1244 } 1245 } 1246 return a; 1247 } 1248 1249 /*---------------------------------------------------------------------------- 1250 | Normalizes the subnormal double-precision floating-point value represented 1251 | by the denormalized significand `aSig'. 
The normalized exponent and 1252 | significand are stored at the locations pointed to by `zExpPtr' and 1253 | `zSigPtr', respectively. 1254 *----------------------------------------------------------------------------*/ 1255 1256 static void 1257 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 1258 { 1259 int8_t shiftCount; 1260 1261 shiftCount = countLeadingZeros64( aSig ) - 11; 1262 *zSigPtr = aSig<<shiftCount; 1263 *zExpPtr = 1 - shiftCount; 1264 1265 } 1266 1267 /*---------------------------------------------------------------------------- 1268 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 1269 | double-precision floating-point value, returning the result. After being 1270 | shifted into the proper positions, the three fields are simply added 1271 | together to form the result. This means that any integer portion of `zSig' 1272 | will be added into the exponent. Since a properly normalized significand 1273 | will have an integer portion equal to 1, the `zExp' input should be 1 less 1274 | than the desired result exponent whenever `zSig' is a complete, normalized 1275 | significand. 1276 *----------------------------------------------------------------------------*/ 1277 1278 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 1279 { 1280 1281 return make_float64( 1282 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 1283 1284 } 1285 1286 /*---------------------------------------------------------------------------- 1287 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1288 | and significand `zSig', and returns the proper double-precision floating- 1289 | point value corresponding to the abstract input. Ordinarily, the abstract 1290 | value is simply rounded and packed into the double-precision format, with 1291 | the inexact exception raised if the abstract input cannot be represented 1292 | exactly. 
However, if the abstract value is too large, the overflow and 1293 | inexact exceptions are raised and an infinity or maximal finite value is 1294 | returned. If the abstract value is too small, the input value is rounded to 1295 | a subnormal number, and the underflow and inexact exceptions are raised if 1296 | the abstract input cannot be represented exactly as a subnormal double- 1297 | precision floating-point number. 1298 | The input significand `zSig' has its binary point between bits 62 1299 | and 61, which is 10 bits to the left of the usual location. This shifted 1300 | significand must be normalized or smaller. If `zSig' is not normalized, 1301 | `zExp' must be 0; in that case, the result returned is a subnormal number, 1302 | and it must not require rounding. In the usual case that `zSig' is 1303 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 1304 | The handling of underflow and overflow follows the IEC/IEEE Standard for 1305 | Binary Floating-Point Arithmetic. 1306 *----------------------------------------------------------------------------*/ 1307 1308 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 1309 float_status *status) 1310 { 1311 int8_t roundingMode; 1312 flag roundNearestEven; 1313 int roundIncrement, roundBits; 1314 flag isTiny; 1315 1316 roundingMode = status->float_rounding_mode; 1317 roundNearestEven = ( roundingMode == float_round_nearest_even ); 1318 switch (roundingMode) { 1319 case float_round_nearest_even: 1320 case float_round_ties_away: 1321 roundIncrement = 0x200; 1322 break; 1323 case float_round_to_zero: 1324 roundIncrement = 0; 1325 break; 1326 case float_round_up: 1327 roundIncrement = zSign ? 0 : 0x3ff; 1328 break; 1329 case float_round_down: 1330 roundIncrement = zSign ? 0x3ff : 0; 1331 break; 1332 case float_round_to_odd: 1333 roundIncrement = (zSig & 0x400) ? 
0 : 0x3ff; 1334 break; 1335 default: 1336 abort(); 1337 } 1338 roundBits = zSig & 0x3FF; 1339 if ( 0x7FD <= (uint16_t) zExp ) { 1340 if ( ( 0x7FD < zExp ) 1341 || ( ( zExp == 0x7FD ) 1342 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 1343 ) { 1344 bool overflow_to_inf = roundingMode != float_round_to_odd && 1345 roundIncrement != 0; 1346 float_raise(float_flag_overflow | float_flag_inexact, status); 1347 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 1348 } 1349 if ( zExp < 0 ) { 1350 if (status->flush_to_zero) { 1351 float_raise(float_flag_output_denormal, status); 1352 return packFloat64(zSign, 0, 0); 1353 } 1354 isTiny = 1355 (status->float_detect_tininess 1356 == float_tininess_before_rounding) 1357 || ( zExp < -1 ) 1358 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 1359 shift64RightJamming( zSig, - zExp, &zSig ); 1360 zExp = 0; 1361 roundBits = zSig & 0x3FF; 1362 if (isTiny && roundBits) { 1363 float_raise(float_flag_underflow, status); 1364 } 1365 if (roundingMode == float_round_to_odd) { 1366 /* 1367 * For round-to-odd case, the roundIncrement depends on 1368 * zSig which just changed. 1369 */ 1370 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 1371 } 1372 } 1373 } 1374 if (roundBits) { 1375 status->float_exception_flags |= float_flag_inexact; 1376 } 1377 zSig = ( zSig + roundIncrement )>>10; 1378 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 1379 if ( zSig == 0 ) zExp = 0; 1380 return packFloat64( zSign, zExp, zSig ); 1381 1382 } 1383 1384 /*---------------------------------------------------------------------------- 1385 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1386 | and significand `zSig', and returns the proper double-precision floating- 1387 | point value corresponding to the abstract input. This routine is just like 1388 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 
1389 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 1390 | floating-point exponent. 1391 *----------------------------------------------------------------------------*/ 1392 1393 static float64 1394 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 1395 float_status *status) 1396 { 1397 int8_t shiftCount; 1398 1399 shiftCount = countLeadingZeros64( zSig ) - 1; 1400 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 1401 status); 1402 1403 } 1404 1405 /*---------------------------------------------------------------------------- 1406 | Returns the fraction bits of the extended double-precision floating-point 1407 | value `a'. 1408 *----------------------------------------------------------------------------*/ 1409 1410 static inline uint64_t extractFloatx80Frac( floatx80 a ) 1411 { 1412 1413 return a.low; 1414 1415 } 1416 1417 /*---------------------------------------------------------------------------- 1418 | Returns the exponent bits of the extended double-precision floating-point 1419 | value `a'. 1420 *----------------------------------------------------------------------------*/ 1421 1422 static inline int32_t extractFloatx80Exp( floatx80 a ) 1423 { 1424 1425 return a.high & 0x7FFF; 1426 1427 } 1428 1429 /*---------------------------------------------------------------------------- 1430 | Returns the sign bit of the extended double-precision floating-point value 1431 | `a'. 1432 *----------------------------------------------------------------------------*/ 1433 1434 static inline flag extractFloatx80Sign( floatx80 a ) 1435 { 1436 1437 return a.high>>15; 1438 1439 } 1440 1441 /*---------------------------------------------------------------------------- 1442 | Normalizes the subnormal extended double-precision floating-point value 1443 | represented by the denormalized significand `aSig'. 
The normalized exponent 1444 | and significand are stored at the locations pointed to by `zExpPtr' and 1445 | `zSigPtr', respectively. 1446 *----------------------------------------------------------------------------*/ 1447 1448 static void 1449 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr ) 1450 { 1451 int8_t shiftCount; 1452 1453 shiftCount = countLeadingZeros64( aSig ); 1454 *zSigPtr = aSig<<shiftCount; 1455 *zExpPtr = 1 - shiftCount; 1456 1457 } 1458 1459 /*---------------------------------------------------------------------------- 1460 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 1461 | extended double-precision floating-point value, returning the result. 1462 *----------------------------------------------------------------------------*/ 1463 1464 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig ) 1465 { 1466 floatx80 z; 1467 1468 z.low = zSig; 1469 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp; 1470 return z; 1471 1472 } 1473 1474 /*---------------------------------------------------------------------------- 1475 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1476 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 1477 | and returns the proper extended double-precision floating-point value 1478 | corresponding to the abstract input. Ordinarily, the abstract value is 1479 | rounded and packed into the extended double-precision format, with the 1480 | inexact exception raised if the abstract input cannot be represented 1481 | exactly. However, if the abstract value is too large, the overflow and 1482 | inexact exceptions are raised and an infinity or maximal finite value is 1483 | returned. 
If the abstract value is too small, the input value is rounded to 1484 | a subnormal number, and the underflow and inexact exceptions are raised if 1485 | the abstract input cannot be represented exactly as a subnormal extended 1486 | double-precision floating-point number. 1487 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 1488 | number of bits as single or double precision, respectively. Otherwise, the 1489 | result is rounded to the full precision of the extended double-precision 1490 | format. 1491 | The input significand must be normalized or smaller. If the input 1492 | significand is not normalized, `zExp' must be 0; in that case, the result 1493 | returned is a subnormal number, and it must not require rounding. The 1494 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 1495 | Floating-Point Arithmetic. 1496 *----------------------------------------------------------------------------*/ 1497 1498 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign, 1499 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 1500 float_status *status) 1501 { 1502 int8_t roundingMode; 1503 flag roundNearestEven, increment, isTiny; 1504 int64_t roundIncrement, roundMask, roundBits; 1505 1506 roundingMode = status->float_rounding_mode; 1507 roundNearestEven = ( roundingMode == float_round_nearest_even ); 1508 if ( roundingPrecision == 80 ) goto precision80; 1509 if ( roundingPrecision == 64 ) { 1510 roundIncrement = LIT64( 0x0000000000000400 ); 1511 roundMask = LIT64( 0x00000000000007FF ); 1512 } 1513 else if ( roundingPrecision == 32 ) { 1514 roundIncrement = LIT64( 0x0000008000000000 ); 1515 roundMask = LIT64( 0x000000FFFFFFFFFF ); 1516 } 1517 else { 1518 goto precision80; 1519 } 1520 zSig0 |= ( zSig1 != 0 ); 1521 switch (roundingMode) { 1522 case float_round_nearest_even: 1523 case float_round_ties_away: 1524 break; 1525 case float_round_to_zero: 1526 roundIncrement = 0; 1527 break; 1528 case float_round_up: 
1529 roundIncrement = zSign ? 0 : roundMask; 1530 break; 1531 case float_round_down: 1532 roundIncrement = zSign ? roundMask : 0; 1533 break; 1534 default: 1535 abort(); 1536 } 1537 roundBits = zSig0 & roundMask; 1538 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 1539 if ( ( 0x7FFE < zExp ) 1540 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 1541 ) { 1542 goto overflow; 1543 } 1544 if ( zExp <= 0 ) { 1545 if (status->flush_to_zero) { 1546 float_raise(float_flag_output_denormal, status); 1547 return packFloatx80(zSign, 0, 0); 1548 } 1549 isTiny = 1550 (status->float_detect_tininess 1551 == float_tininess_before_rounding) 1552 || ( zExp < 0 ) 1553 || ( zSig0 <= zSig0 + roundIncrement ); 1554 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 1555 zExp = 0; 1556 roundBits = zSig0 & roundMask; 1557 if (isTiny && roundBits) { 1558 float_raise(float_flag_underflow, status); 1559 } 1560 if (roundBits) { 1561 status->float_exception_flags |= float_flag_inexact; 1562 } 1563 zSig0 += roundIncrement; 1564 if ( (int64_t) zSig0 < 0 ) zExp = 1; 1565 roundIncrement = roundMask + 1; 1566 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 1567 roundMask |= roundIncrement; 1568 } 1569 zSig0 &= ~ roundMask; 1570 return packFloatx80( zSign, zExp, zSig0 ); 1571 } 1572 } 1573 if (roundBits) { 1574 status->float_exception_flags |= float_flag_inexact; 1575 } 1576 zSig0 += roundIncrement; 1577 if ( zSig0 < roundIncrement ) { 1578 ++zExp; 1579 zSig0 = LIT64( 0x8000000000000000 ); 1580 } 1581 roundIncrement = roundMask + 1; 1582 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 1583 roundMask |= roundIncrement; 1584 } 1585 zSig0 &= ~ roundMask; 1586 if ( zSig0 == 0 ) zExp = 0; 1587 return packFloatx80( zSign, zExp, zSig0 ); 1588 precision80: 1589 switch (roundingMode) { 1590 case float_round_nearest_even: 1591 case float_round_ties_away: 1592 increment = ((int64_t)zSig1 < 0); 1593 break; 1594 case float_round_to_zero: 1595 increment = 0; 1596 break; 1597 
case float_round_up: 1598 increment = !zSign && zSig1; 1599 break; 1600 case float_round_down: 1601 increment = zSign && zSig1; 1602 break; 1603 default: 1604 abort(); 1605 } 1606 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 1607 if ( ( 0x7FFE < zExp ) 1608 || ( ( zExp == 0x7FFE ) 1609 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 1610 && increment 1611 ) 1612 ) { 1613 roundMask = 0; 1614 overflow: 1615 float_raise(float_flag_overflow | float_flag_inexact, status); 1616 if ( ( roundingMode == float_round_to_zero ) 1617 || ( zSign && ( roundingMode == float_round_up ) ) 1618 || ( ! zSign && ( roundingMode == float_round_down ) ) 1619 ) { 1620 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 1621 } 1622 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1623 } 1624 if ( zExp <= 0 ) { 1625 isTiny = 1626 (status->float_detect_tininess 1627 == float_tininess_before_rounding) 1628 || ( zExp < 0 ) 1629 || ! increment 1630 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 1631 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 1632 zExp = 0; 1633 if (isTiny && zSig1) { 1634 float_raise(float_flag_underflow, status); 1635 } 1636 if (zSig1) { 1637 status->float_exception_flags |= float_flag_inexact; 1638 } 1639 switch (roundingMode) { 1640 case float_round_nearest_even: 1641 case float_round_ties_away: 1642 increment = ((int64_t)zSig1 < 0); 1643 break; 1644 case float_round_to_zero: 1645 increment = 0; 1646 break; 1647 case float_round_up: 1648 increment = !zSign && zSig1; 1649 break; 1650 case float_round_down: 1651 increment = zSign && zSig1; 1652 break; 1653 default: 1654 abort(); 1655 } 1656 if ( increment ) { 1657 ++zSig0; 1658 zSig0 &= 1659 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 1660 if ( (int64_t) zSig0 < 0 ) zExp = 1; 1661 } 1662 return packFloatx80( zSign, zExp, zSig0 ); 1663 } 1664 } 1665 if (zSig1) { 1666 status->float_exception_flags |= float_flag_inexact; 1667 } 1668 if ( increment ) { 1669 ++zSig0; 1670 if ( zSig0 == 
0 ) { 1671 ++zExp; 1672 zSig0 = LIT64( 0x8000000000000000 ); 1673 } 1674 else { 1675 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 1676 } 1677 } 1678 else { 1679 if ( zSig0 == 0 ) zExp = 0; 1680 } 1681 return packFloatx80( zSign, zExp, zSig0 ); 1682 1683 } 1684 1685 /*---------------------------------------------------------------------------- 1686 | Takes an abstract floating-point value having sign `zSign', exponent 1687 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 1688 | and returns the proper extended double-precision floating-point value 1689 | corresponding to the abstract input. This routine is just like 1690 | `roundAndPackFloatx80' except that the input significand does not have to be 1691 | normalized. 1692 *----------------------------------------------------------------------------*/ 1693 1694 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 1695 flag zSign, int32_t zExp, 1696 uint64_t zSig0, uint64_t zSig1, 1697 float_status *status) 1698 { 1699 int8_t shiftCount; 1700 1701 if ( zSig0 == 0 ) { 1702 zSig0 = zSig1; 1703 zSig1 = 0; 1704 zExp -= 64; 1705 } 1706 shiftCount = countLeadingZeros64( zSig0 ); 1707 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1708 zExp -= shiftCount; 1709 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 1710 zSig0, zSig1, status); 1711 1712 } 1713 1714 /*---------------------------------------------------------------------------- 1715 | Returns the least-significant 64 fraction bits of the quadruple-precision 1716 | floating-point value `a'. 
1717 *----------------------------------------------------------------------------*/ 1718 1719 static inline uint64_t extractFloat128Frac1( float128 a ) 1720 { 1721 1722 return a.low; 1723 1724 } 1725 1726 /*---------------------------------------------------------------------------- 1727 | Returns the most-significant 48 fraction bits of the quadruple-precision 1728 | floating-point value `a'. 1729 *----------------------------------------------------------------------------*/ 1730 1731 static inline uint64_t extractFloat128Frac0( float128 a ) 1732 { 1733 1734 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 1735 1736 } 1737 1738 /*---------------------------------------------------------------------------- 1739 | Returns the exponent bits of the quadruple-precision floating-point value 1740 | `a'. 1741 *----------------------------------------------------------------------------*/ 1742 1743 static inline int32_t extractFloat128Exp( float128 a ) 1744 { 1745 1746 return ( a.high>>48 ) & 0x7FFF; 1747 1748 } 1749 1750 /*---------------------------------------------------------------------------- 1751 | Returns the sign bit of the quadruple-precision floating-point value `a'. 1752 *----------------------------------------------------------------------------*/ 1753 1754 static inline flag extractFloat128Sign( float128 a ) 1755 { 1756 1757 return a.high>>63; 1758 1759 } 1760 1761 /*---------------------------------------------------------------------------- 1762 | Normalizes the subnormal quadruple-precision floating-point value 1763 | represented by the denormalized significand formed by the concatenation of 1764 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 1765 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 1766 | significand are stored at the location pointed to by `zSig0Ptr', and the 1767 | least significant 64 bits of the normalized significand are stored at the 1768 | location pointed to by `zSig1Ptr'. 
1769 *----------------------------------------------------------------------------*/ 1770 1771 static void 1772 normalizeFloat128Subnormal( 1773 uint64_t aSig0, 1774 uint64_t aSig1, 1775 int32_t *zExpPtr, 1776 uint64_t *zSig0Ptr, 1777 uint64_t *zSig1Ptr 1778 ) 1779 { 1780 int8_t shiftCount; 1781 1782 if ( aSig0 == 0 ) { 1783 shiftCount = countLeadingZeros64( aSig1 ) - 15; 1784 if ( shiftCount < 0 ) { 1785 *zSig0Ptr = aSig1>>( - shiftCount ); 1786 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 1787 } 1788 else { 1789 *zSig0Ptr = aSig1<<shiftCount; 1790 *zSig1Ptr = 0; 1791 } 1792 *zExpPtr = - shiftCount - 63; 1793 } 1794 else { 1795 shiftCount = countLeadingZeros64( aSig0 ) - 15; 1796 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 1797 *zExpPtr = 1 - shiftCount; 1798 } 1799 1800 } 1801 1802 /*---------------------------------------------------------------------------- 1803 | Packs the sign `zSign', the exponent `zExp', and the significand formed 1804 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 1805 | floating-point value, returning the result. After being shifted into the 1806 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 1807 | added together to form the most significant 32 bits of the result. This 1808 | means that any integer portion of `zSig0' will be added into the exponent. 1809 | Since a properly normalized significand will have an integer portion equal 1810 | to 1, the `zExp' input should be 1 less than the desired result exponent 1811 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 1812 | significand. 
1813 *----------------------------------------------------------------------------*/ 1814 1815 static inline float128 1816 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 1817 { 1818 float128 z; 1819 1820 z.low = zSig1; 1821 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 1822 return z; 1823 1824 } 1825 1826 /*---------------------------------------------------------------------------- 1827 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1828 | and extended significand formed by the concatenation of `zSig0', `zSig1', 1829 | and `zSig2', and returns the proper quadruple-precision floating-point value 1830 | corresponding to the abstract input. Ordinarily, the abstract value is 1831 | simply rounded and packed into the quadruple-precision format, with the 1832 | inexact exception raised if the abstract input cannot be represented 1833 | exactly. However, if the abstract value is too large, the overflow and 1834 | inexact exceptions are raised and an infinity or maximal finite value is 1835 | returned. If the abstract value is too small, the input value is rounded to 1836 | a subnormal number, and the underflow and inexact exceptions are raised if 1837 | the abstract input cannot be represented exactly as a subnormal quadruple- 1838 | precision floating-point number. 1839 | The input significand must be normalized or smaller. If the input 1840 | significand is not normalized, `zExp' must be 0; in that case, the result 1841 | returned is a subnormal number, and it must not require rounding. In the 1842 | usual case that the input significand is normalized, `zExp' must be 1 less 1843 | than the ``true'' floating-point exponent. The handling of underflow and 1844 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1845 *----------------------------------------------------------------------------*/ 1846 1847 static float128 roundAndPackFloat128(flag zSign, int32_t zExp, 1848 uint64_t zSig0, uint64_t zSig1, 1849 uint64_t zSig2, float_status *status) 1850 { 1851 int8_t roundingMode; 1852 flag roundNearestEven, increment, isTiny; 1853 1854 roundingMode = status->float_rounding_mode; 1855 roundNearestEven = ( roundingMode == float_round_nearest_even ); 1856 switch (roundingMode) { 1857 case float_round_nearest_even: 1858 case float_round_ties_away: 1859 increment = ((int64_t)zSig2 < 0); 1860 break; 1861 case float_round_to_zero: 1862 increment = 0; 1863 break; 1864 case float_round_up: 1865 increment = !zSign && zSig2; 1866 break; 1867 case float_round_down: 1868 increment = zSign && zSig2; 1869 break; 1870 case float_round_to_odd: 1871 increment = !(zSig1 & 0x1) && zSig2; 1872 break; 1873 default: 1874 abort(); 1875 } 1876 if ( 0x7FFD <= (uint32_t) zExp ) { 1877 if ( ( 0x7FFD < zExp ) 1878 || ( ( zExp == 0x7FFD ) 1879 && eq128( 1880 LIT64( 0x0001FFFFFFFFFFFF ), 1881 LIT64( 0xFFFFFFFFFFFFFFFF ), 1882 zSig0, 1883 zSig1 1884 ) 1885 && increment 1886 ) 1887 ) { 1888 float_raise(float_flag_overflow | float_flag_inexact, status); 1889 if ( ( roundingMode == float_round_to_zero ) 1890 || ( zSign && ( roundingMode == float_round_up ) ) 1891 || ( ! zSign && ( roundingMode == float_round_down ) ) 1892 || (roundingMode == float_round_to_odd) 1893 ) { 1894 return 1895 packFloat128( 1896 zSign, 1897 0x7FFE, 1898 LIT64( 0x0000FFFFFFFFFFFF ), 1899 LIT64( 0xFFFFFFFFFFFFFFFF ) 1900 ); 1901 } 1902 return packFloat128( zSign, 0x7FFF, 0, 0 ); 1903 } 1904 if ( zExp < 0 ) { 1905 if (status->flush_to_zero) { 1906 float_raise(float_flag_output_denormal, status); 1907 return packFloat128(zSign, 0, 0, 0); 1908 } 1909 isTiny = 1910 (status->float_detect_tininess 1911 == float_tininess_before_rounding) 1912 || ( zExp < -1 ) 1913 || ! 
increment 1914 || lt128( 1915 zSig0, 1916 zSig1, 1917 LIT64( 0x0001FFFFFFFFFFFF ), 1918 LIT64( 0xFFFFFFFFFFFFFFFF ) 1919 ); 1920 shift128ExtraRightJamming( 1921 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 1922 zExp = 0; 1923 if (isTiny && zSig2) { 1924 float_raise(float_flag_underflow, status); 1925 } 1926 switch (roundingMode) { 1927 case float_round_nearest_even: 1928 case float_round_ties_away: 1929 increment = ((int64_t)zSig2 < 0); 1930 break; 1931 case float_round_to_zero: 1932 increment = 0; 1933 break; 1934 case float_round_up: 1935 increment = !zSign && zSig2; 1936 break; 1937 case float_round_down: 1938 increment = zSign && zSig2; 1939 break; 1940 case float_round_to_odd: 1941 increment = !(zSig1 & 0x1) && zSig2; 1942 break; 1943 default: 1944 abort(); 1945 } 1946 } 1947 } 1948 if (zSig2) { 1949 status->float_exception_flags |= float_flag_inexact; 1950 } 1951 if ( increment ) { 1952 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1953 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1954 } 1955 else { 1956 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1957 } 1958 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1959 1960 } 1961 1962 /*---------------------------------------------------------------------------- 1963 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1964 | and significand formed by the concatenation of `zSig0' and `zSig1', and 1965 | returns the proper quadruple-precision floating-point value corresponding 1966 | to the abstract input. This routine is just like `roundAndPackFloat128' 1967 | except that the input significand has fewer bits and does not have to be 1968 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1969 | point exponent. 
1970 *----------------------------------------------------------------------------*/ 1971 1972 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 1973 uint64_t zSig0, uint64_t zSig1, 1974 float_status *status) 1975 { 1976 int8_t shiftCount; 1977 uint64_t zSig2; 1978 1979 if ( zSig0 == 0 ) { 1980 zSig0 = zSig1; 1981 zSig1 = 0; 1982 zExp -= 64; 1983 } 1984 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1985 if ( 0 <= shiftCount ) { 1986 zSig2 = 0; 1987 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1988 } 1989 else { 1990 shift128ExtraRightJamming( 1991 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1992 } 1993 zExp -= shiftCount; 1994 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 1995 1996 } 1997 1998 /*---------------------------------------------------------------------------- 1999 | Returns the result of converting the 32-bit two's complement integer `a' 2000 | to the single-precision floating-point format. The conversion is performed 2001 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2002 *----------------------------------------------------------------------------*/ 2003 2004 float32 int32_to_float32(int32_t a, float_status *status) 2005 { 2006 flag zSign; 2007 2008 if ( a == 0 ) return float32_zero; 2009 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 2010 zSign = ( a < 0 ); 2011 return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status); 2012 } 2013 2014 /*---------------------------------------------------------------------------- 2015 | Returns the result of converting the 32-bit two's complement integer `a' 2016 | to the double-precision floating-point format. The conversion is performed 2017 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2018 *----------------------------------------------------------------------------*/ 2019 2020 float64 int32_to_float64(int32_t a, float_status *status) 2021 { 2022 flag zSign; 2023 uint32_t absA; 2024 int8_t shiftCount; 2025 uint64_t zSig; 2026 2027 if ( a == 0 ) return float64_zero; 2028 zSign = ( a < 0 ); 2029 absA = zSign ? - a : a; 2030 shiftCount = countLeadingZeros32( absA ) + 21; 2031 zSig = absA; 2032 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 2033 2034 } 2035 2036 /*---------------------------------------------------------------------------- 2037 | Returns the result of converting the 32-bit two's complement integer `a' 2038 | to the extended double-precision floating-point format. The conversion 2039 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 2040 | Arithmetic. 2041 *----------------------------------------------------------------------------*/ 2042 2043 floatx80 int32_to_floatx80(int32_t a, float_status *status) 2044 { 2045 flag zSign; 2046 uint32_t absA; 2047 int8_t shiftCount; 2048 uint64_t zSig; 2049 2050 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 2051 zSign = ( a < 0 ); 2052 absA = zSign ? - a : a; 2053 shiftCount = countLeadingZeros32( absA ) + 32; 2054 zSig = absA; 2055 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 2056 2057 } 2058 2059 /*---------------------------------------------------------------------------- 2060 | Returns the result of converting the 32-bit two's complement integer `a' to 2061 | the quadruple-precision floating-point format. The conversion is performed 2062 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2063 *----------------------------------------------------------------------------*/ 2064 2065 float128 int32_to_float128(int32_t a, float_status *status) 2066 { 2067 flag zSign; 2068 uint32_t absA; 2069 int8_t shiftCount; 2070 uint64_t zSig0; 2071 2072 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 2073 zSign = ( a < 0 ); 2074 absA = zSign ? - a : a; 2075 shiftCount = countLeadingZeros32( absA ) + 17; 2076 zSig0 = absA; 2077 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 2078 2079 } 2080 2081 /*---------------------------------------------------------------------------- 2082 | Returns the result of converting the 64-bit two's complement integer `a' 2083 | to the single-precision floating-point format. The conversion is performed 2084 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2085 *----------------------------------------------------------------------------*/ 2086 2087 float32 int64_to_float32(int64_t a, float_status *status) 2088 { 2089 flag zSign; 2090 uint64_t absA; 2091 int8_t shiftCount; 2092 2093 if ( a == 0 ) return float32_zero; 2094 zSign = ( a < 0 ); 2095 absA = zSign ? - a : a; 2096 shiftCount = countLeadingZeros64( absA ) - 40; 2097 if ( 0 <= shiftCount ) { 2098 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 2099 } 2100 else { 2101 shiftCount += 7; 2102 if ( shiftCount < 0 ) { 2103 shift64RightJamming( absA, - shiftCount, &absA ); 2104 } 2105 else { 2106 absA <<= shiftCount; 2107 } 2108 return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status); 2109 } 2110 2111 } 2112 2113 /*---------------------------------------------------------------------------- 2114 | Returns the result of converting the 64-bit two's complement integer `a' 2115 | to the double-precision floating-point format. The conversion is performed 2116 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2117 *----------------------------------------------------------------------------*/ 2118 2119 float64 int64_to_float64(int64_t a, float_status *status) 2120 { 2121 flag zSign; 2122 2123 if ( a == 0 ) return float64_zero; 2124 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) { 2125 return packFloat64( 1, 0x43E, 0 ); 2126 } 2127 zSign = ( a < 0 ); 2128 return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status); 2129 } 2130 2131 /*---------------------------------------------------------------------------- 2132 | Returns the result of converting the 64-bit two's complement integer `a' 2133 | to the extended double-precision floating-point format. The conversion 2134 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 2135 | Arithmetic. 2136 *----------------------------------------------------------------------------*/ 2137 2138 floatx80 int64_to_floatx80(int64_t a, float_status *status) 2139 { 2140 flag zSign; 2141 uint64_t absA; 2142 int8_t shiftCount; 2143 2144 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 2145 zSign = ( a < 0 ); 2146 absA = zSign ? - a : a; 2147 shiftCount = countLeadingZeros64( absA ); 2148 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 2149 2150 } 2151 2152 /*---------------------------------------------------------------------------- 2153 | Returns the result of converting the 64-bit two's complement integer `a' to 2154 | the quadruple-precision floating-point format. The conversion is performed 2155 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2156 *----------------------------------------------------------------------------*/ 2157 2158 float128 int64_to_float128(int64_t a, float_status *status) 2159 { 2160 flag zSign; 2161 uint64_t absA; 2162 int8_t shiftCount; 2163 int32_t zExp; 2164 uint64_t zSig0, zSig1; 2165 2166 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 2167 zSign = ( a < 0 ); 2168 absA = zSign ? 
- a : a; 2169 shiftCount = countLeadingZeros64( absA ) + 49; 2170 zExp = 0x406E - shiftCount; 2171 if ( 64 <= shiftCount ) { 2172 zSig1 = 0; 2173 zSig0 = absA; 2174 shiftCount -= 64; 2175 } 2176 else { 2177 zSig1 = absA; 2178 zSig0 = 0; 2179 } 2180 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 2181 return packFloat128( zSign, zExp, zSig0, zSig1 ); 2182 2183 } 2184 2185 /*---------------------------------------------------------------------------- 2186 | Returns the result of converting the 64-bit unsigned integer `a' 2187 | to the single-precision floating-point format. The conversion is performed 2188 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2189 *----------------------------------------------------------------------------*/ 2190 2191 float32 uint64_to_float32(uint64_t a, float_status *status) 2192 { 2193 int shiftcount; 2194 2195 if (a == 0) { 2196 return float32_zero; 2197 } 2198 2199 /* Determine (left) shift needed to put first set bit into bit posn 23 2200 * (since packFloat32() expects the binary point between bits 23 and 22); 2201 * this is the fast case for smallish numbers. 2202 */ 2203 shiftcount = countLeadingZeros64(a) - 40; 2204 if (shiftcount >= 0) { 2205 return packFloat32(0, 0x95 - shiftcount, a << shiftcount); 2206 } 2207 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32() 2208 * expects the binary point between bits 30 and 29, hence the + 7. 2209 */ 2210 shiftcount += 7; 2211 if (shiftcount < 0) { 2212 shift64RightJamming(a, -shiftcount, &a); 2213 } else { 2214 a <<= shiftcount; 2215 } 2216 2217 return roundAndPackFloat32(0, 0x9c - shiftcount, a, status); 2218 } 2219 2220 /*---------------------------------------------------------------------------- 2221 | Returns the result of converting the 64-bit unsigned integer `a' 2222 | to the double-precision floating-point format. The conversion is performed 2223 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2224 *----------------------------------------------------------------------------*/ 2225 2226 float64 uint64_to_float64(uint64_t a, float_status *status) 2227 { 2228 int exp = 0x43C; 2229 int shiftcount; 2230 2231 if (a == 0) { 2232 return float64_zero; 2233 } 2234 2235 shiftcount = countLeadingZeros64(a) - 1; 2236 if (shiftcount < 0) { 2237 shift64RightJamming(a, -shiftcount, &a); 2238 } else { 2239 a <<= shiftcount; 2240 } 2241 return roundAndPackFloat64(0, exp - shiftcount, a, status); 2242 } 2243 2244 /*---------------------------------------------------------------------------- 2245 | Returns the result of converting the 64-bit unsigned integer `a' 2246 | to the quadruple-precision floating-point format. The conversion is performed 2247 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2248 *----------------------------------------------------------------------------*/ 2249 2250 float128 uint64_to_float128(uint64_t a, float_status *status) 2251 { 2252 if (a == 0) { 2253 return float128_zero; 2254 } 2255 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status); 2256 } 2257 2258 /*---------------------------------------------------------------------------- 2259 | Returns the result of converting the single-precision floating-point value 2260 | `a' to the 32-bit two's complement integer format. The conversion is 2261 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2262 | Arithmetic---which means in particular that the conversion is rounded 2263 | according to the current rounding mode. If `a' is a NaN, the largest 2264 | positive integer is returned. Otherwise, if the conversion overflows, the 2265 | largest integer with the same sign as `a' is returned. 
2266 *----------------------------------------------------------------------------*/ 2267 2268 int32_t float32_to_int32(float32 a, float_status *status) 2269 { 2270 flag aSign; 2271 int aExp; 2272 int shiftCount; 2273 uint32_t aSig; 2274 uint64_t aSig64; 2275 2276 a = float32_squash_input_denormal(a, status); 2277 aSig = extractFloat32Frac( a ); 2278 aExp = extractFloat32Exp( a ); 2279 aSign = extractFloat32Sign( a ); 2280 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 2281 if ( aExp ) aSig |= 0x00800000; 2282 shiftCount = 0xAF - aExp; 2283 aSig64 = aSig; 2284 aSig64 <<= 32; 2285 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 2286 return roundAndPackInt32(aSign, aSig64, status); 2287 2288 } 2289 2290 /*---------------------------------------------------------------------------- 2291 | Returns the result of converting the single-precision floating-point value 2292 | `a' to the 32-bit two's complement integer format. The conversion is 2293 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2294 | Arithmetic, except that the conversion is always rounded toward zero. 2295 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2296 | the conversion overflows, the largest integer with the same sign as `a' is 2297 | returned. 2298 *----------------------------------------------------------------------------*/ 2299 2300 int32_t float32_to_int32_round_to_zero(float32 a, float_status *status) 2301 { 2302 flag aSign; 2303 int aExp; 2304 int shiftCount; 2305 uint32_t aSig; 2306 int32_t z; 2307 a = float32_squash_input_denormal(a, status); 2308 2309 aSig = extractFloat32Frac( a ); 2310 aExp = extractFloat32Exp( a ); 2311 aSign = extractFloat32Sign( a ); 2312 shiftCount = aExp - 0x9E; 2313 if ( 0 <= shiftCount ) { 2314 if ( float32_val(a) != 0xCF000000 ) { 2315 float_raise(float_flag_invalid, status); 2316 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 2317 } 2318 return (int32_t) 0x80000000; 2319 } 2320 else if ( aExp <= 0x7E ) { 2321 if (aExp | aSig) { 2322 status->float_exception_flags |= float_flag_inexact; 2323 } 2324 return 0; 2325 } 2326 aSig = ( aSig | 0x00800000 )<<8; 2327 z = aSig>>( - shiftCount ); 2328 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) { 2329 status->float_exception_flags |= float_flag_inexact; 2330 } 2331 if ( aSign ) z = - z; 2332 return z; 2333 2334 } 2335 2336 /*---------------------------------------------------------------------------- 2337 | Returns the result of converting the single-precision floating-point value 2338 | `a' to the 16-bit two's complement integer format. The conversion is 2339 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2340 | Arithmetic, except that the conversion is always rounded toward zero. 2341 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2342 | the conversion overflows, the largest integer with the same sign as `a' is 2343 | returned. 2344 *----------------------------------------------------------------------------*/ 2345 2346 int16_t float32_to_int16_round_to_zero(float32 a, float_status *status) 2347 { 2348 flag aSign; 2349 int aExp; 2350 int shiftCount; 2351 uint32_t aSig; 2352 int32_t z; 2353 2354 aSig = extractFloat32Frac( a ); 2355 aExp = extractFloat32Exp( a ); 2356 aSign = extractFloat32Sign( a ); 2357 shiftCount = aExp - 0x8E; 2358 if ( 0 <= shiftCount ) { 2359 if ( float32_val(a) != 0xC7000000 ) { 2360 float_raise(float_flag_invalid, status); 2361 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 2362 return 0x7FFF; 2363 } 2364 } 2365 return (int32_t) 0xffff8000; 2366 } 2367 else if ( aExp <= 0x7E ) { 2368 if ( aExp | aSig ) { 2369 status->float_exception_flags |= float_flag_inexact; 2370 } 2371 return 0; 2372 } 2373 shiftCount -= 0x10; 2374 aSig = ( aSig | 0x00800000 )<<8; 2375 z = aSig>>( - shiftCount ); 2376 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) { 2377 status->float_exception_flags |= float_flag_inexact; 2378 } 2379 if ( aSign ) { 2380 z = - z; 2381 } 2382 return z; 2383 2384 } 2385 2386 /*---------------------------------------------------------------------------- 2387 | Returns the result of converting the single-precision floating-point value 2388 | `a' to the 64-bit two's complement integer format. The conversion is 2389 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2390 | Arithmetic---which means in particular that the conversion is rounded 2391 | according to the current rounding mode. If `a' is a NaN, the largest 2392 | positive integer is returned. Otherwise, if the conversion overflows, the 2393 | largest integer with the same sign as `a' is returned. 2394 *----------------------------------------------------------------------------*/ 2395 2396 int64_t float32_to_int64(float32 a, float_status *status) 2397 { 2398 flag aSign; 2399 int aExp; 2400 int shiftCount; 2401 uint32_t aSig; 2402 uint64_t aSig64, aSigExtra; 2403 a = float32_squash_input_denormal(a, status); 2404 2405 aSig = extractFloat32Frac( a ); 2406 aExp = extractFloat32Exp( a ); 2407 aSign = extractFloat32Sign( a ); 2408 shiftCount = 0xBE - aExp; 2409 if ( shiftCount < 0 ) { 2410 float_raise(float_flag_invalid, status); 2411 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 2412 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2413 } 2414 return (int64_t) LIT64( 0x8000000000000000 ); 2415 } 2416 if ( aExp ) aSig |= 0x00800000; 2417 aSig64 = aSig; 2418 aSig64 <<= 40; 2419 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 2420 return roundAndPackInt64(aSign, aSig64, aSigExtra, status); 2421 2422 } 2423 2424 /*---------------------------------------------------------------------------- 2425 | Returns the result of converting the single-precision floating-point value 2426 | `a' to the 64-bit unsigned integer format. The conversion is 2427 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2428 | Arithmetic---which means in particular that the conversion is rounded 2429 | according to the current rounding mode. If `a' is a NaN, the largest 2430 | unsigned integer is returned. Otherwise, if the conversion overflows, the 2431 | largest unsigned integer is returned. If the 'a' is negative, the result 2432 | is rounded and zero is returned; values that do not round to zero will 2433 | raise the inexact exception flag. 
2434 *----------------------------------------------------------------------------*/ 2435 2436 uint64_t float32_to_uint64(float32 a, float_status *status) 2437 { 2438 flag aSign; 2439 int aExp; 2440 int shiftCount; 2441 uint32_t aSig; 2442 uint64_t aSig64, aSigExtra; 2443 a = float32_squash_input_denormal(a, status); 2444 2445 aSig = extractFloat32Frac(a); 2446 aExp = extractFloat32Exp(a); 2447 aSign = extractFloat32Sign(a); 2448 if ((aSign) && (aExp > 126)) { 2449 float_raise(float_flag_invalid, status); 2450 if (float32_is_any_nan(a)) { 2451 return LIT64(0xFFFFFFFFFFFFFFFF); 2452 } else { 2453 return 0; 2454 } 2455 } 2456 shiftCount = 0xBE - aExp; 2457 if (aExp) { 2458 aSig |= 0x00800000; 2459 } 2460 if (shiftCount < 0) { 2461 float_raise(float_flag_invalid, status); 2462 return LIT64(0xFFFFFFFFFFFFFFFF); 2463 } 2464 2465 aSig64 = aSig; 2466 aSig64 <<= 40; 2467 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra); 2468 return roundAndPackUint64(aSign, aSig64, aSigExtra, status); 2469 } 2470 2471 /*---------------------------------------------------------------------------- 2472 | Returns the result of converting the single-precision floating-point value 2473 | `a' to the 64-bit unsigned integer format. The conversion is 2474 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2475 | Arithmetic, except that the conversion is always rounded toward zero. If 2476 | `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the 2477 | conversion overflows, the largest unsigned integer is returned. If the 2478 | 'a' is negative, the result is rounded and zero is returned; values that do 2479 | not round to zero will raise the inexact flag. 
2480 *----------------------------------------------------------------------------*/ 2481 2482 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status) 2483 { 2484 signed char current_rounding_mode = status->float_rounding_mode; 2485 set_float_rounding_mode(float_round_to_zero, status); 2486 int64_t v = float32_to_uint64(a, status); 2487 set_float_rounding_mode(current_rounding_mode, status); 2488 return v; 2489 } 2490 2491 /*---------------------------------------------------------------------------- 2492 | Returns the result of converting the single-precision floating-point value 2493 | `a' to the 64-bit two's complement integer format. The conversion is 2494 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2495 | Arithmetic, except that the conversion is always rounded toward zero. If 2496 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 2497 | conversion overflows, the largest integer with the same sign as `a' is 2498 | returned. 2499 *----------------------------------------------------------------------------*/ 2500 2501 int64_t float32_to_int64_round_to_zero(float32 a, float_status *status) 2502 { 2503 flag aSign; 2504 int aExp; 2505 int shiftCount; 2506 uint32_t aSig; 2507 uint64_t aSig64; 2508 int64_t z; 2509 a = float32_squash_input_denormal(a, status); 2510 2511 aSig = extractFloat32Frac( a ); 2512 aExp = extractFloat32Exp( a ); 2513 aSign = extractFloat32Sign( a ); 2514 shiftCount = aExp - 0xBE; 2515 if ( 0 <= shiftCount ) { 2516 if ( float32_val(a) != 0xDF000000 ) { 2517 float_raise(float_flag_invalid, status); 2518 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 2519 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2520 } 2521 } 2522 return (int64_t) LIT64( 0x8000000000000000 ); 2523 } 2524 else if ( aExp <= 0x7E ) { 2525 if (aExp | aSig) { 2526 status->float_exception_flags |= float_flag_inexact; 2527 } 2528 return 0; 2529 } 2530 aSig64 = aSig | 0x00800000; 2531 aSig64 <<= 40; 2532 z = aSig64>>( - shiftCount ); 2533 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) { 2534 status->float_exception_flags |= float_flag_inexact; 2535 } 2536 if ( aSign ) z = - z; 2537 return z; 2538 2539 } 2540 2541 /*---------------------------------------------------------------------------- 2542 | Returns the result of converting the single-precision floating-point value 2543 | `a' to the double-precision floating-point format. The conversion is 2544 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2545 | Arithmetic. 2546 *----------------------------------------------------------------------------*/ 2547 2548 float64 float32_to_float64(float32 a, float_status *status) 2549 { 2550 flag aSign; 2551 int aExp; 2552 uint32_t aSig; 2553 a = float32_squash_input_denormal(a, status); 2554 2555 aSig = extractFloat32Frac( a ); 2556 aExp = extractFloat32Exp( a ); 2557 aSign = extractFloat32Sign( a ); 2558 if ( aExp == 0xFF ) { 2559 if (aSig) { 2560 return commonNaNToFloat64(float32ToCommonNaN(a, status), status); 2561 } 2562 return packFloat64( aSign, 0x7FF, 0 ); 2563 } 2564 if ( aExp == 0 ) { 2565 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 2566 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2567 --aExp; 2568 } 2569 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 ); 2570 2571 } 2572 2573 /*---------------------------------------------------------------------------- 2574 | Returns the result of converting the single-precision floating-point value 2575 | `a' to the extended double-precision floating-point format. 
The conversion 2576 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 2577 | Arithmetic. 2578 *----------------------------------------------------------------------------*/ 2579 2580 floatx80 float32_to_floatx80(float32 a, float_status *status) 2581 { 2582 flag aSign; 2583 int aExp; 2584 uint32_t aSig; 2585 2586 a = float32_squash_input_denormal(a, status); 2587 aSig = extractFloat32Frac( a ); 2588 aExp = extractFloat32Exp( a ); 2589 aSign = extractFloat32Sign( a ); 2590 if ( aExp == 0xFF ) { 2591 if (aSig) { 2592 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 2593 } 2594 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2595 } 2596 if ( aExp == 0 ) { 2597 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 2598 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2599 } 2600 aSig |= 0x00800000; 2601 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 2602 2603 } 2604 2605 /*---------------------------------------------------------------------------- 2606 | Returns the result of converting the single-precision floating-point value 2607 | `a' to the double-precision floating-point format. The conversion is 2608 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2609 | Arithmetic. 
2610 *----------------------------------------------------------------------------*/ 2611 2612 float128 float32_to_float128(float32 a, float_status *status) 2613 { 2614 flag aSign; 2615 int aExp; 2616 uint32_t aSig; 2617 2618 a = float32_squash_input_denormal(a, status); 2619 aSig = extractFloat32Frac( a ); 2620 aExp = extractFloat32Exp( a ); 2621 aSign = extractFloat32Sign( a ); 2622 if ( aExp == 0xFF ) { 2623 if (aSig) { 2624 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 2625 } 2626 return packFloat128( aSign, 0x7FFF, 0, 0 ); 2627 } 2628 if ( aExp == 0 ) { 2629 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 2630 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2631 --aExp; 2632 } 2633 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 2634 2635 } 2636 2637 /*---------------------------------------------------------------------------- 2638 | Rounds the single-precision floating-point value `a' to an integer, and 2639 | returns the result as a single-precision floating-point value. The 2640 | operation is performed according to the IEC/IEEE Standard for Binary 2641 | Floating-Point Arithmetic. 
2642 *----------------------------------------------------------------------------*/ 2643 2644 float32 float32_round_to_int(float32 a, float_status *status) 2645 { 2646 flag aSign; 2647 int aExp; 2648 uint32_t lastBitMask, roundBitsMask; 2649 uint32_t z; 2650 a = float32_squash_input_denormal(a, status); 2651 2652 aExp = extractFloat32Exp( a ); 2653 if ( 0x96 <= aExp ) { 2654 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 2655 return propagateFloat32NaN(a, a, status); 2656 } 2657 return a; 2658 } 2659 if ( aExp <= 0x7E ) { 2660 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a; 2661 status->float_exception_flags |= float_flag_inexact; 2662 aSign = extractFloat32Sign( a ); 2663 switch (status->float_rounding_mode) { 2664 case float_round_nearest_even: 2665 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 2666 return packFloat32( aSign, 0x7F, 0 ); 2667 } 2668 break; 2669 case float_round_ties_away: 2670 if (aExp == 0x7E) { 2671 return packFloat32(aSign, 0x7F, 0); 2672 } 2673 break; 2674 case float_round_down: 2675 return make_float32(aSign ? 0xBF800000 : 0); 2676 case float_round_up: 2677 return make_float32(aSign ? 
0x80000000 : 0x3F800000); 2678 } 2679 return packFloat32( aSign, 0, 0 ); 2680 } 2681 lastBitMask = 1; 2682 lastBitMask <<= 0x96 - aExp; 2683 roundBitsMask = lastBitMask - 1; 2684 z = float32_val(a); 2685 switch (status->float_rounding_mode) { 2686 case float_round_nearest_even: 2687 z += lastBitMask>>1; 2688 if ((z & roundBitsMask) == 0) { 2689 z &= ~lastBitMask; 2690 } 2691 break; 2692 case float_round_ties_away: 2693 z += lastBitMask >> 1; 2694 break; 2695 case float_round_to_zero: 2696 break; 2697 case float_round_up: 2698 if (!extractFloat32Sign(make_float32(z))) { 2699 z += roundBitsMask; 2700 } 2701 break; 2702 case float_round_down: 2703 if (extractFloat32Sign(make_float32(z))) { 2704 z += roundBitsMask; 2705 } 2706 break; 2707 default: 2708 abort(); 2709 } 2710 z &= ~ roundBitsMask; 2711 if (z != float32_val(a)) { 2712 status->float_exception_flags |= float_flag_inexact; 2713 } 2714 return make_float32(z); 2715 2716 } 2717 2718 /*---------------------------------------------------------------------------- 2719 | Returns the remainder of the single-precision floating-point value `a' 2720 | with respect to the corresponding value `b'. The operation is performed 2721 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    flag aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    /* `a' is NaN or infinity: propagate a NaN, else inf rem b is invalid. */
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    /* `b' is NaN or infinity: propagate the NaN, else a rem inf is `a'. */
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        return a;
    }
    /* `b' is zero (invalid) or subnormal (normalize it). */
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    /* `a' is zero (returned unchanged) or subnormal (normalize it). */
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the implicit integer bit explicit in both significands. */
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent gap: the quotient is formed in 32/64-bit math. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* `a' is returned as-is when its exponent is 2+ below `b's. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large exponent gap: reduce in steps of 62 exponent units. */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            /* Lower the quotient estimate by 2, clamping at zero. */
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /*
     * Subtract bSig until the remainder goes negative; alternateASig holds
     * the last non-negative remainder and q counts the subtractions.
     */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    /*
     * Choose between the two candidates: take the non-negative one when
     * their sum is negative, or when the sum is zero and q is odd.
     */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* A negative remainder flips the result sign relative to `a'. */
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}

/*----------------------------------------------------------------------------
| Returns the result of multiplying the single-precision floating-point values
| `a' and `b' then adding 'c', with no intermediate rounding step after the
| multiplication.  The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic 754-2008.
| The flags argument allows the caller to select negation of the
| addend, the intermediate product, or the final result. (The difference
| between this and having the caller do a separate negation is that negating
| externally will flip the sign bit on NaNs.)
*----------------------------------------------------------------------------*/

float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
                       float_status *status)
{
    flag aSign, bSign, cSign, zSign;
    int aExp, bExp, cExp, pExp, zExp, expDiff;
    uint32_t aSig, bSig, cSig;
    flag pInf, pZero, pSign;
    uint64_t pSig64, cSig64, zSig64;
    uint32_t pSig;
    int shiftcount;
    flag signflip, infzero;

    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);
    c = float32_squash_input_denormal(c, status);
    aSig = extractFloat32Frac(a);
    aExp = extractFloat32Exp(a);
    aSign = extractFloat32Sign(a);
    bSig = extractFloat32Frac(b);
    bExp = extractFloat32Exp(b);
    bSign = extractFloat32Sign(b);
    cSig = extractFloat32Frac(c);
    cExp = extractFloat32Exp(c);
    cSign = extractFloat32Sign(c);

    /* True when the product is 0 * infinity (in either operand order). */
    infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
               (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (((aExp == 0xff) && aSig) ||
        ((bExp == 0xff) && bSig) ||
        ((cExp == 0xff) && cSig)) {
        return propagateFloat32MulAddNaN(a, b, c, infzero, status);
    }

    if (infzero) {
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }

    if (flags & float_muladd_negate_c) {
        cSign ^= 1;
    }

    /* signflip is XORed into the sign of every result returned below. */
    signflip = (flags & float_muladd_negate_result) ? 1 : 0;

    /* Work out the sign and type of the product */
    pSign = aSign ^ bSign;
    if (flags & float_muladd_negate_product) {
        pSign ^= 1;
    }
    pInf = (aExp == 0xff) || (bExp == 0xff);
    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);

    if (cExp == 0xff) {
        if (pInf && (pSign ^ cSign)) {
            /* addition of opposite-signed infinities => InvalidOperation */
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        /* Otherwise generate an infinity of the same sign */
        return packFloat32(cSign ^ signflip, 0xff, 0);
    }

    if (pInf) {
        return packFloat32(pSign ^ signflip, 0xff, 0);
    }

    if (pZero) {
        if (cExp == 0) {
            if (cSig == 0) {
                /* Adding two exact zeroes */
                if (pSign == cSign) {
                    zSign = pSign;
                } else if (status->float_rounding_mode == float_round_down) {
                    zSign = 1;
                } else {
                    zSign = 0;
                }
                return packFloat32(zSign ^ signflip, 0, 0);
            }
            /* Exact zero plus a denorm */
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(cSign ^ signflip, 0, 0);
            }
        }
        /* Zero plus something non-zero : just return the something */
        if (flags & float_muladd_halve_result) {
            if (cExp == 0) {
                normalizeFloat32Subnormal(cSig, &cExp, &cSig);
            }
            /* Subtract one to halve, and one again because roundAndPackFloat32
             * wants one less than the true exponent.
             */
            cExp -= 2;
            cSig = (cSig | 0x00800000) << 7;
            return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
        }
        return packFloat32(cSign ^ signflip, cExp, cSig);
    }

    if (aExp == 0) {
        normalizeFloat32Subnormal(aSig, &aExp, &aSig);
    }
    if (bExp == 0) {
        normalizeFloat32Subnormal(bSig, &bExp, &bSig);
    }

    /* Calculate the actual result a * b + c */

    /* Multiply first; this is easy. */
    /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
     * because we want the true exponent, not the "one-less-than"
     * flavour that roundAndPackFloat32() takes.
     */
    pExp = aExp + bExp - 0x7e;
    aSig = (aSig | 0x00800000) << 7;
    bSig = (bSig | 0x00800000) << 8;
    pSig64 = (uint64_t)aSig * bSig;
    /* If bit 62 is clear, shift left once so the leading bit lands there. */
    if ((int64_t)(pSig64 << 1) >= 0) {
        pSig64 <<= 1;
        pExp--;
    }

    zSign = pSign ^ signflip;

    /* Now pSig64 is the significand of the multiply, with the explicit bit in
     * position 62.
     */
    if (cExp == 0) {
        if (!cSig) {
            /* Throw out the special case of c being an exact zero now */
            shift64RightJamming(pSig64, 32, &pSig64);
            pSig = pSig64;
            if (flags & float_muladd_halve_result) {
                pExp--;
            }
            return roundAndPackFloat32(zSign, pExp - 1,
                                       pSig, status);
        }
        normalizeFloat32Subnormal(cSig, &cExp, &cSig);
    }

    /* Place c's significand with its explicit bit in position 62 as well. */
    cSig64 = (uint64_t)cSig << (62 - 23);
    cSig64 |= LIT64(0x4000000000000000);
    expDiff = pExp - cExp;

    if (pSign == cSign) {
        /* Addition */
        if (expDiff > 0) {
            /* scale c to match p */
            shift64RightJamming(cSig64, expDiff, &cSig64);
            zExp = pExp;
        } else if (expDiff < 0) {
            /* scale p to match c */
            shift64RightJamming(pSig64, -expDiff, &pSig64);
            zExp = cExp;
        } else {
            /* no scaling needed */
            zExp = cExp;
        }
        /* Add significands and make sure explicit bit ends up in posn 62 */
        zSig64 = pSig64 + cSig64;
        if ((int64_t)zSig64 < 0) {
            shift64RightJamming(zSig64, 1, &zSig64);
        } else {
            zExp--;
        }
    } else {
        /* Subtraction */
        if (expDiff > 0) {
            shift64RightJamming(cSig64, expDiff, &cSig64);
            zSig64 = pSig64 - cSig64;
            zExp = pExp;
        } else if (expDiff < 0) {
            shift64RightJamming(pSig64, -expDiff, &pSig64);
            zSig64 = cSig64 - pSig64;
            zExp = cExp;
            /* c dominates, so the result takes the opposite sign */
            zSign ^= 1;
        } else {
            zExp = pExp;
            if (cSig64 < pSig64) {
                zSig64 = pSig64 - cSig64;
            } else if (pSig64 < cSig64) {
                zSig64 = cSig64 - pSig64;
                zSign ^= 1;
            } else {
                /* Exact zero */
                zSign = signflip;
                if (status->float_rounding_mode == float_round_down) {
                    zSign ^= 1;
                }
                return packFloat32(zSign, 0, 0);
            }
        }
        --zExp;
        /* Normalize to put the explicit bit back into bit 62. */
        shiftcount = countLeadingZeros64(zSig64) - 1;
        zSig64 <<= shiftcount;
        zExp -= shiftcount;
    }
    if (flags & float_muladd_halve_result) {
        zExp--;
    }

    /* Narrow to 32 bits (shift64RightJamming) before rounding and packing. */
    shift64RightJamming(zSig64, 32, &zSig64);
    return roundAndPackFloat32(zSign, zExp, zSig64, status);
}


/*----------------------------------------------------------------------------
| Returns the square root of the single-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float32 float32_sqrt(float32 a, float_status *status)
{
    flag aSign;
    int aExp, zExp;
    uint32_t aSig, zSig;
    uint64_t rem, term;
    a = float32_squash_input_denormal(a, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        /* +infinity is returned as is; -infinity is invalid */
        if ( ! aSign ) return a;
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( aSign ) {
        /* -0 is returned as is; any other negative input is invalid */
        if ( ( aExp | aSig ) == 0 ) return a;
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return float32_zero;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
    aSig = ( aSig | 0x00800000 )<<8;
    zSig = estimateSqrt32( aExp, aSig ) + 2;
    /* Only when the low 7 bits of the estimate are <= 5 is the estimate
     * corrected using an exactly computed remainder.
     */
    if ( ( zSig & 0x7F ) <= 5 ) {
        if ( zSig < 2 ) {
            /* the "+ 2" above wrapped around */
            zSig = 0x7FFFFFFF;
            goto roundAndPack;
        }
        aSig >>= aExp & 1;
        term = ( (uint64_t) zSig ) * zSig;
        rem = ( ( (uint64_t) aSig )<<32 ) - term;
        /* Step zSig down until zSig*zSig no longer exceeds the operand. */
        while ( (int64_t) rem < 0 ) {
            --zSig;
            rem += ( ( (uint64_t) zSig )<<1 ) | 1;
        }
        /* sticky bit: a non-zero remainder marks the result inexact */
        zSig |= ( rem != 0 );
    }
    shift32RightJamming( zSig, 1, &zSig );
 roundAndPack:
    return roundAndPackFloat32(0, zExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'. The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1. -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2. -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
3119 *----------------------------------------------------------------------------*/ 3120 3121 static const float64 float32_exp2_coefficients[15] = 3122 { 3123 const_float64( 0x3ff0000000000000ll ), /* 1 */ 3124 const_float64( 0x3fe0000000000000ll ), /* 2 */ 3125 const_float64( 0x3fc5555555555555ll ), /* 3 */ 3126 const_float64( 0x3fa5555555555555ll ), /* 4 */ 3127 const_float64( 0x3f81111111111111ll ), /* 5 */ 3128 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 3129 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 3130 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 3131 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 3132 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 3133 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 3134 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 3135 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 3136 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 3137 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 3138 }; 3139 3140 float32 float32_exp2(float32 a, float_status *status) 3141 { 3142 flag aSign; 3143 int aExp; 3144 uint32_t aSig; 3145 float64 r, x, xn; 3146 int i; 3147 a = float32_squash_input_denormal(a, status); 3148 3149 aSig = extractFloat32Frac( a ); 3150 aExp = extractFloat32Exp( a ); 3151 aSign = extractFloat32Sign( a ); 3152 3153 if ( aExp == 0xFF) { 3154 if (aSig) { 3155 return propagateFloat32NaN(a, float32_zero, status); 3156 } 3157 return (aSign) ? 
float32_zero : a; 3158 } 3159 if (aExp == 0) { 3160 if (aSig == 0) return float32_one; 3161 } 3162 3163 float_raise(float_flag_inexact, status); 3164 3165 /* ******************************* */ 3166 /* using float64 for approximation */ 3167 /* ******************************* */ 3168 x = float32_to_float64(a, status); 3169 x = float64_mul(x, float64_ln2, status); 3170 3171 xn = x; 3172 r = float64_one; 3173 for (i = 0 ; i < 15 ; i++) { 3174 float64 f; 3175 3176 f = float64_mul(xn, float32_exp2_coefficients[i], status); 3177 r = float64_add(r, f, status); 3178 3179 xn = float64_mul(xn, x, status); 3180 } 3181 3182 return float64_to_float32(r, status); 3183 } 3184 3185 /*---------------------------------------------------------------------------- 3186 | Returns the binary log of the single-precision floating-point value `a'. 3187 | The operation is performed according to the IEC/IEEE Standard for Binary 3188 | Floating-Point Arithmetic. 3189 *----------------------------------------------------------------------------*/ 3190 float32 float32_log2(float32 a, float_status *status) 3191 { 3192 flag aSign, zSign; 3193 int aExp; 3194 uint32_t aSig, zSig, i; 3195 3196 a = float32_squash_input_denormal(a, status); 3197 aSig = extractFloat32Frac( a ); 3198 aExp = extractFloat32Exp( a ); 3199 aSign = extractFloat32Sign( a ); 3200 3201 if ( aExp == 0 ) { 3202 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 3203 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3204 } 3205 if ( aSign ) { 3206 float_raise(float_flag_invalid, status); 3207 return float32_default_nan(status); 3208 } 3209 if ( aExp == 0xFF ) { 3210 if (aSig) { 3211 return propagateFloat32NaN(a, float32_zero, status); 3212 } 3213 return a; 3214 } 3215 3216 aExp -= 0x7F; 3217 aSig |= 0x00800000; 3218 zSign = aExp < 0; 3219 zSig = aExp << 23; 3220 3221 for (i = 1 << 22; i > 0; i >>= 1) { 3222 aSig = ( (uint64_t)aSig * aSig ) >> 23; 3223 if ( aSig & 0x01000000 ) { 3224 aSig >>= 1; 3225 zSig |= i; 3226 } 3227 } 3228 
3229 if ( zSign ) 3230 zSig = -zSig; 3231 3232 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 3233 } 3234 3235 /*---------------------------------------------------------------------------- 3236 | Returns 1 if the single-precision floating-point value `a' is equal to 3237 | the corresponding value `b', and 0 otherwise. The invalid exception is 3238 | raised if either operand is a NaN. Otherwise, the comparison is performed 3239 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3240 *----------------------------------------------------------------------------*/ 3241 3242 int float32_eq(float32 a, float32 b, float_status *status) 3243 { 3244 uint32_t av, bv; 3245 a = float32_squash_input_denormal(a, status); 3246 b = float32_squash_input_denormal(b, status); 3247 3248 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3249 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3250 ) { 3251 float_raise(float_flag_invalid, status); 3252 return 0; 3253 } 3254 av = float32_val(a); 3255 bv = float32_val(b); 3256 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3257 } 3258 3259 /*---------------------------------------------------------------------------- 3260 | Returns 1 if the single-precision floating-point value `a' is less than 3261 | or equal to the corresponding value `b', and 0 otherwise. The invalid 3262 | exception is raised if either operand is a NaN. The comparison is performed 3263 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3264 *----------------------------------------------------------------------------*/ 3265 3266 int float32_le(float32 a, float32 b, float_status *status) 3267 { 3268 flag aSign, bSign; 3269 uint32_t av, bv; 3270 a = float32_squash_input_denormal(a, status); 3271 b = float32_squash_input_denormal(b, status); 3272 3273 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3274 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3275 ) { 3276 float_raise(float_flag_invalid, status); 3277 return 0; 3278 } 3279 aSign = extractFloat32Sign( a ); 3280 bSign = extractFloat32Sign( b ); 3281 av = float32_val(a); 3282 bv = float32_val(b); 3283 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3284 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3285 3286 } 3287 3288 /*---------------------------------------------------------------------------- 3289 | Returns 1 if the single-precision floating-point value `a' is less than 3290 | the corresponding value `b', and 0 otherwise. The invalid exception is 3291 | raised if either operand is a NaN. The comparison is performed according 3292 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3293 *----------------------------------------------------------------------------*/ 3294 3295 int float32_lt(float32 a, float32 b, float_status *status) 3296 { 3297 flag aSign, bSign; 3298 uint32_t av, bv; 3299 a = float32_squash_input_denormal(a, status); 3300 b = float32_squash_input_denormal(b, status); 3301 3302 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3303 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3304 ) { 3305 float_raise(float_flag_invalid, status); 3306 return 0; 3307 } 3308 aSign = extractFloat32Sign( a ); 3309 bSign = extractFloat32Sign( b ); 3310 av = float32_val(a); 3311 bv = float32_val(b); 3312 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3313 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3314 3315 } 3316 3317 /*---------------------------------------------------------------------------- 3318 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3319 | be compared, and 0 otherwise. The invalid exception is raised if either 3320 | operand is a NaN. The comparison is performed according to the IEC/IEEE 3321 | Standard for Binary Floating-Point Arithmetic. 3322 *----------------------------------------------------------------------------*/ 3323 3324 int float32_unordered(float32 a, float32 b, float_status *status) 3325 { 3326 a = float32_squash_input_denormal(a, status); 3327 b = float32_squash_input_denormal(b, status); 3328 3329 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3330 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3331 ) { 3332 float_raise(float_flag_invalid, status); 3333 return 1; 3334 } 3335 return 0; 3336 } 3337 3338 /*---------------------------------------------------------------------------- 3339 | Returns 1 if the single-precision floating-point value `a' is equal to 3340 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3341 | exception. 
The comparison is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int float32_eq_quiet(float32 a, float32 b, float_status *status)
{
    uint32_t a_bits, b_bits;

    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    if ((extractFloat32Exp(a) == 0xFF && extractFloat32Frac(a))
        || (extractFloat32Exp(b) == 0xFF && extractFloat32Frac(b))) {
        /* Unordered: only a signaling NaN raises the invalid flag. */
        if (float32_is_signaling_nan(a, status)) {
            float_raise(float_flag_invalid, status);
        } else if (float32_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return 0;
    }

    a_bits = float32_val(a);
    b_bits = float32_val(b);
    if (a_bits == b_bits) {
        return 1;
    }
    /* Different encodings are still equal when both are zeroes. */
    return (uint32_t)((a_bits | b_bits) << 1) == 0;
}

/*----------------------------------------------------------------------------
| Returns 1 if the single-precision floating-point value `a' is less than or
| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
| cause an exception. Otherwise, the comparison is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3368 *----------------------------------------------------------------------------*/ 3369 3370 int float32_le_quiet(float32 a, float32 b, float_status *status) 3371 { 3372 flag aSign, bSign; 3373 uint32_t av, bv; 3374 a = float32_squash_input_denormal(a, status); 3375 b = float32_squash_input_denormal(b, status); 3376 3377 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3378 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3379 ) { 3380 if (float32_is_signaling_nan(a, status) 3381 || float32_is_signaling_nan(b, status)) { 3382 float_raise(float_flag_invalid, status); 3383 } 3384 return 0; 3385 } 3386 aSign = extractFloat32Sign( a ); 3387 bSign = extractFloat32Sign( b ); 3388 av = float32_val(a); 3389 bv = float32_val(b); 3390 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3391 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3392 3393 } 3394 3395 /*---------------------------------------------------------------------------- 3396 | Returns 1 if the single-precision floating-point value `a' is less than 3397 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3398 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 3399 | Standard for Binary Floating-Point Arithmetic. 
3400 *----------------------------------------------------------------------------*/ 3401 3402 int float32_lt_quiet(float32 a, float32 b, float_status *status) 3403 { 3404 flag aSign, bSign; 3405 uint32_t av, bv; 3406 a = float32_squash_input_denormal(a, status); 3407 b = float32_squash_input_denormal(b, status); 3408 3409 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3410 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3411 ) { 3412 if (float32_is_signaling_nan(a, status) 3413 || float32_is_signaling_nan(b, status)) { 3414 float_raise(float_flag_invalid, status); 3415 } 3416 return 0; 3417 } 3418 aSign = extractFloat32Sign( a ); 3419 bSign = extractFloat32Sign( b ); 3420 av = float32_val(a); 3421 bv = float32_val(b); 3422 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3423 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3424 3425 } 3426 3427 /*---------------------------------------------------------------------------- 3428 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3429 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 3430 | comparison is performed according to the IEC/IEEE Standard for Binary 3431 | Floating-Point Arithmetic. 
3432 *----------------------------------------------------------------------------*/ 3433 3434 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 3435 { 3436 a = float32_squash_input_denormal(a, status); 3437 b = float32_squash_input_denormal(b, status); 3438 3439 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3440 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3441 ) { 3442 if (float32_is_signaling_nan(a, status) 3443 || float32_is_signaling_nan(b, status)) { 3444 float_raise(float_flag_invalid, status); 3445 } 3446 return 1; 3447 } 3448 return 0; 3449 } 3450 3451 /*---------------------------------------------------------------------------- 3452 | Returns the result of converting the double-precision floating-point value 3453 | `a' to the 32-bit two's complement integer format. The conversion is 3454 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3455 | Arithmetic---which means in particular that the conversion is rounded 3456 | according to the current rounding mode. If `a' is a NaN, the largest 3457 | positive integer is returned. Otherwise, if the conversion overflows, the 3458 | largest integer with the same sign as `a' is returned. 
3459 *----------------------------------------------------------------------------*/ 3460 3461 int32_t float64_to_int32(float64 a, float_status *status) 3462 { 3463 flag aSign; 3464 int aExp; 3465 int shiftCount; 3466 uint64_t aSig; 3467 a = float64_squash_input_denormal(a, status); 3468 3469 aSig = extractFloat64Frac( a ); 3470 aExp = extractFloat64Exp( a ); 3471 aSign = extractFloat64Sign( a ); 3472 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 3473 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3474 shiftCount = 0x42C - aExp; 3475 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 3476 return roundAndPackInt32(aSign, aSig, status); 3477 3478 } 3479 3480 /*---------------------------------------------------------------------------- 3481 | Returns the result of converting the double-precision floating-point value 3482 | `a' to the 32-bit two's complement integer format. The conversion is 3483 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3484 | Arithmetic, except that the conversion is always rounded toward zero. 3485 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3486 | the conversion overflows, the largest integer with the same sign as `a' is 3487 | returned. 
3488 *----------------------------------------------------------------------------*/ 3489 3490 int32_t float64_to_int32_round_to_zero(float64 a, float_status *status) 3491 { 3492 flag aSign; 3493 int aExp; 3494 int shiftCount; 3495 uint64_t aSig, savedASig; 3496 int32_t z; 3497 a = float64_squash_input_denormal(a, status); 3498 3499 aSig = extractFloat64Frac( a ); 3500 aExp = extractFloat64Exp( a ); 3501 aSign = extractFloat64Sign( a ); 3502 if ( 0x41E < aExp ) { 3503 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 3504 goto invalid; 3505 } 3506 else if ( aExp < 0x3FF ) { 3507 if (aExp || aSig) { 3508 status->float_exception_flags |= float_flag_inexact; 3509 } 3510 return 0; 3511 } 3512 aSig |= LIT64( 0x0010000000000000 ); 3513 shiftCount = 0x433 - aExp; 3514 savedASig = aSig; 3515 aSig >>= shiftCount; 3516 z = aSig; 3517 if ( aSign ) z = - z; 3518 if ( ( z < 0 ) ^ aSign ) { 3519 invalid: 3520 float_raise(float_flag_invalid, status); 3521 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 3522 } 3523 if ( ( aSig<<shiftCount ) != savedASig ) { 3524 status->float_exception_flags |= float_flag_inexact; 3525 } 3526 return z; 3527 3528 } 3529 3530 /*---------------------------------------------------------------------------- 3531 | Returns the result of converting the double-precision floating-point value 3532 | `a' to the 16-bit two's complement integer format. The conversion is 3533 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3534 | Arithmetic, except that the conversion is always rounded toward zero. 3535 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3536 | the conversion overflows, the largest integer with the same sign as `a' is 3537 | returned. 
3538 *----------------------------------------------------------------------------*/ 3539 3540 int16_t float64_to_int16_round_to_zero(float64 a, float_status *status) 3541 { 3542 flag aSign; 3543 int aExp; 3544 int shiftCount; 3545 uint64_t aSig, savedASig; 3546 int32_t z; 3547 3548 aSig = extractFloat64Frac( a ); 3549 aExp = extractFloat64Exp( a ); 3550 aSign = extractFloat64Sign( a ); 3551 if ( 0x40E < aExp ) { 3552 if ( ( aExp == 0x7FF ) && aSig ) { 3553 aSign = 0; 3554 } 3555 goto invalid; 3556 } 3557 else if ( aExp < 0x3FF ) { 3558 if ( aExp || aSig ) { 3559 status->float_exception_flags |= float_flag_inexact; 3560 } 3561 return 0; 3562 } 3563 aSig |= LIT64( 0x0010000000000000 ); 3564 shiftCount = 0x433 - aExp; 3565 savedASig = aSig; 3566 aSig >>= shiftCount; 3567 z = aSig; 3568 if ( aSign ) { 3569 z = - z; 3570 } 3571 if ( ( (int16_t)z < 0 ) ^ aSign ) { 3572 invalid: 3573 float_raise(float_flag_invalid, status); 3574 return aSign ? (int32_t) 0xffff8000 : 0x7FFF; 3575 } 3576 if ( ( aSig<<shiftCount ) != savedASig ) { 3577 status->float_exception_flags |= float_flag_inexact; 3578 } 3579 return z; 3580 } 3581 3582 /*---------------------------------------------------------------------------- 3583 | Returns the result of converting the double-precision floating-point value 3584 | `a' to the 64-bit two's complement integer format. The conversion is 3585 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3586 | Arithmetic---which means in particular that the conversion is rounded 3587 | according to the current rounding mode. If `a' is a NaN, the largest 3588 | positive integer is returned. Otherwise, if the conversion overflows, the 3589 | largest integer with the same sign as `a' is returned. 
3590 *----------------------------------------------------------------------------*/ 3591 3592 int64_t float64_to_int64(float64 a, float_status *status) 3593 { 3594 flag aSign; 3595 int aExp; 3596 int shiftCount; 3597 uint64_t aSig, aSigExtra; 3598 a = float64_squash_input_denormal(a, status); 3599 3600 aSig = extractFloat64Frac( a ); 3601 aExp = extractFloat64Exp( a ); 3602 aSign = extractFloat64Sign( a ); 3603 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3604 shiftCount = 0x433 - aExp; 3605 if ( shiftCount <= 0 ) { 3606 if ( 0x43E < aExp ) { 3607 float_raise(float_flag_invalid, status); 3608 if ( ! aSign 3609 || ( ( aExp == 0x7FF ) 3610 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 3611 ) { 3612 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3613 } 3614 return (int64_t) LIT64( 0x8000000000000000 ); 3615 } 3616 aSigExtra = 0; 3617 aSig <<= - shiftCount; 3618 } 3619 else { 3620 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3621 } 3622 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 3623 3624 } 3625 3626 /*---------------------------------------------------------------------------- 3627 | Returns the result of converting the double-precision floating-point value 3628 | `a' to the 64-bit two's complement integer format. The conversion is 3629 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3630 | Arithmetic, except that the conversion is always rounded toward zero. 3631 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3632 | the conversion overflows, the largest integer with the same sign as `a' is 3633 | returned. 
3634 *----------------------------------------------------------------------------*/ 3635 3636 int64_t float64_to_int64_round_to_zero(float64 a, float_status *status) 3637 { 3638 flag aSign; 3639 int aExp; 3640 int shiftCount; 3641 uint64_t aSig; 3642 int64_t z; 3643 a = float64_squash_input_denormal(a, status); 3644 3645 aSig = extractFloat64Frac( a ); 3646 aExp = extractFloat64Exp( a ); 3647 aSign = extractFloat64Sign( a ); 3648 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3649 shiftCount = aExp - 0x433; 3650 if ( 0 <= shiftCount ) { 3651 if ( 0x43E <= aExp ) { 3652 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) { 3653 float_raise(float_flag_invalid, status); 3654 if ( ! aSign 3655 || ( ( aExp == 0x7FF ) 3656 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 3657 ) { 3658 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3659 } 3660 } 3661 return (int64_t) LIT64( 0x8000000000000000 ); 3662 } 3663 z = aSig<<shiftCount; 3664 } 3665 else { 3666 if ( aExp < 0x3FE ) { 3667 if (aExp | aSig) { 3668 status->float_exception_flags |= float_flag_inexact; 3669 } 3670 return 0; 3671 } 3672 z = aSig>>( - shiftCount ); 3673 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 3674 status->float_exception_flags |= float_flag_inexact; 3675 } 3676 } 3677 if ( aSign ) z = - z; 3678 return z; 3679 3680 } 3681 3682 /*---------------------------------------------------------------------------- 3683 | Returns the result of converting the double-precision floating-point value 3684 | `a' to the single-precision floating-point format. The conversion is 3685 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3686 | Arithmetic. 
3687 *----------------------------------------------------------------------------*/ 3688 3689 float32 float64_to_float32(float64 a, float_status *status) 3690 { 3691 flag aSign; 3692 int aExp; 3693 uint64_t aSig; 3694 uint32_t zSig; 3695 a = float64_squash_input_denormal(a, status); 3696 3697 aSig = extractFloat64Frac( a ); 3698 aExp = extractFloat64Exp( a ); 3699 aSign = extractFloat64Sign( a ); 3700 if ( aExp == 0x7FF ) { 3701 if (aSig) { 3702 return commonNaNToFloat32(float64ToCommonNaN(a, status), status); 3703 } 3704 return packFloat32( aSign, 0xFF, 0 ); 3705 } 3706 shift64RightJamming( aSig, 22, &aSig ); 3707 zSig = aSig; 3708 if ( aExp || zSig ) { 3709 zSig |= 0x40000000; 3710 aExp -= 0x381; 3711 } 3712 return roundAndPackFloat32(aSign, aExp, zSig, status); 3713 3714 } 3715 3716 3717 /*---------------------------------------------------------------------------- 3718 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3719 | half-precision floating-point value, returning the result. After being 3720 | shifted into the proper positions, the three fields are simply added 3721 | together to form the result. This means that any integer portion of `zSig' 3722 | will be added into the exponent. Since a properly normalized significand 3723 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3724 | than the desired result exponent whenever `zSig' is a complete, normalized 3725 | significand. 
3726 *----------------------------------------------------------------------------*/ 3727 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig) 3728 { 3729 return make_float16( 3730 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig); 3731 } 3732 3733 /*---------------------------------------------------------------------------- 3734 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3735 | and significand `zSig', and returns the proper half-precision floating- 3736 | point value corresponding to the abstract input. Ordinarily, the abstract 3737 | value is simply rounded and packed into the half-precision format, with 3738 | the inexact exception raised if the abstract input cannot be represented 3739 | exactly. However, if the abstract value is too large, the overflow and 3740 | inexact exceptions are raised and an infinity or maximal finite value is 3741 | returned. If the abstract value is too small, the input value is rounded to 3742 | a subnormal number, and the underflow and inexact exceptions are raised if 3743 | the abstract input cannot be represented exactly as a subnormal half- 3744 | precision floating-point number. 3745 | The `ieee' flag indicates whether to use IEEE standard half precision, or 3746 | ARM-style "alternative representation", which omits the NaN and Inf 3747 | encodings in order to raise the maximum representable exponent by one. 3748 | The input significand `zSig' has its binary point between bits 22 3749 | and 23, which is 13 bits to the left of the usual location. This shifted 3750 | significand must be normalized or smaller. If `zSig' is not normalized, 3751 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3752 | and it must not require rounding. In the usual case that `zSig' is 3753 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 
3754 | Note the slightly odd position of the binary point in zSig compared with the 3755 | other roundAndPackFloat functions. This should probably be fixed if we 3756 | need to implement more float16 routines than just conversion. 3757 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3758 | Binary Floating-Point Arithmetic. 3759 *----------------------------------------------------------------------------*/ 3760 3761 static float16 roundAndPackFloat16(flag zSign, int zExp, 3762 uint32_t zSig, flag ieee, 3763 float_status *status) 3764 { 3765 int maxexp = ieee ? 29 : 30; 3766 uint32_t mask; 3767 uint32_t increment; 3768 bool rounding_bumps_exp; 3769 bool is_tiny = false; 3770 3771 /* Calculate the mask of bits of the mantissa which are not 3772 * representable in half-precision and will be lost. 3773 */ 3774 if (zExp < 1) { 3775 /* Will be denormal in halfprec */ 3776 mask = 0x00ffffff; 3777 if (zExp >= -11) { 3778 mask >>= 11 + zExp; 3779 } 3780 } else { 3781 /* Normal number in halfprec */ 3782 mask = 0x00001fff; 3783 } 3784 3785 switch (status->float_rounding_mode) { 3786 case float_round_nearest_even: 3787 increment = (mask + 1) >> 1; 3788 if ((zSig & mask) == increment) { 3789 increment = zSig & (increment << 1); 3790 } 3791 break; 3792 case float_round_ties_away: 3793 increment = (mask + 1) >> 1; 3794 break; 3795 case float_round_up: 3796 increment = zSign ? 0 : mask; 3797 break; 3798 case float_round_down: 3799 increment = zSign ? 
mask : 0; 3800 break; 3801 default: /* round_to_zero */ 3802 increment = 0; 3803 break; 3804 } 3805 3806 rounding_bumps_exp = (zSig + increment >= 0x01000000); 3807 3808 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) { 3809 if (ieee) { 3810 float_raise(float_flag_overflow | float_flag_inexact, status); 3811 return packFloat16(zSign, 0x1f, 0); 3812 } else { 3813 float_raise(float_flag_invalid, status); 3814 return packFloat16(zSign, 0x1f, 0x3ff); 3815 } 3816 } 3817 3818 if (zExp < 0) { 3819 /* Note that flush-to-zero does not affect half-precision results */ 3820 is_tiny = 3821 (status->float_detect_tininess == float_tininess_before_rounding) 3822 || (zExp < -1) 3823 || (!rounding_bumps_exp); 3824 } 3825 if (zSig & mask) { 3826 float_raise(float_flag_inexact, status); 3827 if (is_tiny) { 3828 float_raise(float_flag_underflow, status); 3829 } 3830 } 3831 3832 zSig += increment; 3833 if (rounding_bumps_exp) { 3834 zSig >>= 1; 3835 zExp++; 3836 } 3837 3838 if (zExp < -10) { 3839 return packFloat16(zSign, 0, 0); 3840 } 3841 if (zExp < 0) { 3842 zSig >>= -zExp; 3843 zExp = 0; 3844 } 3845 return packFloat16(zSign, zExp, zSig >> 13); 3846 } 3847 3848 /*---------------------------------------------------------------------------- 3849 | If `a' is denormal and we are in flush-to-zero mode then set the 3850 | input-denormal exception and return zero. Otherwise just return the value. 
3851 *----------------------------------------------------------------------------*/ 3852 float16 float16_squash_input_denormal(float16 a, float_status *status) 3853 { 3854 if (status->flush_inputs_to_zero) { 3855 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) { 3856 float_raise(float_flag_input_denormal, status); 3857 return make_float16(float16_val(a) & 0x8000); 3858 } 3859 } 3860 return a; 3861 } 3862 3863 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr, 3864 uint32_t *zSigPtr) 3865 { 3866 int8_t shiftCount = countLeadingZeros32(aSig) - 21; 3867 *zSigPtr = aSig << shiftCount; 3868 *zExpPtr = 1 - shiftCount; 3869 } 3870 3871 /* Half precision floats come in two formats: standard IEEE and "ARM" format. 3872 The latter gains extra exponent range by omitting the NaN/Inf encodings. */ 3873 3874 float32 float16_to_float32(float16 a, flag ieee, float_status *status) 3875 { 3876 flag aSign; 3877 int aExp; 3878 uint32_t aSig; 3879 3880 aSign = extractFloat16Sign(a); 3881 aExp = extractFloat16Exp(a); 3882 aSig = extractFloat16Frac(a); 3883 3884 if (aExp == 0x1f && ieee) { 3885 if (aSig) { 3886 return commonNaNToFloat32(float16ToCommonNaN(a, status), status); 3887 } 3888 return packFloat32(aSign, 0xff, 0); 3889 } 3890 if (aExp == 0) { 3891 if (aSig == 0) { 3892 return packFloat32(aSign, 0, 0); 3893 } 3894 3895 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3896 aExp--; 3897 } 3898 return packFloat32( aSign, aExp + 0x70, aSig << 13); 3899 } 3900 3901 float16 float32_to_float16(float32 a, flag ieee, float_status *status) 3902 { 3903 flag aSign; 3904 int aExp; 3905 uint32_t aSig; 3906 3907 a = float32_squash_input_denormal(a, status); 3908 3909 aSig = extractFloat32Frac( a ); 3910 aExp = extractFloat32Exp( a ); 3911 aSign = extractFloat32Sign( a ); 3912 if ( aExp == 0xFF ) { 3913 if (aSig) { 3914 /* Input is a NaN */ 3915 if (!ieee) { 3916 float_raise(float_flag_invalid, status); 3917 return packFloat16(aSign, 0, 0); 3918 } 3919 return 
commonNaNToFloat16( 3920 float32ToCommonNaN(a, status), status); 3921 } 3922 /* Infinity */ 3923 if (!ieee) { 3924 float_raise(float_flag_invalid, status); 3925 return packFloat16(aSign, 0x1f, 0x3ff); 3926 } 3927 return packFloat16(aSign, 0x1f, 0); 3928 } 3929 if (aExp == 0 && aSig == 0) { 3930 return packFloat16(aSign, 0, 0); 3931 } 3932 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 3933 * even if the input is denormal; however this is harmless because 3934 * the largest possible single-precision denormal is still smaller 3935 * than the smallest representable half-precision denormal, and so we 3936 * will end up ignoring aSig and returning via the "always return zero" 3937 * codepath. 3938 */ 3939 aSig |= 0x00800000; 3940 aExp -= 0x71; 3941 3942 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status); 3943 } 3944 3945 float64 float16_to_float64(float16 a, flag ieee, float_status *status) 3946 { 3947 flag aSign; 3948 int aExp; 3949 uint32_t aSig; 3950 3951 aSign = extractFloat16Sign(a); 3952 aExp = extractFloat16Exp(a); 3953 aSig = extractFloat16Frac(a); 3954 3955 if (aExp == 0x1f && ieee) { 3956 if (aSig) { 3957 return commonNaNToFloat64( 3958 float16ToCommonNaN(a, status), status); 3959 } 3960 return packFloat64(aSign, 0x7ff, 0); 3961 } 3962 if (aExp == 0) { 3963 if (aSig == 0) { 3964 return packFloat64(aSign, 0, 0); 3965 } 3966 3967 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3968 aExp--; 3969 } 3970 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42); 3971 } 3972 3973 float16 float64_to_float16(float64 a, flag ieee, float_status *status) 3974 { 3975 flag aSign; 3976 int aExp; 3977 uint64_t aSig; 3978 uint32_t zSig; 3979 3980 a = float64_squash_input_denormal(a, status); 3981 3982 aSig = extractFloat64Frac(a); 3983 aExp = extractFloat64Exp(a); 3984 aSign = extractFloat64Sign(a); 3985 if (aExp == 0x7FF) { 3986 if (aSig) { 3987 /* Input is a NaN */ 3988 if (!ieee) { 3989 float_raise(float_flag_invalid, status); 3990 return 
packFloat16(aSign, 0, 0); 3991 } 3992 return commonNaNToFloat16( 3993 float64ToCommonNaN(a, status), status); 3994 } 3995 /* Infinity */ 3996 if (!ieee) { 3997 float_raise(float_flag_invalid, status); 3998 return packFloat16(aSign, 0x1f, 0x3ff); 3999 } 4000 return packFloat16(aSign, 0x1f, 0); 4001 } 4002 shift64RightJamming(aSig, 29, &aSig); 4003 zSig = aSig; 4004 if (aExp == 0 && zSig == 0) { 4005 return packFloat16(aSign, 0, 0); 4006 } 4007 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 4008 * even if the input is denormal; however this is harmless because 4009 * the largest possible single-precision denormal is still smaller 4010 * than the smallest representable half-precision denormal, and so we 4011 * will end up ignoring aSig and returning via the "always return zero" 4012 * codepath. 4013 */ 4014 zSig |= 0x00800000; 4015 aExp -= 0x3F1; 4016 4017 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status); 4018 } 4019 4020 /*---------------------------------------------------------------------------- 4021 | Returns the result of converting the double-precision floating-point value 4022 | `a' to the extended double-precision floating-point format. The conversion 4023 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4024 | Arithmetic. 
4025 *----------------------------------------------------------------------------*/ 4026 4027 floatx80 float64_to_floatx80(float64 a, float_status *status) 4028 { 4029 flag aSign; 4030 int aExp; 4031 uint64_t aSig; 4032 4033 a = float64_squash_input_denormal(a, status); 4034 aSig = extractFloat64Frac( a ); 4035 aExp = extractFloat64Exp( a ); 4036 aSign = extractFloat64Sign( a ); 4037 if ( aExp == 0x7FF ) { 4038 if (aSig) { 4039 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 4040 } 4041 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4042 } 4043 if ( aExp == 0 ) { 4044 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4045 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4046 } 4047 return 4048 packFloatx80( 4049 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 4050 4051 } 4052 4053 /*---------------------------------------------------------------------------- 4054 | Returns the result of converting the double-precision floating-point value 4055 | `a' to the quadruple-precision floating-point format. The conversion is 4056 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4057 | Arithmetic. 
4058 *----------------------------------------------------------------------------*/ 4059 4060 float128 float64_to_float128(float64 a, float_status *status) 4061 { 4062 flag aSign; 4063 int aExp; 4064 uint64_t aSig, zSig0, zSig1; 4065 4066 a = float64_squash_input_denormal(a, status); 4067 aSig = extractFloat64Frac( a ); 4068 aExp = extractFloat64Exp( a ); 4069 aSign = extractFloat64Sign( a ); 4070 if ( aExp == 0x7FF ) { 4071 if (aSig) { 4072 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 4073 } 4074 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4075 } 4076 if ( aExp == 0 ) { 4077 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4078 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4079 --aExp; 4080 } 4081 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 4082 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 4083 4084 } 4085 4086 /*---------------------------------------------------------------------------- 4087 | Rounds the double-precision floating-point value `a' to an integer, and 4088 | returns the result as a double-precision floating-point value. The 4089 | operation is performed according to the IEC/IEEE Standard for Binary 4090 | Floating-Point Arithmetic. 
4091 *----------------------------------------------------------------------------*/ 4092 4093 float64 float64_round_to_int(float64 a, float_status *status) 4094 { 4095 flag aSign; 4096 int aExp; 4097 uint64_t lastBitMask, roundBitsMask; 4098 uint64_t z; 4099 a = float64_squash_input_denormal(a, status); 4100 4101 aExp = extractFloat64Exp( a ); 4102 if ( 0x433 <= aExp ) { 4103 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 4104 return propagateFloat64NaN(a, a, status); 4105 } 4106 return a; 4107 } 4108 if ( aExp < 0x3FF ) { 4109 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a; 4110 status->float_exception_flags |= float_flag_inexact; 4111 aSign = extractFloat64Sign( a ); 4112 switch (status->float_rounding_mode) { 4113 case float_round_nearest_even: 4114 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 4115 return packFloat64( aSign, 0x3FF, 0 ); 4116 } 4117 break; 4118 case float_round_ties_away: 4119 if (aExp == 0x3FE) { 4120 return packFloat64(aSign, 0x3ff, 0); 4121 } 4122 break; 4123 case float_round_down: 4124 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0); 4125 case float_round_up: 4126 return make_float64( 4127 aSign ? 
LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 )); 4128 } 4129 return packFloat64( aSign, 0, 0 ); 4130 } 4131 lastBitMask = 1; 4132 lastBitMask <<= 0x433 - aExp; 4133 roundBitsMask = lastBitMask - 1; 4134 z = float64_val(a); 4135 switch (status->float_rounding_mode) { 4136 case float_round_nearest_even: 4137 z += lastBitMask >> 1; 4138 if ((z & roundBitsMask) == 0) { 4139 z &= ~lastBitMask; 4140 } 4141 break; 4142 case float_round_ties_away: 4143 z += lastBitMask >> 1; 4144 break; 4145 case float_round_to_zero: 4146 break; 4147 case float_round_up: 4148 if (!extractFloat64Sign(make_float64(z))) { 4149 z += roundBitsMask; 4150 } 4151 break; 4152 case float_round_down: 4153 if (extractFloat64Sign(make_float64(z))) { 4154 z += roundBitsMask; 4155 } 4156 break; 4157 default: 4158 abort(); 4159 } 4160 z &= ~ roundBitsMask; 4161 if (z != float64_val(a)) { 4162 status->float_exception_flags |= float_flag_inexact; 4163 } 4164 return make_float64(z); 4165 4166 } 4167 4168 float64 float64_trunc_to_int(float64 a, float_status *status) 4169 { 4170 int oldmode; 4171 float64 res; 4172 oldmode = status->float_rounding_mode; 4173 status->float_rounding_mode = float_round_to_zero; 4174 res = float64_round_to_int(a, status); 4175 status->float_rounding_mode = oldmode; 4176 return res; 4177 } 4178 4179 4180 /*---------------------------------------------------------------------------- 4181 | Returns the remainder of the double-precision floating-point value `a' 4182 | with respect to the corresponding value `b'. The operation is performed 4183 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4184 *----------------------------------------------------------------------------*/ 4185 4186 float64 float64_rem(float64 a, float64 b, float_status *status) 4187 { 4188 flag aSign, zSign; 4189 int aExp, bExp, expDiff; 4190 uint64_t aSig, bSig; 4191 uint64_t q, alternateASig; 4192 int64_t sigMean; 4193 4194 a = float64_squash_input_denormal(a, status); 4195 b = float64_squash_input_denormal(b, status); 4196 aSig = extractFloat64Frac( a ); 4197 aExp = extractFloat64Exp( a ); 4198 aSign = extractFloat64Sign( a ); 4199 bSig = extractFloat64Frac( b ); 4200 bExp = extractFloat64Exp( b ); 4201 if ( aExp == 0x7FF ) { 4202 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4203 return propagateFloat64NaN(a, b, status); 4204 } 4205 float_raise(float_flag_invalid, status); 4206 return float64_default_nan(status); 4207 } 4208 if ( bExp == 0x7FF ) { 4209 if (bSig) { 4210 return propagateFloat64NaN(a, b, status); 4211 } 4212 return a; 4213 } 4214 if ( bExp == 0 ) { 4215 if ( bSig == 0 ) { 4216 float_raise(float_flag_invalid, status); 4217 return float64_default_nan(status); 4218 } 4219 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4220 } 4221 if ( aExp == 0 ) { 4222 if ( aSig == 0 ) return a; 4223 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4224 } 4225 expDiff = aExp - bExp; 4226 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 4227 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4228 if ( expDiff < 0 ) { 4229 if ( expDiff < -1 ) return a; 4230 aSig >>= 1; 4231 } 4232 q = ( bSig <= aSig ); 4233 if ( q ) aSig -= bSig; 4234 expDiff -= 64; 4235 while ( 0 < expDiff ) { 4236 q = estimateDiv128To64( aSig, 0, bSig ); 4237 q = ( 2 < q ) ? q - 2 : 0; 4238 aSig = - ( ( bSig>>2 ) * q ); 4239 expDiff -= 62; 4240 } 4241 expDiff += 64; 4242 if ( 0 < expDiff ) { 4243 q = estimateDiv128To64( aSig, 0, bSig ); 4244 q = ( 2 < q ) ? 
q - 2 : 0; 4245 q >>= 64 - expDiff; 4246 bSig >>= 2; 4247 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4248 } 4249 else { 4250 aSig >>= 2; 4251 bSig >>= 2; 4252 } 4253 do { 4254 alternateASig = aSig; 4255 ++q; 4256 aSig -= bSig; 4257 } while ( 0 <= (int64_t) aSig ); 4258 sigMean = aSig + alternateASig; 4259 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4260 aSig = alternateASig; 4261 } 4262 zSign = ( (int64_t) aSig < 0 ); 4263 if ( zSign ) aSig = - aSig; 4264 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 4265 4266 } 4267 4268 /*---------------------------------------------------------------------------- 4269 | Returns the result of multiplying the double-precision floating-point values 4270 | `a' and `b' then adding 'c', with no intermediate rounding step after the 4271 | multiplication. The operation is performed according to the IEC/IEEE 4272 | Standard for Binary Floating-Point Arithmetic 754-2008. 4273 | The flags argument allows the caller to select negation of the 4274 | addend, the intermediate product, or the final result. (The difference 4275 | between this and having the caller do a separate negation is that negating 4276 | externally will flip the sign bit on NaNs.) 
4277 *----------------------------------------------------------------------------*/ 4278 4279 float64 float64_muladd(float64 a, float64 b, float64 c, int flags, 4280 float_status *status) 4281 { 4282 flag aSign, bSign, cSign, zSign; 4283 int aExp, bExp, cExp, pExp, zExp, expDiff; 4284 uint64_t aSig, bSig, cSig; 4285 flag pInf, pZero, pSign; 4286 uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1; 4287 int shiftcount; 4288 flag signflip, infzero; 4289 4290 a = float64_squash_input_denormal(a, status); 4291 b = float64_squash_input_denormal(b, status); 4292 c = float64_squash_input_denormal(c, status); 4293 aSig = extractFloat64Frac(a); 4294 aExp = extractFloat64Exp(a); 4295 aSign = extractFloat64Sign(a); 4296 bSig = extractFloat64Frac(b); 4297 bExp = extractFloat64Exp(b); 4298 bSign = extractFloat64Sign(b); 4299 cSig = extractFloat64Frac(c); 4300 cExp = extractFloat64Exp(c); 4301 cSign = extractFloat64Sign(c); 4302 4303 infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) || 4304 (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0)); 4305 4306 /* It is implementation-defined whether the cases of (0,inf,qnan) 4307 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 4308 * they return if they do), so we have to hand this information 4309 * off to the target-specific pick-a-NaN routine. 4310 */ 4311 if (((aExp == 0x7ff) && aSig) || 4312 ((bExp == 0x7ff) && bSig) || 4313 ((cExp == 0x7ff) && cSig)) { 4314 return propagateFloat64MulAddNaN(a, b, c, infzero, status); 4315 } 4316 4317 if (infzero) { 4318 float_raise(float_flag_invalid, status); 4319 return float64_default_nan(status); 4320 } 4321 4322 if (flags & float_muladd_negate_c) { 4323 cSign ^= 1; 4324 } 4325 4326 signflip = (flags & float_muladd_negate_result) ? 
1 : 0; 4327 4328 /* Work out the sign and type of the product */ 4329 pSign = aSign ^ bSign; 4330 if (flags & float_muladd_negate_product) { 4331 pSign ^= 1; 4332 } 4333 pInf = (aExp == 0x7ff) || (bExp == 0x7ff); 4334 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0); 4335 4336 if (cExp == 0x7ff) { 4337 if (pInf && (pSign ^ cSign)) { 4338 /* addition of opposite-signed infinities => InvalidOperation */ 4339 float_raise(float_flag_invalid, status); 4340 return float64_default_nan(status); 4341 } 4342 /* Otherwise generate an infinity of the same sign */ 4343 return packFloat64(cSign ^ signflip, 0x7ff, 0); 4344 } 4345 4346 if (pInf) { 4347 return packFloat64(pSign ^ signflip, 0x7ff, 0); 4348 } 4349 4350 if (pZero) { 4351 if (cExp == 0) { 4352 if (cSig == 0) { 4353 /* Adding two exact zeroes */ 4354 if (pSign == cSign) { 4355 zSign = pSign; 4356 } else if (status->float_rounding_mode == float_round_down) { 4357 zSign = 1; 4358 } else { 4359 zSign = 0; 4360 } 4361 return packFloat64(zSign ^ signflip, 0, 0); 4362 } 4363 /* Exact zero plus a denorm */ 4364 if (status->flush_to_zero) { 4365 float_raise(float_flag_output_denormal, status); 4366 return packFloat64(cSign ^ signflip, 0, 0); 4367 } 4368 } 4369 /* Zero plus something non-zero : just return the something */ 4370 if (flags & float_muladd_halve_result) { 4371 if (cExp == 0) { 4372 normalizeFloat64Subnormal(cSig, &cExp, &cSig); 4373 } 4374 /* Subtract one to halve, and one again because roundAndPackFloat64 4375 * wants one less than the true exponent. 
4376 */ 4377 cExp -= 2; 4378 cSig = (cSig | 0x0010000000000000ULL) << 10; 4379 return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status); 4380 } 4381 return packFloat64(cSign ^ signflip, cExp, cSig); 4382 } 4383 4384 if (aExp == 0) { 4385 normalizeFloat64Subnormal(aSig, &aExp, &aSig); 4386 } 4387 if (bExp == 0) { 4388 normalizeFloat64Subnormal(bSig, &bExp, &bSig); 4389 } 4390 4391 /* Calculate the actual result a * b + c */ 4392 4393 /* Multiply first; this is easy. */ 4394 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff 4395 * because we want the true exponent, not the "one-less-than" 4396 * flavour that roundAndPackFloat64() takes. 4397 */ 4398 pExp = aExp + bExp - 0x3fe; 4399 aSig = (aSig | LIT64(0x0010000000000000))<<10; 4400 bSig = (bSig | LIT64(0x0010000000000000))<<11; 4401 mul64To128(aSig, bSig, &pSig0, &pSig1); 4402 if ((int64_t)(pSig0 << 1) >= 0) { 4403 shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1); 4404 pExp--; 4405 } 4406 4407 zSign = pSign ^ signflip; 4408 4409 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit 4410 * bit in position 126. 4411 */ 4412 if (cExp == 0) { 4413 if (!cSig) { 4414 /* Throw out the special case of c being an exact zero now */ 4415 shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1); 4416 if (flags & float_muladd_halve_result) { 4417 pExp--; 4418 } 4419 return roundAndPackFloat64(zSign, pExp - 1, 4420 pSig1, status); 4421 } 4422 normalizeFloat64Subnormal(cSig, &cExp, &cSig); 4423 } 4424 4425 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the 4426 * significand of the addend, with the explicit bit in position 126. 
4427 */ 4428 cSig0 = cSig << (126 - 64 - 52); 4429 cSig1 = 0; 4430 cSig0 |= LIT64(0x4000000000000000); 4431 expDiff = pExp - cExp; 4432 4433 if (pSign == cSign) { 4434 /* Addition */ 4435 if (expDiff > 0) { 4436 /* scale c to match p */ 4437 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1); 4438 zExp = pExp; 4439 } else if (expDiff < 0) { 4440 /* scale p to match c */ 4441 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1); 4442 zExp = cExp; 4443 } else { 4444 /* no scaling needed */ 4445 zExp = cExp; 4446 } 4447 /* Add significands and make sure explicit bit ends up in posn 126 */ 4448 add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1); 4449 if ((int64_t)zSig0 < 0) { 4450 shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1); 4451 } else { 4452 zExp--; 4453 } 4454 shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1); 4455 if (flags & float_muladd_halve_result) { 4456 zExp--; 4457 } 4458 return roundAndPackFloat64(zSign, zExp, zSig1, status); 4459 } else { 4460 /* Subtraction */ 4461 if (expDiff > 0) { 4462 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1); 4463 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1); 4464 zExp = pExp; 4465 } else if (expDiff < 0) { 4466 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1); 4467 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1); 4468 zExp = cExp; 4469 zSign ^= 1; 4470 } else { 4471 zExp = pExp; 4472 if (lt128(cSig0, cSig1, pSig0, pSig1)) { 4473 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1); 4474 } else if (lt128(pSig0, pSig1, cSig0, cSig1)) { 4475 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1); 4476 zSign ^= 1; 4477 } else { 4478 /* Exact zero */ 4479 zSign = signflip; 4480 if (status->float_rounding_mode == float_round_down) { 4481 zSign ^= 1; 4482 } 4483 return packFloat64(zSign, 0, 0); 4484 } 4485 } 4486 --zExp; 4487 /* Do the equivalent of normalizeRoundAndPackFloat64() but 4488 * starting with the significand in a pair of uint64_t. 
4489 */ 4490 if (zSig0) { 4491 shiftcount = countLeadingZeros64(zSig0) - 1; 4492 shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1); 4493 if (zSig1) { 4494 zSig0 |= 1; 4495 } 4496 zExp -= shiftcount; 4497 } else { 4498 shiftcount = countLeadingZeros64(zSig1); 4499 if (shiftcount == 0) { 4500 zSig0 = (zSig1 >> 1) | (zSig1 & 1); 4501 zExp -= 63; 4502 } else { 4503 shiftcount--; 4504 zSig0 = zSig1 << shiftcount; 4505 zExp -= (shiftcount + 64); 4506 } 4507 } 4508 if (flags & float_muladd_halve_result) { 4509 zExp--; 4510 } 4511 return roundAndPackFloat64(zSign, zExp, zSig0, status); 4512 } 4513 } 4514 4515 /*---------------------------------------------------------------------------- 4516 | Returns the square root of the double-precision floating-point value `a'. 4517 | The operation is performed according to the IEC/IEEE Standard for Binary 4518 | Floating-Point Arithmetic. 4519 *----------------------------------------------------------------------------*/ 4520 4521 float64 float64_sqrt(float64 a, float_status *status) 4522 { 4523 flag aSign; 4524 int aExp, zExp; 4525 uint64_t aSig, zSig, doubleZSig; 4526 uint64_t rem0, rem1, term0, term1; 4527 a = float64_squash_input_denormal(a, status); 4528 4529 aSig = extractFloat64Frac( a ); 4530 aExp = extractFloat64Exp( a ); 4531 aSign = extractFloat64Sign( a ); 4532 if ( aExp == 0x7FF ) { 4533 if (aSig) { 4534 return propagateFloat64NaN(a, a, status); 4535 } 4536 if ( ! 
aSign ) return a; 4537 float_raise(float_flag_invalid, status); 4538 return float64_default_nan(status); 4539 } 4540 if ( aSign ) { 4541 if ( ( aExp | aSig ) == 0 ) return a; 4542 float_raise(float_flag_invalid, status); 4543 return float64_default_nan(status); 4544 } 4545 if ( aExp == 0 ) { 4546 if ( aSig == 0 ) return float64_zero; 4547 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4548 } 4549 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 4550 aSig |= LIT64( 0x0010000000000000 ); 4551 zSig = estimateSqrt32( aExp, aSig>>21 ); 4552 aSig <<= 9 - ( aExp & 1 ); 4553 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 4554 if ( ( zSig & 0x1FF ) <= 5 ) { 4555 doubleZSig = zSig<<1; 4556 mul64To128( zSig, zSig, &term0, &term1 ); 4557 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 4558 while ( (int64_t) rem0 < 0 ) { 4559 --zSig; 4560 doubleZSig -= 2; 4561 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 4562 } 4563 zSig |= ( ( rem0 | rem1 ) != 0 ); 4564 } 4565 return roundAndPackFloat64(0, zExp, zSig, status); 4566 4567 } 4568 4569 /*---------------------------------------------------------------------------- 4570 | Returns the binary log of the double-precision floating-point value `a'. 4571 | The operation is performed according to the IEC/IEEE Standard for Binary 4572 | Floating-Point Arithmetic. 
4573 *----------------------------------------------------------------------------*/ 4574 float64 float64_log2(float64 a, float_status *status) 4575 { 4576 flag aSign, zSign; 4577 int aExp; 4578 uint64_t aSig, aSig0, aSig1, zSig, i; 4579 a = float64_squash_input_denormal(a, status); 4580 4581 aSig = extractFloat64Frac( a ); 4582 aExp = extractFloat64Exp( a ); 4583 aSign = extractFloat64Sign( a ); 4584 4585 if ( aExp == 0 ) { 4586 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4587 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4588 } 4589 if ( aSign ) { 4590 float_raise(float_flag_invalid, status); 4591 return float64_default_nan(status); 4592 } 4593 if ( aExp == 0x7FF ) { 4594 if (aSig) { 4595 return propagateFloat64NaN(a, float64_zero, status); 4596 } 4597 return a; 4598 } 4599 4600 aExp -= 0x3FF; 4601 aSig |= LIT64( 0x0010000000000000 ); 4602 zSign = aExp < 0; 4603 zSig = (uint64_t)aExp << 52; 4604 for (i = 1LL << 51; i > 0; i >>= 1) { 4605 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4606 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4607 if ( aSig & LIT64( 0x0020000000000000 ) ) { 4608 aSig >>= 1; 4609 zSig |= i; 4610 } 4611 } 4612 4613 if ( zSign ) 4614 zSig = -zSig; 4615 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 4616 } 4617 4618 /*---------------------------------------------------------------------------- 4619 | Returns 1 if the double-precision floating-point value `a' is equal to the 4620 | corresponding value `b', and 0 otherwise. The invalid exception is raised 4621 | if either operand is a NaN. Otherwise, the comparison is performed 4622 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4623 *----------------------------------------------------------------------------*/ 4624 4625 int float64_eq(float64 a, float64 b, float_status *status) 4626 { 4627 uint64_t av, bv; 4628 a = float64_squash_input_denormal(a, status); 4629 b = float64_squash_input_denormal(b, status); 4630 4631 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4632 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4633 ) { 4634 float_raise(float_flag_invalid, status); 4635 return 0; 4636 } 4637 av = float64_val(a); 4638 bv = float64_val(b); 4639 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4640 4641 } 4642 4643 /*---------------------------------------------------------------------------- 4644 | Returns 1 if the double-precision floating-point value `a' is less than or 4645 | equal to the corresponding value `b', and 0 otherwise. The invalid 4646 | exception is raised if either operand is a NaN. The comparison is performed 4647 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4648 *----------------------------------------------------------------------------*/ 4649 4650 int float64_le(float64 a, float64 b, float_status *status) 4651 { 4652 flag aSign, bSign; 4653 uint64_t av, bv; 4654 a = float64_squash_input_denormal(a, status); 4655 b = float64_squash_input_denormal(b, status); 4656 4657 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4658 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4659 ) { 4660 float_raise(float_flag_invalid, status); 4661 return 0; 4662 } 4663 aSign = extractFloat64Sign( a ); 4664 bSign = extractFloat64Sign( b ); 4665 av = float64_val(a); 4666 bv = float64_val(b); 4667 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4668 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4669 4670 } 4671 4672 /*---------------------------------------------------------------------------- 4673 | Returns 1 if the double-precision floating-point value `a' is less than 4674 | the corresponding value `b', and 0 otherwise. The invalid exception is 4675 | raised if either operand is a NaN. The comparison is performed according 4676 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4677 *----------------------------------------------------------------------------*/ 4678 4679 int float64_lt(float64 a, float64 b, float_status *status) 4680 { 4681 flag aSign, bSign; 4682 uint64_t av, bv; 4683 4684 a = float64_squash_input_denormal(a, status); 4685 b = float64_squash_input_denormal(b, status); 4686 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4687 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4688 ) { 4689 float_raise(float_flag_invalid, status); 4690 return 0; 4691 } 4692 aSign = extractFloat64Sign( a ); 4693 bSign = extractFloat64Sign( b ); 4694 av = float64_val(a); 4695 bv = float64_val(b); 4696 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4697 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4698 4699 } 4700 4701 /*---------------------------------------------------------------------------- 4702 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4703 | be compared, and 0 otherwise. The invalid exception is raised if either 4704 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4705 | Standard for Binary Floating-Point Arithmetic. 4706 *----------------------------------------------------------------------------*/ 4707 4708 int float64_unordered(float64 a, float64 b, float_status *status) 4709 { 4710 a = float64_squash_input_denormal(a, status); 4711 b = float64_squash_input_denormal(b, status); 4712 4713 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4714 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4715 ) { 4716 float_raise(float_flag_invalid, status); 4717 return 1; 4718 } 4719 return 0; 4720 } 4721 4722 /*---------------------------------------------------------------------------- 4723 | Returns 1 if the double-precision floating-point value `a' is equal to the 4724 | corresponding value `b', and 0 otherwise. 
Quiet NaNs do not cause an 4725 | exception.The comparison is performed according to the IEC/IEEE Standard 4726 | for Binary Floating-Point Arithmetic. 4727 *----------------------------------------------------------------------------*/ 4728 4729 int float64_eq_quiet(float64 a, float64 b, float_status *status) 4730 { 4731 uint64_t av, bv; 4732 a = float64_squash_input_denormal(a, status); 4733 b = float64_squash_input_denormal(b, status); 4734 4735 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4736 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4737 ) { 4738 if (float64_is_signaling_nan(a, status) 4739 || float64_is_signaling_nan(b, status)) { 4740 float_raise(float_flag_invalid, status); 4741 } 4742 return 0; 4743 } 4744 av = float64_val(a); 4745 bv = float64_val(b); 4746 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4747 4748 } 4749 4750 /*---------------------------------------------------------------------------- 4751 | Returns 1 if the double-precision floating-point value `a' is less than or 4752 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4753 | cause an exception. Otherwise, the comparison is performed according to the 4754 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4755 *----------------------------------------------------------------------------*/ 4756 4757 int float64_le_quiet(float64 a, float64 b, float_status *status) 4758 { 4759 flag aSign, bSign; 4760 uint64_t av, bv; 4761 a = float64_squash_input_denormal(a, status); 4762 b = float64_squash_input_denormal(b, status); 4763 4764 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4765 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4766 ) { 4767 if (float64_is_signaling_nan(a, status) 4768 || float64_is_signaling_nan(b, status)) { 4769 float_raise(float_flag_invalid, status); 4770 } 4771 return 0; 4772 } 4773 aSign = extractFloat64Sign( a ); 4774 bSign = extractFloat64Sign( b ); 4775 av = float64_val(a); 4776 bv = float64_val(b); 4777 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4778 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4779 4780 } 4781 4782 /*---------------------------------------------------------------------------- 4783 | Returns 1 if the double-precision floating-point value `a' is less than 4784 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4785 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4786 | Standard for Binary Floating-Point Arithmetic. 
4787 *----------------------------------------------------------------------------*/ 4788 4789 int float64_lt_quiet(float64 a, float64 b, float_status *status) 4790 { 4791 flag aSign, bSign; 4792 uint64_t av, bv; 4793 a = float64_squash_input_denormal(a, status); 4794 b = float64_squash_input_denormal(b, status); 4795 4796 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4797 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4798 ) { 4799 if (float64_is_signaling_nan(a, status) 4800 || float64_is_signaling_nan(b, status)) { 4801 float_raise(float_flag_invalid, status); 4802 } 4803 return 0; 4804 } 4805 aSign = extractFloat64Sign( a ); 4806 bSign = extractFloat64Sign( b ); 4807 av = float64_val(a); 4808 bv = float64_val(b); 4809 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4810 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4811 4812 } 4813 4814 /*---------------------------------------------------------------------------- 4815 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4816 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4817 | comparison is performed according to the IEC/IEEE Standard for Binary 4818 | Floating-Point Arithmetic. 
4819 *----------------------------------------------------------------------------*/ 4820 4821 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 4822 { 4823 a = float64_squash_input_denormal(a, status); 4824 b = float64_squash_input_denormal(b, status); 4825 4826 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4827 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4828 ) { 4829 if (float64_is_signaling_nan(a, status) 4830 || float64_is_signaling_nan(b, status)) { 4831 float_raise(float_flag_invalid, status); 4832 } 4833 return 1; 4834 } 4835 return 0; 4836 } 4837 4838 /*---------------------------------------------------------------------------- 4839 | Returns the result of converting the extended double-precision floating- 4840 | point value `a' to the 32-bit two's complement integer format. The 4841 | conversion is performed according to the IEC/IEEE Standard for Binary 4842 | Floating-Point Arithmetic---which means in particular that the conversion 4843 | is rounded according to the current rounding mode. If `a' is a NaN, the 4844 | largest positive integer is returned. Otherwise, if the conversion 4845 | overflows, the largest integer with the same sign as `a' is returned. 
4846 *----------------------------------------------------------------------------*/ 4847 4848 int32_t floatx80_to_int32(floatx80 a, float_status *status) 4849 { 4850 flag aSign; 4851 int32_t aExp, shiftCount; 4852 uint64_t aSig; 4853 4854 if (floatx80_invalid_encoding(a)) { 4855 float_raise(float_flag_invalid, status); 4856 return 1 << 31; 4857 } 4858 aSig = extractFloatx80Frac( a ); 4859 aExp = extractFloatx80Exp( a ); 4860 aSign = extractFloatx80Sign( a ); 4861 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4862 shiftCount = 0x4037 - aExp; 4863 if ( shiftCount <= 0 ) shiftCount = 1; 4864 shift64RightJamming( aSig, shiftCount, &aSig ); 4865 return roundAndPackInt32(aSign, aSig, status); 4866 4867 } 4868 4869 /*---------------------------------------------------------------------------- 4870 | Returns the result of converting the extended double-precision floating- 4871 | point value `a' to the 32-bit two's complement integer format. The 4872 | conversion is performed according to the IEC/IEEE Standard for Binary 4873 | Floating-Point Arithmetic, except that the conversion is always rounded 4874 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4875 | Otherwise, if the conversion overflows, the largest integer with the same 4876 | sign as `a' is returned. 
4877 *----------------------------------------------------------------------------*/ 4878 4879 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 4880 { 4881 flag aSign; 4882 int32_t aExp, shiftCount; 4883 uint64_t aSig, savedASig; 4884 int32_t z; 4885 4886 if (floatx80_invalid_encoding(a)) { 4887 float_raise(float_flag_invalid, status); 4888 return 1 << 31; 4889 } 4890 aSig = extractFloatx80Frac( a ); 4891 aExp = extractFloatx80Exp( a ); 4892 aSign = extractFloatx80Sign( a ); 4893 if ( 0x401E < aExp ) { 4894 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4895 goto invalid; 4896 } 4897 else if ( aExp < 0x3FFF ) { 4898 if (aExp || aSig) { 4899 status->float_exception_flags |= float_flag_inexact; 4900 } 4901 return 0; 4902 } 4903 shiftCount = 0x403E - aExp; 4904 savedASig = aSig; 4905 aSig >>= shiftCount; 4906 z = aSig; 4907 if ( aSign ) z = - z; 4908 if ( ( z < 0 ) ^ aSign ) { 4909 invalid: 4910 float_raise(float_flag_invalid, status); 4911 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 4912 } 4913 if ( ( aSig<<shiftCount ) != savedASig ) { 4914 status->float_exception_flags |= float_flag_inexact; 4915 } 4916 return z; 4917 4918 } 4919 4920 /*---------------------------------------------------------------------------- 4921 | Returns the result of converting the extended double-precision floating- 4922 | point value `a' to the 64-bit two's complement integer format. The 4923 | conversion is performed according to the IEC/IEEE Standard for Binary 4924 | Floating-Point Arithmetic---which means in particular that the conversion 4925 | is rounded according to the current rounding mode. If `a' is a NaN, 4926 | the largest positive integer is returned. Otherwise, if the conversion 4927 | overflows, the largest integer with the same sign as `a' is returned. 
4928 *----------------------------------------------------------------------------*/ 4929 4930 int64_t floatx80_to_int64(floatx80 a, float_status *status) 4931 { 4932 flag aSign; 4933 int32_t aExp, shiftCount; 4934 uint64_t aSig, aSigExtra; 4935 4936 if (floatx80_invalid_encoding(a)) { 4937 float_raise(float_flag_invalid, status); 4938 return 1ULL << 63; 4939 } 4940 aSig = extractFloatx80Frac( a ); 4941 aExp = extractFloatx80Exp( a ); 4942 aSign = extractFloatx80Sign( a ); 4943 shiftCount = 0x403E - aExp; 4944 if ( shiftCount <= 0 ) { 4945 if ( shiftCount ) { 4946 float_raise(float_flag_invalid, status); 4947 if ( ! aSign 4948 || ( ( aExp == 0x7FFF ) 4949 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 4950 ) { 4951 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4952 } 4953 return (int64_t) LIT64( 0x8000000000000000 ); 4954 } 4955 aSigExtra = 0; 4956 } 4957 else { 4958 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 4959 } 4960 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 4961 4962 } 4963 4964 /*---------------------------------------------------------------------------- 4965 | Returns the result of converting the extended double-precision floating- 4966 | point value `a' to the 64-bit two's complement integer format. The 4967 | conversion is performed according to the IEC/IEEE Standard for Binary 4968 | Floating-Point Arithmetic, except that the conversion is always rounded 4969 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4970 | Otherwise, if the conversion overflows, the largest integer with the same 4971 | sign as `a' is returned. 
4972 *----------------------------------------------------------------------------*/ 4973 4974 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 4975 { 4976 flag aSign; 4977 int32_t aExp, shiftCount; 4978 uint64_t aSig; 4979 int64_t z; 4980 4981 if (floatx80_invalid_encoding(a)) { 4982 float_raise(float_flag_invalid, status); 4983 return 1ULL << 63; 4984 } 4985 aSig = extractFloatx80Frac( a ); 4986 aExp = extractFloatx80Exp( a ); 4987 aSign = extractFloatx80Sign( a ); 4988 shiftCount = aExp - 0x403E; 4989 if ( 0 <= shiftCount ) { 4990 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 4991 if ( ( a.high != 0xC03E ) || aSig ) { 4992 float_raise(float_flag_invalid, status); 4993 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 4994 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4995 } 4996 } 4997 return (int64_t) LIT64( 0x8000000000000000 ); 4998 } 4999 else if ( aExp < 0x3FFF ) { 5000 if (aExp | aSig) { 5001 status->float_exception_flags |= float_flag_inexact; 5002 } 5003 return 0; 5004 } 5005 z = aSig>>( - shiftCount ); 5006 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 5007 status->float_exception_flags |= float_flag_inexact; 5008 } 5009 if ( aSign ) z = - z; 5010 return z; 5011 5012 } 5013 5014 /*---------------------------------------------------------------------------- 5015 | Returns the result of converting the extended double-precision floating- 5016 | point value `a' to the single-precision floating-point format. The 5017 | conversion is performed according to the IEC/IEEE Standard for Binary 5018 | Floating-Point Arithmetic. 
5019 *----------------------------------------------------------------------------*/ 5020 5021 float32 floatx80_to_float32(floatx80 a, float_status *status) 5022 { 5023 flag aSign; 5024 int32_t aExp; 5025 uint64_t aSig; 5026 5027 if (floatx80_invalid_encoding(a)) { 5028 float_raise(float_flag_invalid, status); 5029 return float32_default_nan(status); 5030 } 5031 aSig = extractFloatx80Frac( a ); 5032 aExp = extractFloatx80Exp( a ); 5033 aSign = extractFloatx80Sign( a ); 5034 if ( aExp == 0x7FFF ) { 5035 if ( (uint64_t) ( aSig<<1 ) ) { 5036 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 5037 } 5038 return packFloat32( aSign, 0xFF, 0 ); 5039 } 5040 shift64RightJamming( aSig, 33, &aSig ); 5041 if ( aExp || aSig ) aExp -= 0x3F81; 5042 return roundAndPackFloat32(aSign, aExp, aSig, status); 5043 5044 } 5045 5046 /*---------------------------------------------------------------------------- 5047 | Returns the result of converting the extended double-precision floating- 5048 | point value `a' to the double-precision floating-point format. The 5049 | conversion is performed according to the IEC/IEEE Standard for Binary 5050 | Floating-Point Arithmetic. 
5051 *----------------------------------------------------------------------------*/ 5052 5053 float64 floatx80_to_float64(floatx80 a, float_status *status) 5054 { 5055 flag aSign; 5056 int32_t aExp; 5057 uint64_t aSig, zSig; 5058 5059 if (floatx80_invalid_encoding(a)) { 5060 float_raise(float_flag_invalid, status); 5061 return float64_default_nan(status); 5062 } 5063 aSig = extractFloatx80Frac( a ); 5064 aExp = extractFloatx80Exp( a ); 5065 aSign = extractFloatx80Sign( a ); 5066 if ( aExp == 0x7FFF ) { 5067 if ( (uint64_t) ( aSig<<1 ) ) { 5068 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 5069 } 5070 return packFloat64( aSign, 0x7FF, 0 ); 5071 } 5072 shift64RightJamming( aSig, 1, &zSig ); 5073 if ( aExp || aSig ) aExp -= 0x3C01; 5074 return roundAndPackFloat64(aSign, aExp, zSig, status); 5075 5076 } 5077 5078 /*---------------------------------------------------------------------------- 5079 | Returns the result of converting the extended double-precision floating- 5080 | point value `a' to the quadruple-precision floating-point format. The 5081 | conversion is performed according to the IEC/IEEE Standard for Binary 5082 | Floating-Point Arithmetic. 
5083 *----------------------------------------------------------------------------*/ 5084 5085 float128 floatx80_to_float128(floatx80 a, float_status *status) 5086 { 5087 flag aSign; 5088 int aExp; 5089 uint64_t aSig, zSig0, zSig1; 5090 5091 if (floatx80_invalid_encoding(a)) { 5092 float_raise(float_flag_invalid, status); 5093 return float128_default_nan(status); 5094 } 5095 aSig = extractFloatx80Frac( a ); 5096 aExp = extractFloatx80Exp( a ); 5097 aSign = extractFloatx80Sign( a ); 5098 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5099 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 5100 } 5101 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5102 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5103 5104 } 5105 5106 /*---------------------------------------------------------------------------- 5107 | Rounds the extended double-precision floating-point value `a' 5108 | to the precision provided by floatx80_rounding_precision and returns the 5109 | result as an extended double-precision floating-point value. 5110 | The operation is performed according to the IEC/IEEE Standard for Binary 5111 | Floating-Point Arithmetic. 5112 *----------------------------------------------------------------------------*/ 5113 5114 floatx80 floatx80_round(floatx80 a, float_status *status) 5115 { 5116 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5117 extractFloatx80Sign(a), 5118 extractFloatx80Exp(a), 5119 extractFloatx80Frac(a), 0, status); 5120 } 5121 5122 /*---------------------------------------------------------------------------- 5123 | Rounds the extended double-precision floating-point value `a' to an integer, 5124 | and returns the result as an extended quadruple-precision floating-point 5125 | value. The operation is performed according to the IEC/IEEE Standard for 5126 | Binary Floating-Point Arithmetic. 
5127 *----------------------------------------------------------------------------*/ 5128 5129 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 5130 { 5131 flag aSign; 5132 int32_t aExp; 5133 uint64_t lastBitMask, roundBitsMask; 5134 floatx80 z; 5135 5136 if (floatx80_invalid_encoding(a)) { 5137 float_raise(float_flag_invalid, status); 5138 return floatx80_default_nan(status); 5139 } 5140 aExp = extractFloatx80Exp( a ); 5141 if ( 0x403E <= aExp ) { 5142 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 5143 return propagateFloatx80NaN(a, a, status); 5144 } 5145 return a; 5146 } 5147 if ( aExp < 0x3FFF ) { 5148 if ( ( aExp == 0 ) 5149 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 5150 return a; 5151 } 5152 status->float_exception_flags |= float_flag_inexact; 5153 aSign = extractFloatx80Sign( a ); 5154 switch (status->float_rounding_mode) { 5155 case float_round_nearest_even: 5156 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 5157 ) { 5158 return 5159 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5160 } 5161 break; 5162 case float_round_ties_away: 5163 if (aExp == 0x3FFE) { 5164 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 5165 } 5166 break; 5167 case float_round_down: 5168 return 5169 aSign ? 5170 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 5171 : packFloatx80( 0, 0, 0 ); 5172 case float_round_up: 5173 return 5174 aSign ? 
packFloatx80( 1, 0, 0 ) 5175 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5176 } 5177 return packFloatx80( aSign, 0, 0 ); 5178 } 5179 lastBitMask = 1; 5180 lastBitMask <<= 0x403E - aExp; 5181 roundBitsMask = lastBitMask - 1; 5182 z = a; 5183 switch (status->float_rounding_mode) { 5184 case float_round_nearest_even: 5185 z.low += lastBitMask>>1; 5186 if ((z.low & roundBitsMask) == 0) { 5187 z.low &= ~lastBitMask; 5188 } 5189 break; 5190 case float_round_ties_away: 5191 z.low += lastBitMask >> 1; 5192 break; 5193 case float_round_to_zero: 5194 break; 5195 case float_round_up: 5196 if (!extractFloatx80Sign(z)) { 5197 z.low += roundBitsMask; 5198 } 5199 break; 5200 case float_round_down: 5201 if (extractFloatx80Sign(z)) { 5202 z.low += roundBitsMask; 5203 } 5204 break; 5205 default: 5206 abort(); 5207 } 5208 z.low &= ~ roundBitsMask; 5209 if ( z.low == 0 ) { 5210 ++z.high; 5211 z.low = LIT64( 0x8000000000000000 ); 5212 } 5213 if (z.low != a.low) { 5214 status->float_exception_flags |= float_flag_inexact; 5215 } 5216 return z; 5217 5218 } 5219 5220 /*---------------------------------------------------------------------------- 5221 | Returns the result of adding the absolute values of the extended double- 5222 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 5223 | negated before being returned. `zSign' is ignored if the result is a NaN. 5224 | The addition is performed according to the IEC/IEEE Standard for Binary 5225 | Floating-Point Arithmetic. 
5226 *----------------------------------------------------------------------------*/ 5227 5228 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5229 float_status *status) 5230 { 5231 int32_t aExp, bExp, zExp; 5232 uint64_t aSig, bSig, zSig0, zSig1; 5233 int32_t expDiff; 5234 5235 aSig = extractFloatx80Frac( a ); 5236 aExp = extractFloatx80Exp( a ); 5237 bSig = extractFloatx80Frac( b ); 5238 bExp = extractFloatx80Exp( b ); 5239 expDiff = aExp - bExp; 5240 if ( 0 < expDiff ) { 5241 if ( aExp == 0x7FFF ) { 5242 if ((uint64_t)(aSig << 1)) { 5243 return propagateFloatx80NaN(a, b, status); 5244 } 5245 return a; 5246 } 5247 if ( bExp == 0 ) --expDiff; 5248 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5249 zExp = aExp; 5250 } 5251 else if ( expDiff < 0 ) { 5252 if ( bExp == 0x7FFF ) { 5253 if ((uint64_t)(bSig << 1)) { 5254 return propagateFloatx80NaN(a, b, status); 5255 } 5256 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5257 } 5258 if ( aExp == 0 ) ++expDiff; 5259 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5260 zExp = bExp; 5261 } 5262 else { 5263 if ( aExp == 0x7FFF ) { 5264 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5265 return propagateFloatx80NaN(a, b, status); 5266 } 5267 return a; 5268 } 5269 zSig1 = 0; 5270 zSig0 = aSig + bSig; 5271 if ( aExp == 0 ) { 5272 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 5273 goto roundAndPack; 5274 } 5275 zExp = aExp; 5276 goto shiftRight1; 5277 } 5278 zSig0 = aSig + bSig; 5279 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 5280 shiftRight1: 5281 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5282 zSig0 |= LIT64( 0x8000000000000000 ); 5283 ++zExp; 5284 roundAndPack: 5285 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5286 zSign, zExp, zSig0, zSig1, status); 5287 } 5288 5289 /*---------------------------------------------------------------------------- 5290 | Returns the result of subtracting the absolute values of the 
extended 5291 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 5292 | difference is negated before being returned. `zSign' is ignored if the 5293 | result is a NaN. The subtraction is performed according to the IEC/IEEE 5294 | Standard for Binary Floating-Point Arithmetic. 5295 *----------------------------------------------------------------------------*/ 5296 5297 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5298 float_status *status) 5299 { 5300 int32_t aExp, bExp, zExp; 5301 uint64_t aSig, bSig, zSig0, zSig1; 5302 int32_t expDiff; 5303 5304 aSig = extractFloatx80Frac( a ); 5305 aExp = extractFloatx80Exp( a ); 5306 bSig = extractFloatx80Frac( b ); 5307 bExp = extractFloatx80Exp( b ); 5308 expDiff = aExp - bExp; 5309 if ( 0 < expDiff ) goto aExpBigger; 5310 if ( expDiff < 0 ) goto bExpBigger; 5311 if ( aExp == 0x7FFF ) { 5312 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5313 return propagateFloatx80NaN(a, b, status); 5314 } 5315 float_raise(float_flag_invalid, status); 5316 return floatx80_default_nan(status); 5317 } 5318 if ( aExp == 0 ) { 5319 aExp = 1; 5320 bExp = 1; 5321 } 5322 zSig1 = 0; 5323 if ( bSig < aSig ) goto aBigger; 5324 if ( aSig < bSig ) goto bBigger; 5325 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 5326 bExpBigger: 5327 if ( bExp == 0x7FFF ) { 5328 if ((uint64_t)(bSig << 1)) { 5329 return propagateFloatx80NaN(a, b, status); 5330 } 5331 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5332 } 5333 if ( aExp == 0 ) ++expDiff; 5334 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5335 bBigger: 5336 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 5337 zExp = bExp; 5338 zSign ^= 1; 5339 goto normalizeRoundAndPack; 5340 aExpBigger: 5341 if ( aExp == 0x7FFF ) { 5342 if ((uint64_t)(aSig << 1)) { 5343 return propagateFloatx80NaN(a, b, status); 5344 } 5345 return a; 5346 } 5347 if ( bExp == 0 ) --expDiff; 5348 shift128RightJamming( bSig, 0, 
expDiff, &bSig, &zSig1 ); 5349 aBigger: 5350 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 5351 zExp = aExp; 5352 normalizeRoundAndPack: 5353 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 5354 zSign, zExp, zSig0, zSig1, status); 5355 } 5356 5357 /*---------------------------------------------------------------------------- 5358 | Returns the result of adding the extended double-precision floating-point 5359 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5360 | Standard for Binary Floating-Point Arithmetic. 5361 *----------------------------------------------------------------------------*/ 5362 5363 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 5364 { 5365 flag aSign, bSign; 5366 5367 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5368 float_raise(float_flag_invalid, status); 5369 return floatx80_default_nan(status); 5370 } 5371 aSign = extractFloatx80Sign( a ); 5372 bSign = extractFloatx80Sign( b ); 5373 if ( aSign == bSign ) { 5374 return addFloatx80Sigs(a, b, aSign, status); 5375 } 5376 else { 5377 return subFloatx80Sigs(a, b, aSign, status); 5378 } 5379 5380 } 5381 5382 /*---------------------------------------------------------------------------- 5383 | Returns the result of subtracting the extended double-precision floating- 5384 | point values `a' and `b'. The operation is performed according to the 5385 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5386 *----------------------------------------------------------------------------*/ 5387 5388 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5389 { 5390 flag aSign, bSign; 5391 5392 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5393 float_raise(float_flag_invalid, status); 5394 return floatx80_default_nan(status); 5395 } 5396 aSign = extractFloatx80Sign( a ); 5397 bSign = extractFloatx80Sign( b ); 5398 if ( aSign == bSign ) { 5399 return subFloatx80Sigs(a, b, aSign, status); 5400 } 5401 else { 5402 return addFloatx80Sigs(a, b, aSign, status); 5403 } 5404 5405 } 5406 5407 /*---------------------------------------------------------------------------- 5408 | Returns the result of multiplying the extended double-precision floating- 5409 | point values `a' and `b'. The operation is performed according to the 5410 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5411 *----------------------------------------------------------------------------*/ 5412 5413 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5414 { 5415 flag aSign, bSign, zSign; 5416 int32_t aExp, bExp, zExp; 5417 uint64_t aSig, bSig, zSig0, zSig1; 5418 5419 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5420 float_raise(float_flag_invalid, status); 5421 return floatx80_default_nan(status); 5422 } 5423 aSig = extractFloatx80Frac( a ); 5424 aExp = extractFloatx80Exp( a ); 5425 aSign = extractFloatx80Sign( a ); 5426 bSig = extractFloatx80Frac( b ); 5427 bExp = extractFloatx80Exp( b ); 5428 bSign = extractFloatx80Sign( b ); 5429 zSign = aSign ^ bSign; 5430 if ( aExp == 0x7FFF ) { 5431 if ( (uint64_t) ( aSig<<1 ) 5432 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5433 return propagateFloatx80NaN(a, b, status); 5434 } 5435 if ( ( bExp | bSig ) == 0 ) goto invalid; 5436 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5437 } 5438 if ( bExp == 0x7FFF ) { 5439 if ((uint64_t)(bSig << 1)) { 5440 
return propagateFloatx80NaN(a, b, status); 5441 } 5442 if ( ( aExp | aSig ) == 0 ) { 5443 invalid: 5444 float_raise(float_flag_invalid, status); 5445 return floatx80_default_nan(status); 5446 } 5447 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5448 } 5449 if ( aExp == 0 ) { 5450 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5451 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5452 } 5453 if ( bExp == 0 ) { 5454 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5455 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5456 } 5457 zExp = aExp + bExp - 0x3FFE; 5458 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 5459 if ( 0 < (int64_t) zSig0 ) { 5460 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5461 --zExp; 5462 } 5463 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5464 zSign, zExp, zSig0, zSig1, status); 5465 } 5466 5467 /*---------------------------------------------------------------------------- 5468 | Returns the result of dividing the extended double-precision floating-point 5469 | value `a' by the corresponding value `b'. The operation is performed 5470 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5471 *----------------------------------------------------------------------------*/ 5472 5473 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 5474 { 5475 flag aSign, bSign, zSign; 5476 int32_t aExp, bExp, zExp; 5477 uint64_t aSig, bSig, zSig0, zSig1; 5478 uint64_t rem0, rem1, rem2, term0, term1, term2; 5479 5480 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5481 float_raise(float_flag_invalid, status); 5482 return floatx80_default_nan(status); 5483 } 5484 aSig = extractFloatx80Frac( a ); 5485 aExp = extractFloatx80Exp( a ); 5486 aSign = extractFloatx80Sign( a ); 5487 bSig = extractFloatx80Frac( b ); 5488 bExp = extractFloatx80Exp( b ); 5489 bSign = extractFloatx80Sign( b ); 5490 zSign = aSign ^ bSign; 5491 if ( aExp == 0x7FFF ) { 5492 if ((uint64_t)(aSig << 1)) { 5493 return propagateFloatx80NaN(a, b, status); 5494 } 5495 if ( bExp == 0x7FFF ) { 5496 if ((uint64_t)(bSig << 1)) { 5497 return propagateFloatx80NaN(a, b, status); 5498 } 5499 goto invalid; 5500 } 5501 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5502 } 5503 if ( bExp == 0x7FFF ) { 5504 if ((uint64_t)(bSig << 1)) { 5505 return propagateFloatx80NaN(a, b, status); 5506 } 5507 return packFloatx80( zSign, 0, 0 ); 5508 } 5509 if ( bExp == 0 ) { 5510 if ( bSig == 0 ) { 5511 if ( ( aExp | aSig ) == 0 ) { 5512 invalid: 5513 float_raise(float_flag_invalid, status); 5514 return floatx80_default_nan(status); 5515 } 5516 float_raise(float_flag_divbyzero, status); 5517 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5518 } 5519 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5520 } 5521 if ( aExp == 0 ) { 5522 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5523 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5524 } 5525 zExp = aExp - bExp + 0x3FFE; 5526 rem1 = 0; 5527 if ( bSig <= aSig ) { 5528 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 5529 ++zExp; 5530 } 5531 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 5532 mul64To128( 
bSig, zSig0, &term0, &term1 ); 5533 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 5534 while ( (int64_t) rem0 < 0 ) { 5535 --zSig0; 5536 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 5537 } 5538 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 5539 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 5540 mul64To128( bSig, zSig1, &term1, &term2 ); 5541 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5542 while ( (int64_t) rem1 < 0 ) { 5543 --zSig1; 5544 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5545 } 5546 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5547 } 5548 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5549 zSign, zExp, zSig0, zSig1, status); 5550 } 5551 5552 /*---------------------------------------------------------------------------- 5553 | Returns the remainder of the extended double-precision floating-point value 5554 | `a' with respect to the corresponding value `b'. The operation is performed 5555 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5556 *----------------------------------------------------------------------------*/ 5557 5558 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 5559 { 5560 flag aSign, zSign; 5561 int32_t aExp, bExp, expDiff; 5562 uint64_t aSig0, aSig1, bSig; 5563 uint64_t q, term0, term1, alternateASig0, alternateASig1; 5564 5565 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5566 float_raise(float_flag_invalid, status); 5567 return floatx80_default_nan(status); 5568 } 5569 aSig0 = extractFloatx80Frac( a ); 5570 aExp = extractFloatx80Exp( a ); 5571 aSign = extractFloatx80Sign( a ); 5572 bSig = extractFloatx80Frac( b ); 5573 bExp = extractFloatx80Exp( b ); 5574 if ( aExp == 0x7FFF ) { 5575 if ( (uint64_t) ( aSig0<<1 ) 5576 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5577 return propagateFloatx80NaN(a, b, status); 5578 } 5579 goto invalid; 5580 } 5581 if ( bExp == 0x7FFF ) { 5582 if ((uint64_t)(bSig << 1)) { 5583 return propagateFloatx80NaN(a, b, status); 
5584 } 5585 return a; 5586 } 5587 if ( bExp == 0 ) { 5588 if ( bSig == 0 ) { 5589 invalid: 5590 float_raise(float_flag_invalid, status); 5591 return floatx80_default_nan(status); 5592 } 5593 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5594 } 5595 if ( aExp == 0 ) { 5596 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5597 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5598 } 5599 bSig |= LIT64( 0x8000000000000000 ); 5600 zSign = aSign; 5601 expDiff = aExp - bExp; 5602 aSig1 = 0; 5603 if ( expDiff < 0 ) { 5604 if ( expDiff < -1 ) return a; 5605 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 5606 expDiff = 0; 5607 } 5608 q = ( bSig <= aSig0 ); 5609 if ( q ) aSig0 -= bSig; 5610 expDiff -= 64; 5611 while ( 0 < expDiff ) { 5612 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5613 q = ( 2 < q ) ? q - 2 : 0; 5614 mul64To128( bSig, q, &term0, &term1 ); 5615 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5616 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 5617 expDiff -= 62; 5618 } 5619 expDiff += 64; 5620 if ( 0 < expDiff ) { 5621 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5622 q = ( 2 < q ) ? q - 2 : 0; 5623 q >>= 64 - expDiff; 5624 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 5625 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5626 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 5627 while ( le128( term0, term1, aSig0, aSig1 ) ) { 5628 ++q; 5629 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5630 } 5631 } 5632 else { 5633 term1 = 0; 5634 term0 = bSig; 5635 } 5636 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 5637 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5638 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5639 && ( q & 1 ) ) 5640 ) { 5641 aSig0 = alternateASig0; 5642 aSig1 = alternateASig1; 5643 zSign = ! 
zSign; 5644 } 5645 return 5646 normalizeRoundAndPackFloatx80( 5647 80, zSign, bExp + expDiff, aSig0, aSig1, status); 5648 5649 } 5650 5651 /*---------------------------------------------------------------------------- 5652 | Returns the square root of the extended double-precision floating-point 5653 | value `a'. The operation is performed according to the IEC/IEEE Standard 5654 | for Binary Floating-Point Arithmetic. 5655 *----------------------------------------------------------------------------*/ 5656 5657 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 5658 { 5659 flag aSign; 5660 int32_t aExp, zExp; 5661 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 5662 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5663 5664 if (floatx80_invalid_encoding(a)) { 5665 float_raise(float_flag_invalid, status); 5666 return floatx80_default_nan(status); 5667 } 5668 aSig0 = extractFloatx80Frac( a ); 5669 aExp = extractFloatx80Exp( a ); 5670 aSign = extractFloatx80Sign( a ); 5671 if ( aExp == 0x7FFF ) { 5672 if ((uint64_t)(aSig0 << 1)) { 5673 return propagateFloatx80NaN(a, a, status); 5674 } 5675 if ( ! 
aSign ) return a; 5676 goto invalid; 5677 } 5678 if ( aSign ) { 5679 if ( ( aExp | aSig0 ) == 0 ) return a; 5680 invalid: 5681 float_raise(float_flag_invalid, status); 5682 return floatx80_default_nan(status); 5683 } 5684 if ( aExp == 0 ) { 5685 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 5686 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5687 } 5688 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 5689 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 5690 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 5691 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5692 doubleZSig0 = zSig0<<1; 5693 mul64To128( zSig0, zSig0, &term0, &term1 ); 5694 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5695 while ( (int64_t) rem0 < 0 ) { 5696 --zSig0; 5697 doubleZSig0 -= 2; 5698 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5699 } 5700 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5701 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 5702 if ( zSig1 == 0 ) zSig1 = 1; 5703 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5704 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5705 mul64To128( zSig1, zSig1, &term2, &term3 ); 5706 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5707 while ( (int64_t) rem1 < 0 ) { 5708 --zSig1; 5709 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5710 term3 |= 1; 5711 term2 |= doubleZSig0; 5712 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5713 } 5714 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5715 } 5716 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 5717 zSig0 |= doubleZSig0; 5718 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5719 0, zExp, zSig0, zSig1, status); 5720 } 5721 5722 /*---------------------------------------------------------------------------- 5723 | Returns 1 if the extended double-precision floating-point value `a' is equal 5724 | to the corresponding value `b', and 0 otherwise. 
The invalid exception is
| raised if either operand is a NaN. Otherwise, the comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
{
    /* Invalid encodings are reported the same way as NaN operands. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return 0;
    }
    /* Maximum exponent with fraction bits below the integer bit: NaN. */
    if ((extractFloatx80Exp(a) == 0x7FFF
         && (uint64_t)(extractFloatx80Frac(a) << 1))
        || (extractFloatx80Exp(b) == 0x7FFF
            && (uint64_t)(extractFloatx80Frac(b) << 1))) {
        float_raise(float_flag_invalid, status);
        return 0;
    }
    if (a.low != b.low) {
        return 0;
    }
    if (a.high == b.high) {
        return 1;
    }
    /*
     * The high words differ: still equal when the significand and both
     * exponent fields are all zero (the operands differ in sign only).
     */
    return (a.low == 0) && ((uint16_t)((a.high | b.high) << 1) == 0);
}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is
| less than or equal to the corresponding value `b', and 0 otherwise. The
| invalid exception is raised if either operand is a NaN. The comparison is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
5756 *----------------------------------------------------------------------------*/ 5757 5758 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 5759 { 5760 flag aSign, bSign; 5761 5762 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5763 || (extractFloatx80Exp(a) == 0x7FFF 5764 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5765 || (extractFloatx80Exp(b) == 0x7FFF 5766 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5767 ) { 5768 float_raise(float_flag_invalid, status); 5769 return 0; 5770 } 5771 aSign = extractFloatx80Sign( a ); 5772 bSign = extractFloatx80Sign( b ); 5773 if ( aSign != bSign ) { 5774 return 5775 aSign 5776 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5777 == 0 ); 5778 } 5779 return 5780 aSign ? le128( b.high, b.low, a.high, a.low ) 5781 : le128( a.high, a.low, b.high, b.low ); 5782 5783 } 5784 5785 /*---------------------------------------------------------------------------- 5786 | Returns 1 if the extended double-precision floating-point value `a' is 5787 | less than the corresponding value `b', and 0 otherwise. The invalid 5788 | exception is raised if either operand is a NaN. The comparison is performed 5789 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5790 *----------------------------------------------------------------------------*/ 5791 5792 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 5793 { 5794 flag aSign, bSign; 5795 5796 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5797 || (extractFloatx80Exp(a) == 0x7FFF 5798 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5799 || (extractFloatx80Exp(b) == 0x7FFF 5800 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5801 ) { 5802 float_raise(float_flag_invalid, status); 5803 return 0; 5804 } 5805 aSign = extractFloatx80Sign( a ); 5806 bSign = extractFloatx80Sign( b ); 5807 if ( aSign != bSign ) { 5808 return 5809 aSign 5810 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5811 != 0 ); 5812 } 5813 return 5814 aSign ? lt128( b.high, b.low, a.high, a.low ) 5815 : lt128( a.high, a.low, b.high, b.low ); 5816 5817 } 5818 5819 /*---------------------------------------------------------------------------- 5820 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5821 | cannot be compared, and 0 otherwise. The invalid exception is raised if 5822 | either operand is a NaN. The comparison is performed according to the 5823 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5824 *----------------------------------------------------------------------------*/ 5825 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 5826 { 5827 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5828 || (extractFloatx80Exp(a) == 0x7FFF 5829 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5830 || (extractFloatx80Exp(b) == 0x7FFF 5831 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5832 ) { 5833 float_raise(float_flag_invalid, status); 5834 return 1; 5835 } 5836 return 0; 5837 } 5838 5839 /*---------------------------------------------------------------------------- 5840 | Returns 1 if the extended double-precision floating-point value `a' is 5841 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5842 | cause an exception. The comparison is performed according to the IEC/IEEE 5843 | Standard for Binary Floating-Point Arithmetic. 5844 *----------------------------------------------------------------------------*/ 5845 5846 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 5847 { 5848 5849 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5850 float_raise(float_flag_invalid, status); 5851 return 0; 5852 } 5853 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5854 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5855 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5856 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5857 ) { 5858 if (floatx80_is_signaling_nan(a, status) 5859 || floatx80_is_signaling_nan(b, status)) { 5860 float_raise(float_flag_invalid, status); 5861 } 5862 return 0; 5863 } 5864 return 5865 ( a.low == b.low ) 5866 && ( ( a.high == b.high ) 5867 || ( ( a.low == 0 ) 5868 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5869 ); 5870 5871 } 5872 5873 /*---------------------------------------------------------------------------- 5874 | Returns 1 if the extended double-precision floating-point value `a' is less 5875 | than or equal to the corresponding value `b', and 0 
otherwise. Quiet NaNs
| do not cause an exception. Otherwise, the comparison is performed according
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
{
    flag aSign, bSign;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return 0;
    }
    if (((extractFloatx80Exp(a) == 0x7FFF)
         && (uint64_t)(extractFloatx80Frac(a) << 1))
        || ((extractFloatx80Exp(b) == 0x7FFF)
            && (uint64_t)(extractFloatx80Frac(b) << 1))) {
        /* At least one NaN: only a signaling NaN raises invalid. */
        if (floatx80_is_signaling_nan(a, status)
            || floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return 0;
    }
    aSign = extractFloatx80Sign(a);
    bSign = extractFloatx80Sign(b);
    if (aSign != bSign) {
        if (aSign) {
            /* a carries the sign bit and b does not. */
            return 1;
        }
        /* Opposite signs with a non-negative: true only if both are zero. */
        return (((uint16_t)((a.high | b.high) << 1)) | a.low | b.low) == 0;
    }
    /* Same sign: compare the raw encodings, reversed when negative. */
    if (aSign) {
        return le128(b.high, b.low, a.high, a.low);
    }
    return le128(a.high, a.low, b.high, b.low);
}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is less
| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
| an exception. Otherwise, the comparison is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5918 *----------------------------------------------------------------------------*/ 5919 5920 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 5921 { 5922 flag aSign, bSign; 5923 5924 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5925 float_raise(float_flag_invalid, status); 5926 return 0; 5927 } 5928 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5929 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5930 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5931 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5932 ) { 5933 if (floatx80_is_signaling_nan(a, status) 5934 || floatx80_is_signaling_nan(b, status)) { 5935 float_raise(float_flag_invalid, status); 5936 } 5937 return 0; 5938 } 5939 aSign = extractFloatx80Sign( a ); 5940 bSign = extractFloatx80Sign( b ); 5941 if ( aSign != bSign ) { 5942 return 5943 aSign 5944 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5945 != 0 ); 5946 } 5947 return 5948 aSign ? lt128( b.high, b.low, a.high, a.low ) 5949 : lt128( a.high, a.low, b.high, b.low ); 5950 5951 } 5952 5953 /*---------------------------------------------------------------------------- 5954 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5955 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 5956 | The comparison is performed according to the IEC/IEEE Standard for Binary 5957 | Floating-Point Arithmetic. 
5958 *----------------------------------------------------------------------------*/ 5959 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 5960 { 5961 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5962 float_raise(float_flag_invalid, status); 5963 return 1; 5964 } 5965 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5966 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5967 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5968 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5969 ) { 5970 if (floatx80_is_signaling_nan(a, status) 5971 || floatx80_is_signaling_nan(b, status)) { 5972 float_raise(float_flag_invalid, status); 5973 } 5974 return 1; 5975 } 5976 return 0; 5977 } 5978 5979 /*---------------------------------------------------------------------------- 5980 | Returns the result of converting the quadruple-precision floating-point 5981 | value `a' to the 32-bit two's complement integer format. The conversion 5982 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5983 | Arithmetic---which means in particular that the conversion is rounded 5984 | according to the current rounding mode. If `a' is a NaN, the largest 5985 | positive integer is returned. Otherwise, if the conversion overflows, the 5986 | largest integer with the same sign as `a' is returned. 
5987 *----------------------------------------------------------------------------*/ 5988 5989 int32_t float128_to_int32(float128 a, float_status *status) 5990 { 5991 flag aSign; 5992 int32_t aExp, shiftCount; 5993 uint64_t aSig0, aSig1; 5994 5995 aSig1 = extractFloat128Frac1( a ); 5996 aSig0 = extractFloat128Frac0( a ); 5997 aExp = extractFloat128Exp( a ); 5998 aSign = extractFloat128Sign( a ); 5999 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 6000 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6001 aSig0 |= ( aSig1 != 0 ); 6002 shiftCount = 0x4028 - aExp; 6003 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 6004 return roundAndPackInt32(aSign, aSig0, status); 6005 6006 } 6007 6008 /*---------------------------------------------------------------------------- 6009 | Returns the result of converting the quadruple-precision floating-point 6010 | value `a' to the 32-bit two's complement integer format. The conversion 6011 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6012 | Arithmetic, except that the conversion is always rounded toward zero. If 6013 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 6014 | conversion overflows, the largest integer with the same sign as `a' is 6015 | returned. 
6016 *----------------------------------------------------------------------------*/ 6017 6018 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 6019 { 6020 flag aSign; 6021 int32_t aExp, shiftCount; 6022 uint64_t aSig0, aSig1, savedASig; 6023 int32_t z; 6024 6025 aSig1 = extractFloat128Frac1( a ); 6026 aSig0 = extractFloat128Frac0( a ); 6027 aExp = extractFloat128Exp( a ); 6028 aSign = extractFloat128Sign( a ); 6029 aSig0 |= ( aSig1 != 0 ); 6030 if ( 0x401E < aExp ) { 6031 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 6032 goto invalid; 6033 } 6034 else if ( aExp < 0x3FFF ) { 6035 if (aExp || aSig0) { 6036 status->float_exception_flags |= float_flag_inexact; 6037 } 6038 return 0; 6039 } 6040 aSig0 |= LIT64( 0x0001000000000000 ); 6041 shiftCount = 0x402F - aExp; 6042 savedASig = aSig0; 6043 aSig0 >>= shiftCount; 6044 z = aSig0; 6045 if ( aSign ) z = - z; 6046 if ( ( z < 0 ) ^ aSign ) { 6047 invalid: 6048 float_raise(float_flag_invalid, status); 6049 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 6050 } 6051 if ( ( aSig0<<shiftCount ) != savedASig ) { 6052 status->float_exception_flags |= float_flag_inexact; 6053 } 6054 return z; 6055 6056 } 6057 6058 /*---------------------------------------------------------------------------- 6059 | Returns the result of converting the quadruple-precision floating-point 6060 | value `a' to the 64-bit two's complement integer format. The conversion 6061 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6062 | Arithmetic---which means in particular that the conversion is rounded 6063 | according to the current rounding mode. If `a' is a NaN, the largest 6064 | positive integer is returned. Otherwise, if the conversion overflows, the 6065 | largest integer with the same sign as `a' is returned. 
6066 *----------------------------------------------------------------------------*/ 6067 6068 int64_t float128_to_int64(float128 a, float_status *status) 6069 { 6070 flag aSign; 6071 int32_t aExp, shiftCount; 6072 uint64_t aSig0, aSig1; 6073 6074 aSig1 = extractFloat128Frac1( a ); 6075 aSig0 = extractFloat128Frac0( a ); 6076 aExp = extractFloat128Exp( a ); 6077 aSign = extractFloat128Sign( a ); 6078 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6079 shiftCount = 0x402F - aExp; 6080 if ( shiftCount <= 0 ) { 6081 if ( 0x403E < aExp ) { 6082 float_raise(float_flag_invalid, status); 6083 if ( ! aSign 6084 || ( ( aExp == 0x7FFF ) 6085 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 6086 ) 6087 ) { 6088 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6089 } 6090 return (int64_t) LIT64( 0x8000000000000000 ); 6091 } 6092 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 6093 } 6094 else { 6095 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 6096 } 6097 return roundAndPackInt64(aSign, aSig0, aSig1, status); 6098 6099 } 6100 6101 /*---------------------------------------------------------------------------- 6102 | Returns the result of converting the quadruple-precision floating-point 6103 | value `a' to the 64-bit two's complement integer format. The conversion 6104 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6105 | Arithmetic, except that the conversion is always rounded toward zero. 6106 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 6107 | the conversion overflows, the largest integer with the same sign as `a' is 6108 | returned. 
6109 *----------------------------------------------------------------------------*/ 6110 6111 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 6112 { 6113 flag aSign; 6114 int32_t aExp, shiftCount; 6115 uint64_t aSig0, aSig1; 6116 int64_t z; 6117 6118 aSig1 = extractFloat128Frac1( a ); 6119 aSig0 = extractFloat128Frac0( a ); 6120 aExp = extractFloat128Exp( a ); 6121 aSign = extractFloat128Sign( a ); 6122 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6123 shiftCount = aExp - 0x402F; 6124 if ( 0 < shiftCount ) { 6125 if ( 0x403E <= aExp ) { 6126 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 6127 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 6128 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 6129 if (aSig1) { 6130 status->float_exception_flags |= float_flag_inexact; 6131 } 6132 } 6133 else { 6134 float_raise(float_flag_invalid, status); 6135 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 6136 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6137 } 6138 } 6139 return (int64_t) LIT64( 0x8000000000000000 ); 6140 } 6141 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 6142 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 6143 status->float_exception_flags |= float_flag_inexact; 6144 } 6145 } 6146 else { 6147 if ( aExp < 0x3FFF ) { 6148 if ( aExp | aSig0 | aSig1 ) { 6149 status->float_exception_flags |= float_flag_inexact; 6150 } 6151 return 0; 6152 } 6153 z = aSig0>>( - shiftCount ); 6154 if ( aSig1 6155 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 6156 status->float_exception_flags |= float_flag_inexact; 6157 } 6158 } 6159 if ( aSign ) z = - z; 6160 return z; 6161 6162 } 6163 6164 /*---------------------------------------------------------------------------- 6165 | Returns the result of converting the quadruple-precision floating-point value 6166 | `a' to the 64-bit unsigned integer format. 
The conversion is 6167 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6168 | Arithmetic---which means in particular that the conversion is rounded 6169 | according to the current rounding mode. If `a' is a NaN, the largest 6170 | positive integer is returned. If the conversion overflows, the 6171 | largest unsigned integer is returned. If 'a' is negative, the value is 6172 | rounded and zero is returned; negative values that do not round to zero 6173 | will raise the inexact exception. 6174 *----------------------------------------------------------------------------*/ 6175 6176 uint64_t float128_to_uint64(float128 a, float_status *status) 6177 { 6178 flag aSign; 6179 int aExp; 6180 int shiftCount; 6181 uint64_t aSig0, aSig1; 6182 6183 aSig0 = extractFloat128Frac0(a); 6184 aSig1 = extractFloat128Frac1(a); 6185 aExp = extractFloat128Exp(a); 6186 aSign = extractFloat128Sign(a); 6187 if (aSign && (aExp > 0x3FFE)) { 6188 float_raise(float_flag_invalid, status); 6189 if (float128_is_any_nan(a)) { 6190 return LIT64(0xFFFFFFFFFFFFFFFF); 6191 } else { 6192 return 0; 6193 } 6194 } 6195 if (aExp) { 6196 aSig0 |= LIT64(0x0001000000000000); 6197 } 6198 shiftCount = 0x402F - aExp; 6199 if (shiftCount <= 0) { 6200 if (0x403E < aExp) { 6201 float_raise(float_flag_invalid, status); 6202 return LIT64(0xFFFFFFFFFFFFFFFF); 6203 } 6204 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 6205 } else { 6206 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 6207 } 6208 return roundAndPackUint64(aSign, aSig0, aSig1, status); 6209 } 6210 6211 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 6212 { 6213 uint64_t v; 6214 signed char current_rounding_mode = status->float_rounding_mode; 6215 6216 set_float_rounding_mode(float_round_to_zero, status); 6217 v = float128_to_uint64(a, status); 6218 set_float_rounding_mode(current_rounding_mode, status); 6219 6220 return v; 6221 } 6222 6223 
/*---------------------------------------------------------------------------- 6224 | Returns the result of converting the quadruple-precision floating-point 6225 | value `a' to the 32-bit unsigned integer format. The conversion 6226 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6227 | Arithmetic except that the conversion is always rounded toward zero. 6228 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6229 | if the conversion overflows, the largest unsigned integer is returned. 6230 | If 'a' is negative, the value is rounded and zero is returned; negative 6231 | values that do not round to zero will raise the inexact exception. 6232 *----------------------------------------------------------------------------*/ 6233 6234 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6235 { 6236 uint64_t v; 6237 uint32_t res; 6238 int old_exc_flags = get_float_exception_flags(status); 6239 6240 v = float128_to_uint64_round_to_zero(a, status); 6241 if (v > 0xffffffff) { 6242 res = 0xffffffff; 6243 } else { 6244 return v; 6245 } 6246 set_float_exception_flags(old_exc_flags, status); 6247 float_raise(float_flag_invalid, status); 6248 return res; 6249 } 6250 6251 /*---------------------------------------------------------------------------- 6252 | Returns the result of converting the quadruple-precision floating-point 6253 | value `a' to the single-precision floating-point format. The conversion 6254 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6255 | Arithmetic. 
6256 *----------------------------------------------------------------------------*/ 6257 6258 float32 float128_to_float32(float128 a, float_status *status) 6259 { 6260 flag aSign; 6261 int32_t aExp; 6262 uint64_t aSig0, aSig1; 6263 uint32_t zSig; 6264 6265 aSig1 = extractFloat128Frac1( a ); 6266 aSig0 = extractFloat128Frac0( a ); 6267 aExp = extractFloat128Exp( a ); 6268 aSign = extractFloat128Sign( a ); 6269 if ( aExp == 0x7FFF ) { 6270 if ( aSig0 | aSig1 ) { 6271 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6272 } 6273 return packFloat32( aSign, 0xFF, 0 ); 6274 } 6275 aSig0 |= ( aSig1 != 0 ); 6276 shift64RightJamming( aSig0, 18, &aSig0 ); 6277 zSig = aSig0; 6278 if ( aExp || zSig ) { 6279 zSig |= 0x40000000; 6280 aExp -= 0x3F81; 6281 } 6282 return roundAndPackFloat32(aSign, aExp, zSig, status); 6283 6284 } 6285 6286 /*---------------------------------------------------------------------------- 6287 | Returns the result of converting the quadruple-precision floating-point 6288 | value `a' to the double-precision floating-point format. The conversion 6289 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6290 | Arithmetic. 
6291 *----------------------------------------------------------------------------*/ 6292 6293 float64 float128_to_float64(float128 a, float_status *status) 6294 { 6295 flag aSign; 6296 int32_t aExp; 6297 uint64_t aSig0, aSig1; 6298 6299 aSig1 = extractFloat128Frac1( a ); 6300 aSig0 = extractFloat128Frac0( a ); 6301 aExp = extractFloat128Exp( a ); 6302 aSign = extractFloat128Sign( a ); 6303 if ( aExp == 0x7FFF ) { 6304 if ( aSig0 | aSig1 ) { 6305 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6306 } 6307 return packFloat64( aSign, 0x7FF, 0 ); 6308 } 6309 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6310 aSig0 |= ( aSig1 != 0 ); 6311 if ( aExp || aSig0 ) { 6312 aSig0 |= LIT64( 0x4000000000000000 ); 6313 aExp -= 0x3C01; 6314 } 6315 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6316 6317 } 6318 6319 /*---------------------------------------------------------------------------- 6320 | Returns the result of converting the quadruple-precision floating-point 6321 | value `a' to the extended double-precision floating-point format. The 6322 | conversion is performed according to the IEC/IEEE Standard for Binary 6323 | Floating-Point Arithmetic. 
6324 *----------------------------------------------------------------------------*/ 6325 6326 floatx80 float128_to_floatx80(float128 a, float_status *status) 6327 { 6328 flag aSign; 6329 int32_t aExp; 6330 uint64_t aSig0, aSig1; 6331 6332 aSig1 = extractFloat128Frac1( a ); 6333 aSig0 = extractFloat128Frac0( a ); 6334 aExp = extractFloat128Exp( a ); 6335 aSign = extractFloat128Sign( a ); 6336 if ( aExp == 0x7FFF ) { 6337 if ( aSig0 | aSig1 ) { 6338 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 6339 } 6340 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 6341 } 6342 if ( aExp == 0 ) { 6343 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6344 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6345 } 6346 else { 6347 aSig0 |= LIT64( 0x0001000000000000 ); 6348 } 6349 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6350 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6351 6352 } 6353 6354 /*---------------------------------------------------------------------------- 6355 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6356 | returns the result as a quadruple-precision floating-point value. The 6357 | operation is performed according to the IEC/IEEE Standard for Binary 6358 | Floating-Point Arithmetic. 
6359 *----------------------------------------------------------------------------*/ 6360 6361 float128 float128_round_to_int(float128 a, float_status *status) 6362 { 6363 flag aSign; 6364 int32_t aExp; 6365 uint64_t lastBitMask, roundBitsMask; 6366 float128 z; 6367 6368 aExp = extractFloat128Exp( a ); 6369 if ( 0x402F <= aExp ) { 6370 if ( 0x406F <= aExp ) { 6371 if ( ( aExp == 0x7FFF ) 6372 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 6373 ) { 6374 return propagateFloat128NaN(a, a, status); 6375 } 6376 return a; 6377 } 6378 lastBitMask = 1; 6379 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 6380 roundBitsMask = lastBitMask - 1; 6381 z = a; 6382 switch (status->float_rounding_mode) { 6383 case float_round_nearest_even: 6384 if ( lastBitMask ) { 6385 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 6386 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 6387 } 6388 else { 6389 if ( (int64_t) z.low < 0 ) { 6390 ++z.high; 6391 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 6392 } 6393 } 6394 break; 6395 case float_round_ties_away: 6396 if (lastBitMask) { 6397 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 6398 } else { 6399 if ((int64_t) z.low < 0) { 6400 ++z.high; 6401 } 6402 } 6403 break; 6404 case float_round_to_zero: 6405 break; 6406 case float_round_up: 6407 if (!extractFloat128Sign(z)) { 6408 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6409 } 6410 break; 6411 case float_round_down: 6412 if (extractFloat128Sign(z)) { 6413 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6414 } 6415 break; 6416 default: 6417 abort(); 6418 } 6419 z.low &= ~ roundBitsMask; 6420 } 6421 else { 6422 if ( aExp < 0x3FFF ) { 6423 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 6424 status->float_exception_flags |= float_flag_inexact; 6425 aSign = extractFloat128Sign( a ); 6426 switch (status->float_rounding_mode) { 6427 case float_round_nearest_even: 6428 if ( ( aExp == 0x3FFE ) 6429 && ( 
extractFloat128Frac0( a ) 6430 | extractFloat128Frac1( a ) ) 6431 ) { 6432 return packFloat128( aSign, 0x3FFF, 0, 0 ); 6433 } 6434 break; 6435 case float_round_ties_away: 6436 if (aExp == 0x3FFE) { 6437 return packFloat128(aSign, 0x3FFF, 0, 0); 6438 } 6439 break; 6440 case float_round_down: 6441 return 6442 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 6443 : packFloat128( 0, 0, 0, 0 ); 6444 case float_round_up: 6445 return 6446 aSign ? packFloat128( 1, 0, 0, 0 ) 6447 : packFloat128( 0, 0x3FFF, 0, 0 ); 6448 } 6449 return packFloat128( aSign, 0, 0, 0 ); 6450 } 6451 lastBitMask = 1; 6452 lastBitMask <<= 0x402F - aExp; 6453 roundBitsMask = lastBitMask - 1; 6454 z.low = 0; 6455 z.high = a.high; 6456 switch (status->float_rounding_mode) { 6457 case float_round_nearest_even: 6458 z.high += lastBitMask>>1; 6459 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 6460 z.high &= ~ lastBitMask; 6461 } 6462 break; 6463 case float_round_ties_away: 6464 z.high += lastBitMask>>1; 6465 break; 6466 case float_round_to_zero: 6467 break; 6468 case float_round_up: 6469 if (!extractFloat128Sign(z)) { 6470 z.high |= ( a.low != 0 ); 6471 z.high += roundBitsMask; 6472 } 6473 break; 6474 case float_round_down: 6475 if (extractFloat128Sign(z)) { 6476 z.high |= (a.low != 0); 6477 z.high += roundBitsMask; 6478 } 6479 break; 6480 default: 6481 abort(); 6482 } 6483 z.high &= ~ roundBitsMask; 6484 } 6485 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 6486 status->float_exception_flags |= float_flag_inexact; 6487 } 6488 return z; 6489 6490 } 6491 6492 /*---------------------------------------------------------------------------- 6493 | Returns the result of adding the absolute values of the quadruple-precision 6494 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 6495 | before being returned. `zSign' is ignored if the result is a NaN. 6496 | The addition is performed according to the IEC/IEEE Standard for Binary 6497 | Floating-Point Arithmetic. 
6498 *----------------------------------------------------------------------------*/ 6499 6500 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign, 6501 float_status *status) 6502 { 6503 int32_t aExp, bExp, zExp; 6504 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6505 int32_t expDiff; 6506 6507 aSig1 = extractFloat128Frac1( a ); 6508 aSig0 = extractFloat128Frac0( a ); 6509 aExp = extractFloat128Exp( a ); 6510 bSig1 = extractFloat128Frac1( b ); 6511 bSig0 = extractFloat128Frac0( b ); 6512 bExp = extractFloat128Exp( b ); 6513 expDiff = aExp - bExp; 6514 if ( 0 < expDiff ) { 6515 if ( aExp == 0x7FFF ) { 6516 if (aSig0 | aSig1) { 6517 return propagateFloat128NaN(a, b, status); 6518 } 6519 return a; 6520 } 6521 if ( bExp == 0 ) { 6522 --expDiff; 6523 } 6524 else { 6525 bSig0 |= LIT64( 0x0001000000000000 ); 6526 } 6527 shift128ExtraRightJamming( 6528 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 6529 zExp = aExp; 6530 } 6531 else if ( expDiff < 0 ) { 6532 if ( bExp == 0x7FFF ) { 6533 if (bSig0 | bSig1) { 6534 return propagateFloat128NaN(a, b, status); 6535 } 6536 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6537 } 6538 if ( aExp == 0 ) { 6539 ++expDiff; 6540 } 6541 else { 6542 aSig0 |= LIT64( 0x0001000000000000 ); 6543 } 6544 shift128ExtraRightJamming( 6545 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 6546 zExp = bExp; 6547 } 6548 else { 6549 if ( aExp == 0x7FFF ) { 6550 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6551 return propagateFloat128NaN(a, b, status); 6552 } 6553 return a; 6554 } 6555 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6556 if ( aExp == 0 ) { 6557 if (status->flush_to_zero) { 6558 if (zSig0 | zSig1) { 6559 float_raise(float_flag_output_denormal, status); 6560 } 6561 return packFloat128(zSign, 0, 0, 0); 6562 } 6563 return packFloat128( zSign, 0, zSig0, zSig1 ); 6564 } 6565 zSig2 = 0; 6566 zSig0 |= LIT64( 0x0002000000000000 ); 6567 zExp = aExp; 6568 goto shiftRight1; 6569 } 6570 aSig0 |= LIT64( 
0x0001000000000000 ); 6571 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6572 --zExp; 6573 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 6574 ++zExp; 6575 shiftRight1: 6576 shift128ExtraRightJamming( 6577 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6578 roundAndPack: 6579 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6580 6581 } 6582 6583 /*---------------------------------------------------------------------------- 6584 | Returns the result of subtracting the absolute values of the quadruple- 6585 | precision floating-point values `a' and `b'. If `zSign' is 1, the 6586 | difference is negated before being returned. `zSign' is ignored if the 6587 | result is a NaN. The subtraction is performed according to the IEC/IEEE 6588 | Standard for Binary Floating-Point Arithmetic. 6589 *----------------------------------------------------------------------------*/ 6590 6591 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign, 6592 float_status *status) 6593 { 6594 int32_t aExp, bExp, zExp; 6595 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 6596 int32_t expDiff; 6597 6598 aSig1 = extractFloat128Frac1( a ); 6599 aSig0 = extractFloat128Frac0( a ); 6600 aExp = extractFloat128Exp( a ); 6601 bSig1 = extractFloat128Frac1( b ); 6602 bSig0 = extractFloat128Frac0( b ); 6603 bExp = extractFloat128Exp( b ); 6604 expDiff = aExp - bExp; 6605 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6606 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 6607 if ( 0 < expDiff ) goto aExpBigger; 6608 if ( expDiff < 0 ) goto bExpBigger; 6609 if ( aExp == 0x7FFF ) { 6610 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6611 return propagateFloat128NaN(a, b, status); 6612 } 6613 float_raise(float_flag_invalid, status); 6614 return float128_default_nan(status); 6615 } 6616 if ( aExp == 0 ) { 6617 aExp = 1; 6618 bExp = 1; 6619 } 6620 if ( bSig0 < aSig0 ) goto aBigger; 6621 if ( aSig0 < bSig0 ) goto bBigger; 6622 if ( bSig1 < 
aSig1 ) goto aBigger; 6623 if ( aSig1 < bSig1 ) goto bBigger; 6624 return packFloat128(status->float_rounding_mode == float_round_down, 6625 0, 0, 0); 6626 bExpBigger: 6627 if ( bExp == 0x7FFF ) { 6628 if (bSig0 | bSig1) { 6629 return propagateFloat128NaN(a, b, status); 6630 } 6631 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 6632 } 6633 if ( aExp == 0 ) { 6634 ++expDiff; 6635 } 6636 else { 6637 aSig0 |= LIT64( 0x4000000000000000 ); 6638 } 6639 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6640 bSig0 |= LIT64( 0x4000000000000000 ); 6641 bBigger: 6642 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6643 zExp = bExp; 6644 zSign ^= 1; 6645 goto normalizeRoundAndPack; 6646 aExpBigger: 6647 if ( aExp == 0x7FFF ) { 6648 if (aSig0 | aSig1) { 6649 return propagateFloat128NaN(a, b, status); 6650 } 6651 return a; 6652 } 6653 if ( bExp == 0 ) { 6654 --expDiff; 6655 } 6656 else { 6657 bSig0 |= LIT64( 0x4000000000000000 ); 6658 } 6659 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 6660 aSig0 |= LIT64( 0x4000000000000000 ); 6661 aBigger: 6662 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6663 zExp = aExp; 6664 normalizeRoundAndPack: 6665 --zExp; 6666 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, 6667 status); 6668 6669 } 6670 6671 /*---------------------------------------------------------------------------- 6672 | Returns the result of adding the quadruple-precision floating-point values 6673 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 6674 | for Binary Floating-Point Arithmetic. 
6675 *----------------------------------------------------------------------------*/ 6676 6677 float128 float128_add(float128 a, float128 b, float_status *status) 6678 { 6679 flag aSign, bSign; 6680 6681 aSign = extractFloat128Sign( a ); 6682 bSign = extractFloat128Sign( b ); 6683 if ( aSign == bSign ) { 6684 return addFloat128Sigs(a, b, aSign, status); 6685 } 6686 else { 6687 return subFloat128Sigs(a, b, aSign, status); 6688 } 6689 6690 } 6691 6692 /*---------------------------------------------------------------------------- 6693 | Returns the result of subtracting the quadruple-precision floating-point 6694 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6695 | Standard for Binary Floating-Point Arithmetic. 6696 *----------------------------------------------------------------------------*/ 6697 6698 float128 float128_sub(float128 a, float128 b, float_status *status) 6699 { 6700 flag aSign, bSign; 6701 6702 aSign = extractFloat128Sign( a ); 6703 bSign = extractFloat128Sign( b ); 6704 if ( aSign == bSign ) { 6705 return subFloat128Sigs(a, b, aSign, status); 6706 } 6707 else { 6708 return addFloat128Sigs(a, b, aSign, status); 6709 } 6710 6711 } 6712 6713 /*---------------------------------------------------------------------------- 6714 | Returns the result of multiplying the quadruple-precision floating-point 6715 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6716 | Standard for Binary Floating-Point Arithmetic. 
6717 *----------------------------------------------------------------------------*/ 6718 6719 float128 float128_mul(float128 a, float128 b, float_status *status) 6720 { 6721 flag aSign, bSign, zSign; 6722 int32_t aExp, bExp, zExp; 6723 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 6724 6725 aSig1 = extractFloat128Frac1( a ); 6726 aSig0 = extractFloat128Frac0( a ); 6727 aExp = extractFloat128Exp( a ); 6728 aSign = extractFloat128Sign( a ); 6729 bSig1 = extractFloat128Frac1( b ); 6730 bSig0 = extractFloat128Frac0( b ); 6731 bExp = extractFloat128Exp( b ); 6732 bSign = extractFloat128Sign( b ); 6733 zSign = aSign ^ bSign; 6734 if ( aExp == 0x7FFF ) { 6735 if ( ( aSig0 | aSig1 ) 6736 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6737 return propagateFloat128NaN(a, b, status); 6738 } 6739 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 6740 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6741 } 6742 if ( bExp == 0x7FFF ) { 6743 if (bSig0 | bSig1) { 6744 return propagateFloat128NaN(a, b, status); 6745 } 6746 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6747 invalid: 6748 float_raise(float_flag_invalid, status); 6749 return float128_default_nan(status); 6750 } 6751 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6752 } 6753 if ( aExp == 0 ) { 6754 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6755 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6756 } 6757 if ( bExp == 0 ) { 6758 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6759 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6760 } 6761 zExp = aExp + bExp - 0x4000; 6762 aSig0 |= LIT64( 0x0001000000000000 ); 6763 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 6764 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 6765 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6766 zSig2 |= ( zSig3 != 0 ); 6767 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 6768 shift128ExtraRightJamming( 6769 
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6770 ++zExp; 6771 } 6772 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6773 6774 } 6775 6776 /*---------------------------------------------------------------------------- 6777 | Returns the result of dividing the quadruple-precision floating-point value 6778 | `a' by the corresponding value `b'. The operation is performed according to 6779 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6780 *----------------------------------------------------------------------------*/ 6781 6782 float128 float128_div(float128 a, float128 b, float_status *status) 6783 { 6784 flag aSign, bSign, zSign; 6785 int32_t aExp, bExp, zExp; 6786 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6787 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6788 6789 aSig1 = extractFloat128Frac1( a ); 6790 aSig0 = extractFloat128Frac0( a ); 6791 aExp = extractFloat128Exp( a ); 6792 aSign = extractFloat128Sign( a ); 6793 bSig1 = extractFloat128Frac1( b ); 6794 bSig0 = extractFloat128Frac0( b ); 6795 bExp = extractFloat128Exp( b ); 6796 bSign = extractFloat128Sign( b ); 6797 zSign = aSign ^ bSign; 6798 if ( aExp == 0x7FFF ) { 6799 if (aSig0 | aSig1) { 6800 return propagateFloat128NaN(a, b, status); 6801 } 6802 if ( bExp == 0x7FFF ) { 6803 if (bSig0 | bSig1) { 6804 return propagateFloat128NaN(a, b, status); 6805 } 6806 goto invalid; 6807 } 6808 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6809 } 6810 if ( bExp == 0x7FFF ) { 6811 if (bSig0 | bSig1) { 6812 return propagateFloat128NaN(a, b, status); 6813 } 6814 return packFloat128( zSign, 0, 0, 0 ); 6815 } 6816 if ( bExp == 0 ) { 6817 if ( ( bSig0 | bSig1 ) == 0 ) { 6818 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6819 invalid: 6820 float_raise(float_flag_invalid, status); 6821 return float128_default_nan(status); 6822 } 6823 float_raise(float_flag_divbyzero, status); 6824 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6825 } 6826 normalizeFloat128Subnormal( 
bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6827 } 6828 if ( aExp == 0 ) { 6829 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6830 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6831 } 6832 zExp = aExp - bExp + 0x3FFD; 6833 shortShift128Left( 6834 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 6835 shortShift128Left( 6836 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6837 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 6838 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 6839 ++zExp; 6840 } 6841 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6842 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 6843 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 6844 while ( (int64_t) rem0 < 0 ) { 6845 --zSig0; 6846 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 6847 } 6848 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 6849 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 6850 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 6851 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 6852 while ( (int64_t) rem1 < 0 ) { 6853 --zSig1; 6854 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 6855 } 6856 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6857 } 6858 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 6859 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6860 6861 } 6862 6863 /*---------------------------------------------------------------------------- 6864 | Returns the remainder of the quadruple-precision floating-point value `a' 6865 | with respect to the corresponding value `b'. The operation is performed 6866 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float128 float128_rem(float128 a, float128 b, float_status *status)
{
    flag aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    /* `a' has exponent 0x7FFF: propagate a NaN, otherwise invalid. */
    if ( aExp == 0x7FFF ) {
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        goto invalid;
    }
    /* `b' has exponent 0x7FFF: NaN propagates, otherwise `a' is returned. */
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        /* A zero divisor is invalid; other zero-exponent values get
         * normalized. */
        if ( ( bSig0 | bSig1 ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* Exponent of `a' at least two below that of `b': `a' is the answer. */
    if ( expDiff < -1 ) return a;
    /* When expDiff is -1, `a' is shifted one bit less than `b'. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    /* First quotient bit: subtract `b' once if it fits. */
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /*
     * Main reduction loop.  Each pass takes a quotient estimate, lowers it
     * by 4 (clamped at 0), subtracts q*b, and reduces expDiff by 61.
     */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /* Final partial step: only the top (64 + expDiff) quotient bits
         * are kept, via the right shift by -expDiff. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        /* No further quotient bits: just shift both operands right by 12. */
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /*
     * Keep subtracting `b' until the remainder turns negative.  The last
     * non-negative remainder is kept in alternateASig0/1, and q counts the
     * subtractions so its parity is available for the tie test below.
     */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /*
     * sigMean is the sum of the negative and the last non-negative
     * remainder.  The non-negative one is chosen if that sum is negative,
     * or if the sum is zero and q is odd.
     */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if (    ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    /* A negative remainder is negated and flips the result sign. */
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}

/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_sqrt(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Exponent 0x7FFF: propagate NaN, return `a' if positive, else invalid. */
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        if ( ! aSign ) return a;
        goto invalid;
    }
    /* Negative input: a negative zero is returned as is, the rest is
     * invalid. */
    if ( aSign ) {
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* NOTE(review): for aExp < 0x3FFF this right-shifts a negative int,
     * which is implementation-defined in C; the code appears to rely on an
     * arithmetic shift. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= LIT64( 0x0001000000000000 );
    /* Initial 32-bit estimate, then refined with a 128-by-64 division. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    /* Shift count is 13 or 12, depending on the low bit of aExp. */
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    /* Step zSig0 down while the remainder a - zSig0^2 is negative. */
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low result word: the estimate is corrected only when its low
     * 13 bits are <= 5. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* A non-zero remainder becomes a sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
| the corresponding value `b', and 0 otherwise. The invalid exception is
| raised if either operand is a NaN. Otherwise, the comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int float128_eq(float128 a, float128 b, float_status *status)
{

    /* Either operand has exponent 0x7FFF and a non-zero fraction: raise
     * invalid and report "not equal". */
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
       ) {
        float_raise(float_flag_invalid, status);
        return 0;
    }
    /* Equal bit patterns, or both operands have every bit clear apart
     * from bit 63 of the high word. */
    return
           ( a.low == b.low )
        && (    ( a.high == b.high )
             || (    ( a.low == 0 )
                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
           );

}

/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is less than
| or equal to the corresponding value `b', and 0 otherwise. The invalid
| exception is raised if either operand is a NaN. The comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7070 *----------------------------------------------------------------------------*/ 7071 7072 int float128_le(float128 a, float128 b, float_status *status) 7073 { 7074 flag aSign, bSign; 7075 7076 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7077 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7078 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7079 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7080 ) { 7081 float_raise(float_flag_invalid, status); 7082 return 0; 7083 } 7084 aSign = extractFloat128Sign( a ); 7085 bSign = extractFloat128Sign( b ); 7086 if ( aSign != bSign ) { 7087 return 7088 aSign 7089 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7090 == 0 ); 7091 } 7092 return 7093 aSign ? le128( b.high, b.low, a.high, a.low ) 7094 : le128( a.high, a.low, b.high, b.low ); 7095 7096 } 7097 7098 /*---------------------------------------------------------------------------- 7099 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7100 | the corresponding value `b', and 0 otherwise. The invalid exception is 7101 | raised if either operand is a NaN. The comparison is performed according 7102 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7103 *----------------------------------------------------------------------------*/ 7104 7105 int float128_lt(float128 a, float128 b, float_status *status) 7106 { 7107 flag aSign, bSign; 7108 7109 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7110 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7111 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7112 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7113 ) { 7114 float_raise(float_flag_invalid, status); 7115 return 0; 7116 } 7117 aSign = extractFloat128Sign( a ); 7118 bSign = extractFloat128Sign( b ); 7119 if ( aSign != bSign ) { 7120 return 7121 aSign 7122 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7123 != 0 ); 7124 } 7125 return 7126 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 7127 : lt128( a.high, a.low, b.high, b.low ); 7128 7129 } 7130 7131 /*---------------------------------------------------------------------------- 7132 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7133 | be compared, and 0 otherwise. The invalid exception is raised if either 7134 | operand is a NaN. The comparison is performed according to the IEC/IEEE 7135 | Standard for Binary Floating-Point Arithmetic. 7136 *----------------------------------------------------------------------------*/ 7137 7138 int float128_unordered(float128 a, float128 b, float_status *status) 7139 { 7140 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7141 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7142 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7143 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7144 ) { 7145 float_raise(float_flag_invalid, status); 7146 return 1; 7147 } 7148 return 0; 7149 } 7150 7151 /*---------------------------------------------------------------------------- 7152 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 7153 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7154 | exception. The comparison is performed according to the IEC/IEEE Standard 7155 | for Binary Floating-Point Arithmetic. 
7156 *----------------------------------------------------------------------------*/ 7157 7158 int float128_eq_quiet(float128 a, float128 b, float_status *status) 7159 { 7160 7161 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7162 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7163 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7164 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7165 ) { 7166 if (float128_is_signaling_nan(a, status) 7167 || float128_is_signaling_nan(b, status)) { 7168 float_raise(float_flag_invalid, status); 7169 } 7170 return 0; 7171 } 7172 return 7173 ( a.low == b.low ) 7174 && ( ( a.high == b.high ) 7175 || ( ( a.low == 0 ) 7176 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 7177 ); 7178 7179 } 7180 7181 /*---------------------------------------------------------------------------- 7182 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7183 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 7184 | cause an exception. Otherwise, the comparison is performed according to the 7185 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7186 *----------------------------------------------------------------------------*/ 7187 7188 int float128_le_quiet(float128 a, float128 b, float_status *status) 7189 { 7190 flag aSign, bSign; 7191 7192 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7193 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7194 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7195 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7196 ) { 7197 if (float128_is_signaling_nan(a, status) 7198 || float128_is_signaling_nan(b, status)) { 7199 float_raise(float_flag_invalid, status); 7200 } 7201 return 0; 7202 } 7203 aSign = extractFloat128Sign( a ); 7204 bSign = extractFloat128Sign( b ); 7205 if ( aSign != bSign ) { 7206 return 7207 aSign 7208 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7209 == 0 ); 7210 } 7211 return 7212 aSign ? le128( b.high, b.low, a.high, a.low ) 7213 : le128( a.high, a.low, b.high, b.low ); 7214 7215 } 7216 7217 /*---------------------------------------------------------------------------- 7218 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7219 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7220 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 7221 | Standard for Binary Floating-Point Arithmetic. 
7222 *----------------------------------------------------------------------------*/ 7223 7224 int float128_lt_quiet(float128 a, float128 b, float_status *status) 7225 { 7226 flag aSign, bSign; 7227 7228 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7229 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7230 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7231 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7232 ) { 7233 if (float128_is_signaling_nan(a, status) 7234 || float128_is_signaling_nan(b, status)) { 7235 float_raise(float_flag_invalid, status); 7236 } 7237 return 0; 7238 } 7239 aSign = extractFloat128Sign( a ); 7240 bSign = extractFloat128Sign( b ); 7241 if ( aSign != bSign ) { 7242 return 7243 aSign 7244 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7245 != 0 ); 7246 } 7247 return 7248 aSign ? lt128( b.high, b.low, a.high, a.low ) 7249 : lt128( a.high, a.low, b.high, b.low ); 7250 7251 } 7252 7253 /*---------------------------------------------------------------------------- 7254 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7255 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 7256 | comparison is performed according to the IEC/IEEE Standard for Binary 7257 | Floating-Point Arithmetic. 
7258 *----------------------------------------------------------------------------*/ 7259 7260 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 7261 { 7262 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7263 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7264 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7265 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7266 ) { 7267 if (float128_is_signaling_nan(a, status) 7268 || float128_is_signaling_nan(b, status)) { 7269 float_raise(float_flag_invalid, status); 7270 } 7271 return 1; 7272 } 7273 return 0; 7274 } 7275 7276 /* misc functions */ 7277 float32 uint32_to_float32(uint32_t a, float_status *status) 7278 { 7279 return int64_to_float32(a, status); 7280 } 7281 7282 float64 uint32_to_float64(uint32_t a, float_status *status) 7283 { 7284 return int64_to_float64(a, status); 7285 } 7286 7287 uint32_t float32_to_uint32(float32 a, float_status *status) 7288 { 7289 int64_t v; 7290 uint32_t res; 7291 int old_exc_flags = get_float_exception_flags(status); 7292 7293 v = float32_to_int64(a, status); 7294 if (v < 0) { 7295 res = 0; 7296 } else if (v > 0xffffffff) { 7297 res = 0xffffffff; 7298 } else { 7299 return v; 7300 } 7301 set_float_exception_flags(old_exc_flags, status); 7302 float_raise(float_flag_invalid, status); 7303 return res; 7304 } 7305 7306 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status) 7307 { 7308 int64_t v; 7309 uint32_t res; 7310 int old_exc_flags = get_float_exception_flags(status); 7311 7312 v = float32_to_int64_round_to_zero(a, status); 7313 if (v < 0) { 7314 res = 0; 7315 } else if (v > 0xffffffff) { 7316 res = 0xffffffff; 7317 } else { 7318 return v; 7319 } 7320 set_float_exception_flags(old_exc_flags, status); 7321 float_raise(float_flag_invalid, status); 7322 return res; 7323 } 7324 7325 int16_t float32_to_int16(float32 a, float_status *status) 7326 { 7327 int32_t v; 7328 int16_t res; 7329 int old_exc_flags = 
get_float_exception_flags(status); 7330 7331 v = float32_to_int32(a, status); 7332 if (v < -0x8000) { 7333 res = -0x8000; 7334 } else if (v > 0x7fff) { 7335 res = 0x7fff; 7336 } else { 7337 return v; 7338 } 7339 7340 set_float_exception_flags(old_exc_flags, status); 7341 float_raise(float_flag_invalid, status); 7342 return res; 7343 } 7344 7345 uint16_t float32_to_uint16(float32 a, float_status *status) 7346 { 7347 int32_t v; 7348 uint16_t res; 7349 int old_exc_flags = get_float_exception_flags(status); 7350 7351 v = float32_to_int32(a, status); 7352 if (v < 0) { 7353 res = 0; 7354 } else if (v > 0xffff) { 7355 res = 0xffff; 7356 } else { 7357 return v; 7358 } 7359 7360 set_float_exception_flags(old_exc_flags, status); 7361 float_raise(float_flag_invalid, status); 7362 return res; 7363 } 7364 7365 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *status) 7366 { 7367 int64_t v; 7368 uint16_t res; 7369 int old_exc_flags = get_float_exception_flags(status); 7370 7371 v = float32_to_int64_round_to_zero(a, status); 7372 if (v < 0) { 7373 res = 0; 7374 } else if (v > 0xffff) { 7375 res = 0xffff; 7376 } else { 7377 return v; 7378 } 7379 set_float_exception_flags(old_exc_flags, status); 7380 float_raise(float_flag_invalid, status); 7381 return res; 7382 } 7383 7384 uint32_t float64_to_uint32(float64 a, float_status *status) 7385 { 7386 uint64_t v; 7387 uint32_t res; 7388 int old_exc_flags = get_float_exception_flags(status); 7389 7390 v = float64_to_uint64(a, status); 7391 if (v > 0xffffffff) { 7392 res = 0xffffffff; 7393 } else { 7394 return v; 7395 } 7396 set_float_exception_flags(old_exc_flags, status); 7397 float_raise(float_flag_invalid, status); 7398 return res; 7399 } 7400 7401 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status) 7402 { 7403 uint64_t v; 7404 uint32_t res; 7405 int old_exc_flags = get_float_exception_flags(status); 7406 7407 v = float64_to_uint64_round_to_zero(a, status); 7408 if (v > 0xffffffff) { 7409 res = 
0xffffffff; 7410 } else { 7411 return v; 7412 } 7413 set_float_exception_flags(old_exc_flags, status); 7414 float_raise(float_flag_invalid, status); 7415 return res; 7416 } 7417 7418 int16_t float64_to_int16(float64 a, float_status *status) 7419 { 7420 int64_t v; 7421 int16_t res; 7422 int old_exc_flags = get_float_exception_flags(status); 7423 7424 v = float64_to_int32(a, status); 7425 if (v < -0x8000) { 7426 res = -0x8000; 7427 } else if (v > 0x7fff) { 7428 res = 0x7fff; 7429 } else { 7430 return v; 7431 } 7432 7433 set_float_exception_flags(old_exc_flags, status); 7434 float_raise(float_flag_invalid, status); 7435 return res; 7436 } 7437 7438 uint16_t float64_to_uint16(float64 a, float_status *status) 7439 { 7440 int64_t v; 7441 uint16_t res; 7442 int old_exc_flags = get_float_exception_flags(status); 7443 7444 v = float64_to_int32(a, status); 7445 if (v < 0) { 7446 res = 0; 7447 } else if (v > 0xffff) { 7448 res = 0xffff; 7449 } else { 7450 return v; 7451 } 7452 7453 set_float_exception_flags(old_exc_flags, status); 7454 float_raise(float_flag_invalid, status); 7455 return res; 7456 } 7457 7458 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *status) 7459 { 7460 int64_t v; 7461 uint16_t res; 7462 int old_exc_flags = get_float_exception_flags(status); 7463 7464 v = float64_to_int64_round_to_zero(a, status); 7465 if (v < 0) { 7466 res = 0; 7467 } else if (v > 0xffff) { 7468 res = 0xffff; 7469 } else { 7470 return v; 7471 } 7472 set_float_exception_flags(old_exc_flags, status); 7473 float_raise(float_flag_invalid, status); 7474 return res; 7475 } 7476 7477 /*---------------------------------------------------------------------------- 7478 | Returns the result of converting the double-precision floating-point value 7479 | `a' to the 64-bit unsigned integer format. 
The conversion is 7480 | performed according to the IEC/IEEE Standard for Binary Floating-Point 7481 | Arithmetic---which means in particular that the conversion is rounded 7482 | according to the current rounding mode. If `a' is a NaN, the largest 7483 | positive integer is returned. If the conversion overflows, the 7484 | largest unsigned integer is returned. If 'a' is negative, the value is 7485 | rounded and zero is returned; negative values that do not round to zero 7486 | will raise the inexact exception. 7487 *----------------------------------------------------------------------------*/ 7488 7489 uint64_t float64_to_uint64(float64 a, float_status *status) 7490 { 7491 flag aSign; 7492 int aExp; 7493 int shiftCount; 7494 uint64_t aSig, aSigExtra; 7495 a = float64_squash_input_denormal(a, status); 7496 7497 aSig = extractFloat64Frac(a); 7498 aExp = extractFloat64Exp(a); 7499 aSign = extractFloat64Sign(a); 7500 if (aSign && (aExp > 1022)) { 7501 float_raise(float_flag_invalid, status); 7502 if (float64_is_any_nan(a)) { 7503 return LIT64(0xFFFFFFFFFFFFFFFF); 7504 } else { 7505 return 0; 7506 } 7507 } 7508 if (aExp) { 7509 aSig |= LIT64(0x0010000000000000); 7510 } 7511 shiftCount = 0x433 - aExp; 7512 if (shiftCount <= 0) { 7513 if (0x43E < aExp) { 7514 float_raise(float_flag_invalid, status); 7515 return LIT64(0xFFFFFFFFFFFFFFFF); 7516 } 7517 aSigExtra = 0; 7518 aSig <<= -shiftCount; 7519 } else { 7520 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra); 7521 } 7522 return roundAndPackUint64(aSign, aSig, aSigExtra, status); 7523 } 7524 7525 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status) 7526 { 7527 signed char current_rounding_mode = status->float_rounding_mode; 7528 set_float_rounding_mode(float_round_to_zero, status); 7529 uint64_t v = float64_to_uint64(a, status); 7530 set_float_rounding_mode(current_rounding_mode, status); 7531 return v; 7532 } 7533 7534 #define COMPARE(s, nan_exp) \ 7535 static inline int float ## s ## 
_compare_internal(float ## s a, float ## s b,\ 7536 int is_quiet, float_status *status) \ 7537 { \ 7538 flag aSign, bSign; \ 7539 uint ## s ## _t av, bv; \ 7540 a = float ## s ## _squash_input_denormal(a, status); \ 7541 b = float ## s ## _squash_input_denormal(b, status); \ 7542 \ 7543 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \ 7544 extractFloat ## s ## Frac( a ) ) || \ 7545 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \ 7546 extractFloat ## s ## Frac( b ) )) { \ 7547 if (!is_quiet || \ 7548 float ## s ## _is_signaling_nan(a, status) || \ 7549 float ## s ## _is_signaling_nan(b, status)) { \ 7550 float_raise(float_flag_invalid, status); \ 7551 } \ 7552 return float_relation_unordered; \ 7553 } \ 7554 aSign = extractFloat ## s ## Sign( a ); \ 7555 bSign = extractFloat ## s ## Sign( b ); \ 7556 av = float ## s ## _val(a); \ 7557 bv = float ## s ## _val(b); \ 7558 if ( aSign != bSign ) { \ 7559 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \ 7560 /* zero case */ \ 7561 return float_relation_equal; \ 7562 } else { \ 7563 return 1 - (2 * aSign); \ 7564 } \ 7565 } else { \ 7566 if (av == bv) { \ 7567 return float_relation_equal; \ 7568 } else { \ 7569 return 1 - 2 * (aSign ^ ( av < bv )); \ 7570 } \ 7571 } \ 7572 } \ 7573 \ 7574 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \ 7575 { \ 7576 return float ## s ## _compare_internal(a, b, 0, status); \ 7577 } \ 7578 \ 7579 int float ## s ## _compare_quiet(float ## s a, float ## s b, \ 7580 float_status *status) \ 7581 { \ 7582 return float ## s ## _compare_internal(a, b, 1, status); \ 7583 } 7584 7585 COMPARE(32, 0xff) 7586 COMPARE(64, 0x7ff) 7587 7588 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 7589 int is_quiet, float_status *status) 7590 { 7591 flag aSign, bSign; 7592 7593 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 7594 float_raise(float_flag_invalid, status); 7595 return float_relation_unordered; 7596 } 7597 if (( ( 
extractFloatx80Exp( a ) == 0x7fff ) && 7598 ( extractFloatx80Frac( a )<<1 ) ) || 7599 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 7600 ( extractFloatx80Frac( b )<<1 ) )) { 7601 if (!is_quiet || 7602 floatx80_is_signaling_nan(a, status) || 7603 floatx80_is_signaling_nan(b, status)) { 7604 float_raise(float_flag_invalid, status); 7605 } 7606 return float_relation_unordered; 7607 } 7608 aSign = extractFloatx80Sign( a ); 7609 bSign = extractFloatx80Sign( b ); 7610 if ( aSign != bSign ) { 7611 7612 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 7613 ( ( a.low | b.low ) == 0 ) ) { 7614 /* zero case */ 7615 return float_relation_equal; 7616 } else { 7617 return 1 - (2 * aSign); 7618 } 7619 } else { 7620 if (a.low == b.low && a.high == b.high) { 7621 return float_relation_equal; 7622 } else { 7623 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7624 } 7625 } 7626 } 7627 7628 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7629 { 7630 return floatx80_compare_internal(a, b, 0, status); 7631 } 7632 7633 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 7634 { 7635 return floatx80_compare_internal(a, b, 1, status); 7636 } 7637 7638 static inline int float128_compare_internal(float128 a, float128 b, 7639 int is_quiet, float_status *status) 7640 { 7641 flag aSign, bSign; 7642 7643 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7644 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7645 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7646 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7647 if (!is_quiet || 7648 float128_is_signaling_nan(a, status) || 7649 float128_is_signaling_nan(b, status)) { 7650 float_raise(float_flag_invalid, status); 7651 } 7652 return float_relation_unordered; 7653 } 7654 aSign = extractFloat128Sign( a ); 7655 bSign = extractFloat128Sign( b ); 7656 if ( aSign != bSign ) { 7657 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7658 /* zero case */ 
7659 return float_relation_equal; 7660 } else { 7661 return 1 - (2 * aSign); 7662 } 7663 } else { 7664 if (a.low == b.low && a.high == b.high) { 7665 return float_relation_equal; 7666 } else { 7667 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7668 } 7669 } 7670 } 7671 7672 int float128_compare(float128 a, float128 b, float_status *status) 7673 { 7674 return float128_compare_internal(a, b, 0, status); 7675 } 7676 7677 int float128_compare_quiet(float128 a, float128 b, float_status *status) 7678 { 7679 return float128_compare_internal(a, b, 1, status); 7680 } 7681 7682 /* min() and max() functions. These can't be implemented as 7683 * 'compare and pick one input' because that would mishandle 7684 * NaNs and +0 vs -0. 7685 * 7686 * minnum() and maxnum() functions. These are similar to the min() 7687 * and max() functions but if one of the arguments is a QNaN and 7688 * the other is numerical then the numerical argument is returned. 7689 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 7690 * and maxNum() operations. min() and max() are the typical min/max 7691 * semantics provided by many CPUs which predate that specification. 7692 * 7693 * minnummag() and maxnummag() functions correspond to minNumMag() 7694 * and minNumMag() from the IEEE-754 2008. 
7695 */ 7696 #define MINMAX(s) \ 7697 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \ 7698 int ismin, int isieee, \ 7699 int ismag, \ 7700 float_status *status) \ 7701 { \ 7702 flag aSign, bSign; \ 7703 uint ## s ## _t av, bv, aav, abv; \ 7704 a = float ## s ## _squash_input_denormal(a, status); \ 7705 b = float ## s ## _squash_input_denormal(b, status); \ 7706 if (float ## s ## _is_any_nan(a) || \ 7707 float ## s ## _is_any_nan(b)) { \ 7708 if (isieee) { \ 7709 if (float ## s ## _is_quiet_nan(a, status) && \ 7710 !float ## s ##_is_any_nan(b)) { \ 7711 return b; \ 7712 } else if (float ## s ## _is_quiet_nan(b, status) && \ 7713 !float ## s ## _is_any_nan(a)) { \ 7714 return a; \ 7715 } \ 7716 } \ 7717 return propagateFloat ## s ## NaN(a, b, status); \ 7718 } \ 7719 aSign = extractFloat ## s ## Sign(a); \ 7720 bSign = extractFloat ## s ## Sign(b); \ 7721 av = float ## s ## _val(a); \ 7722 bv = float ## s ## _val(b); \ 7723 if (ismag) { \ 7724 aav = float ## s ## _abs(av); \ 7725 abv = float ## s ## _abs(bv); \ 7726 if (aav != abv) { \ 7727 if (ismin) { \ 7728 return (aav < abv) ? a : b; \ 7729 } else { \ 7730 return (aav < abv) ? b : a; \ 7731 } \ 7732 } \ 7733 } \ 7734 if (aSign != bSign) { \ 7735 if (ismin) { \ 7736 return aSign ? a : b; \ 7737 } else { \ 7738 return aSign ? b : a; \ 7739 } \ 7740 } else { \ 7741 if (ismin) { \ 7742 return (aSign ^ (av < bv)) ? a : b; \ 7743 } else { \ 7744 return (aSign ^ (av < bv)) ? 
b : a; \ 7745 } \ 7746 } \ 7747 } \ 7748 \ 7749 float ## s float ## s ## _min(float ## s a, float ## s b, \ 7750 float_status *status) \ 7751 { \ 7752 return float ## s ## _minmax(a, b, 1, 0, 0, status); \ 7753 } \ 7754 \ 7755 float ## s float ## s ## _max(float ## s a, float ## s b, \ 7756 float_status *status) \ 7757 { \ 7758 return float ## s ## _minmax(a, b, 0, 0, 0, status); \ 7759 } \ 7760 \ 7761 float ## s float ## s ## _minnum(float ## s a, float ## s b, \ 7762 float_status *status) \ 7763 { \ 7764 return float ## s ## _minmax(a, b, 1, 1, 0, status); \ 7765 } \ 7766 \ 7767 float ## s float ## s ## _maxnum(float ## s a, float ## s b, \ 7768 float_status *status) \ 7769 { \ 7770 return float ## s ## _minmax(a, b, 0, 1, 0, status); \ 7771 } \ 7772 \ 7773 float ## s float ## s ## _minnummag(float ## s a, float ## s b, \ 7774 float_status *status) \ 7775 { \ 7776 return float ## s ## _minmax(a, b, 1, 1, 1, status); \ 7777 } \ 7778 \ 7779 float ## s float ## s ## _maxnummag(float ## s a, float ## s b, \ 7780 float_status *status) \ 7781 { \ 7782 return float ## s ## _minmax(a, b, 0, 1, 1, status); \ 7783 } 7784 7785 MINMAX(32) 7786 MINMAX(64) 7787 7788 7789 /* Multiply A by 2 raised to the power N. 
*/ 7790 float32 float32_scalbn(float32 a, int n, float_status *status) 7791 { 7792 flag aSign; 7793 int16_t aExp; 7794 uint32_t aSig; 7795 7796 a = float32_squash_input_denormal(a, status); 7797 aSig = extractFloat32Frac( a ); 7798 aExp = extractFloat32Exp( a ); 7799 aSign = extractFloat32Sign( a ); 7800 7801 if ( aExp == 0xFF ) { 7802 if ( aSig ) { 7803 return propagateFloat32NaN(a, a, status); 7804 } 7805 return a; 7806 } 7807 if (aExp != 0) { 7808 aSig |= 0x00800000; 7809 } else if (aSig == 0) { 7810 return a; 7811 } else { 7812 aExp++; 7813 } 7814 7815 if (n > 0x200) { 7816 n = 0x200; 7817 } else if (n < -0x200) { 7818 n = -0x200; 7819 } 7820 7821 aExp += n - 1; 7822 aSig <<= 7; 7823 return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status); 7824 } 7825 7826 float64 float64_scalbn(float64 a, int n, float_status *status) 7827 { 7828 flag aSign; 7829 int16_t aExp; 7830 uint64_t aSig; 7831 7832 a = float64_squash_input_denormal(a, status); 7833 aSig = extractFloat64Frac( a ); 7834 aExp = extractFloat64Exp( a ); 7835 aSign = extractFloat64Sign( a ); 7836 7837 if ( aExp == 0x7FF ) { 7838 if ( aSig ) { 7839 return propagateFloat64NaN(a, a, status); 7840 } 7841 return a; 7842 } 7843 if (aExp != 0) { 7844 aSig |= LIT64( 0x0010000000000000 ); 7845 } else if (aSig == 0) { 7846 return a; 7847 } else { 7848 aExp++; 7849 } 7850 7851 if (n > 0x1000) { 7852 n = 0x1000; 7853 } else if (n < -0x1000) { 7854 n = -0x1000; 7855 } 7856 7857 aExp += n - 1; 7858 aSig <<= 10; 7859 return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status); 7860 } 7861 7862 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7863 { 7864 flag aSign; 7865 int32_t aExp; 7866 uint64_t aSig; 7867 7868 if (floatx80_invalid_encoding(a)) { 7869 float_raise(float_flag_invalid, status); 7870 return floatx80_default_nan(status); 7871 } 7872 aSig = extractFloatx80Frac( a ); 7873 aExp = extractFloatx80Exp( a ); 7874 aSign = extractFloatx80Sign( a ); 7875 7876 if ( aExp == 0x7FFF ) { 7877 if ( 
aSig<<1 ) { 7878 return propagateFloatx80NaN(a, a, status); 7879 } 7880 return a; 7881 } 7882 7883 if (aExp == 0) { 7884 if (aSig == 0) { 7885 return a; 7886 } 7887 aExp++; 7888 } 7889 7890 if (n > 0x10000) { 7891 n = 0x10000; 7892 } else if (n < -0x10000) { 7893 n = -0x10000; 7894 } 7895 7896 aExp += n; 7897 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7898 aSign, aExp, aSig, 0, status); 7899 } 7900 7901 float128 float128_scalbn(float128 a, int n, float_status *status) 7902 { 7903 flag aSign; 7904 int32_t aExp; 7905 uint64_t aSig0, aSig1; 7906 7907 aSig1 = extractFloat128Frac1( a ); 7908 aSig0 = extractFloat128Frac0( a ); 7909 aExp = extractFloat128Exp( a ); 7910 aSign = extractFloat128Sign( a ); 7911 if ( aExp == 0x7FFF ) { 7912 if ( aSig0 | aSig1 ) { 7913 return propagateFloat128NaN(a, a, status); 7914 } 7915 return a; 7916 } 7917 if (aExp != 0) { 7918 aSig0 |= LIT64( 0x0001000000000000 ); 7919 } else if (aSig0 == 0 && aSig1 == 0) { 7920 return a; 7921 } else { 7922 aExp++; 7923 } 7924 7925 if (n > 0x10000) { 7926 n = 0x10000; 7927 } else if (n < -0x10000) { 7928 n = -0x10000; 7929 } 7930 7931 aExp += n - 1; 7932 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7933 , status); 7934 7935 } 7936