1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include <math.h> 87 #include "qemu/bitops.h" 88 #include "fpu/softfloat.h" 89 90 /* We only need stdlib for abort() */ 91 92 /*---------------------------------------------------------------------------- 93 | Primitive arithmetic functions, including multi-word arithmetic, and 94 | division and square root approximations. (Can be specialized to target if 95 | desired.) 96 *----------------------------------------------------------------------------*/ 97 #include "fpu/softfloat-macros.h" 98 99 /* 100 * Hardfloat 101 * 102 * Fast emulation of guest FP instructions is challenging for two reasons. 103 * First, FP instruction semantics are similar but not identical, particularly 104 * when handling NaNs. Second, emulating at reasonable speed the guest FP 105 * exception flags is not trivial: reading the host's flags register with a 106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp], 107 * and trapping on every FP exception is not fast nor pleasant to work with. 108 * 109 * We address these challenges by leveraging the host FPU for a subset of the 110 * operations. 
To do this we expand on the idea presented in this paper: 111 * 112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a 113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615. 114 * 115 * The idea is thus to leverage the host FPU to (1) compute FP operations 116 * and (2) identify whether FP exceptions occurred while avoiding 117 * expensive exception flag register accesses. 118 * 119 * An important optimization shown in the paper is that given that exception 120 * flags are rarely cleared by the guest, we can avoid recomputing some flags. 121 * This is particularly useful for the inexact flag, which is very frequently 122 * raised in floating-point workloads. 123 * 124 * We optimize the code further by deferring to soft-fp whenever FP exception 125 * detection might get hairy. Two examples: (1) when at least one operand is 126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result 127 * and the result is < the minimum normal. 
128 */ 129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \ 130 static inline void name(soft_t *a, float_status *s) \ 131 { \ 132 if (unlikely(soft_t ## _is_denormal(*a))) { \ 133 *a = soft_t ## _set_sign(soft_t ## _zero, \ 134 soft_t ## _is_neg(*a)); \ 135 s->float_exception_flags |= float_flag_input_denormal; \ 136 } \ 137 } 138 139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32) 140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64) 141 #undef GEN_INPUT_FLUSH__NOCHECK 142 143 #define GEN_INPUT_FLUSH1(name, soft_t) \ 144 static inline void name(soft_t *a, float_status *s) \ 145 { \ 146 if (likely(!s->flush_inputs_to_zero)) { \ 147 return; \ 148 } \ 149 soft_t ## _input_flush__nocheck(a, s); \ 150 } 151 152 GEN_INPUT_FLUSH1(float32_input_flush1, float32) 153 GEN_INPUT_FLUSH1(float64_input_flush1, float64) 154 #undef GEN_INPUT_FLUSH1 155 156 #define GEN_INPUT_FLUSH2(name, soft_t) \ 157 static inline void name(soft_t *a, soft_t *b, float_status *s) \ 158 { \ 159 if (likely(!s->flush_inputs_to_zero)) { \ 160 return; \ 161 } \ 162 soft_t ## _input_flush__nocheck(a, s); \ 163 soft_t ## _input_flush__nocheck(b, s); \ 164 } 165 166 GEN_INPUT_FLUSH2(float32_input_flush2, float32) 167 GEN_INPUT_FLUSH2(float64_input_flush2, float64) 168 #undef GEN_INPUT_FLUSH2 169 170 #define GEN_INPUT_FLUSH3(name, soft_t) \ 171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \ 172 { \ 173 if (likely(!s->flush_inputs_to_zero)) { \ 174 return; \ 175 } \ 176 soft_t ## _input_flush__nocheck(a, s); \ 177 soft_t ## _input_flush__nocheck(b, s); \ 178 soft_t ## _input_flush__nocheck(c, s); \ 179 } 180 181 GEN_INPUT_FLUSH3(float32_input_flush3, float32) 182 GEN_INPUT_FLUSH3(float64_input_flush3, float64) 183 #undef GEN_INPUT_FLUSH3 184 185 /* 186 * Choose whether to use fpclassify or float32/64_* primitives in the generated 187 * hardfloat functions. Each combination of number of inputs and float size 188 * gets its own value. 
189 */ 190 #if defined(__x86_64__) 191 # define QEMU_HARDFLOAT_1F32_USE_FP 0 192 # define QEMU_HARDFLOAT_1F64_USE_FP 1 193 # define QEMU_HARDFLOAT_2F32_USE_FP 0 194 # define QEMU_HARDFLOAT_2F64_USE_FP 1 195 # define QEMU_HARDFLOAT_3F32_USE_FP 0 196 # define QEMU_HARDFLOAT_3F64_USE_FP 1 197 #else 198 # define QEMU_HARDFLOAT_1F32_USE_FP 0 199 # define QEMU_HARDFLOAT_1F64_USE_FP 0 200 # define QEMU_HARDFLOAT_2F32_USE_FP 0 201 # define QEMU_HARDFLOAT_2F64_USE_FP 0 202 # define QEMU_HARDFLOAT_3F32_USE_FP 0 203 # define QEMU_HARDFLOAT_3F64_USE_FP 0 204 #endif 205 206 /* 207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over 208 * float{32,64}_is_infinity when !USE_FP. 209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup. 210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%. 211 */ 212 #if defined(__x86_64__) || defined(__aarch64__) 213 # define QEMU_HARDFLOAT_USE_ISINF 1 214 #else 215 # define QEMU_HARDFLOAT_USE_ISINF 0 216 #endif 217 218 /* 219 * Some targets clear the FP flags before most FP operations. This prevents 220 * the use of hardfloat, since hardfloat relies on the inexact flag being 221 * already set. 222 */ 223 #if defined(TARGET_PPC) || defined(__FAST_MATH__) 224 # if defined(__FAST_MATH__) 225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \ 226 IEEE implementation 227 # endif 228 # define QEMU_NO_HARDFLOAT 1 229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN 230 #else 231 # define QEMU_NO_HARDFLOAT 0 232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline)) 233 #endif 234 235 static inline bool can_use_fpu(const float_status *s) 236 { 237 if (QEMU_NO_HARDFLOAT) { 238 return false; 239 } 240 return likely(s->float_exception_flags & float_flag_inexact && 241 s->float_rounding_mode == float_round_nearest_even); 242 } 243 244 /* 245 * Hardfloat generation functions. Each operation can have two flavors: 246 * either using softfloat primitives (e.g. 
float32_is_zero_or_normal) for 247 * most condition checks, or native ones (e.g. fpclassify). 248 * 249 * The flavor is chosen by the callers. Instead of using macros, we rely on the 250 * compiler to propagate constants and inline everything into the callers. 251 * 252 * We only generate functions for operations with two inputs, since only 253 * these are common enough to justify consolidating them into common code. 254 */ 255 256 typedef union { 257 float32 s; 258 float h; 259 } union_float32; 260 261 typedef union { 262 float64 s; 263 double h; 264 } union_float64; 265 266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b); 267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b); 268 269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s); 270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s); 271 typedef float (*hard_f32_op2_fn)(float a, float b); 272 typedef double (*hard_f64_op2_fn)(double a, double b); 273 274 /* 2-input is-zero-or-normal */ 275 static inline bool f32_is_zon2(union_float32 a, union_float32 b) 276 { 277 if (QEMU_HARDFLOAT_2F32_USE_FP) { 278 /* 279 * Not using a temp variable for consecutive fpclassify calls ends up 280 * generating faster code. 
281 */ 282 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 283 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO); 284 } 285 return float32_is_zero_or_normal(a.s) && 286 float32_is_zero_or_normal(b.s); 287 } 288 289 static inline bool f64_is_zon2(union_float64 a, union_float64 b) 290 { 291 if (QEMU_HARDFLOAT_2F64_USE_FP) { 292 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 293 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO); 294 } 295 return float64_is_zero_or_normal(a.s) && 296 float64_is_zero_or_normal(b.s); 297 } 298 299 /* 3-input is-zero-or-normal */ 300 static inline 301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c) 302 { 303 if (QEMU_HARDFLOAT_3F32_USE_FP) { 304 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 305 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) && 306 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO); 307 } 308 return float32_is_zero_or_normal(a.s) && 309 float32_is_zero_or_normal(b.s) && 310 float32_is_zero_or_normal(c.s); 311 } 312 313 static inline 314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c) 315 { 316 if (QEMU_HARDFLOAT_3F64_USE_FP) { 317 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 318 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) && 319 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO); 320 } 321 return float64_is_zero_or_normal(a.s) && 322 float64_is_zero_or_normal(b.s) && 323 float64_is_zero_or_normal(c.s); 324 } 325 326 static inline bool f32_is_inf(union_float32 a) 327 { 328 if (QEMU_HARDFLOAT_USE_ISINF) { 329 return isinf(a.h); 330 } 331 return float32_is_infinity(a.s); 332 } 333 334 static inline bool f64_is_inf(union_float64 a) 335 { 336 if (QEMU_HARDFLOAT_USE_ISINF) { 337 return isinf(a.h); 338 } 339 return float64_is_infinity(a.s); 340 } 341 342 /* Note: @fast_test and @post can be NULL */ 343 static inline 
float32 344 float32_gen2(float32 xa, float32 xb, float_status *s, 345 hard_f32_op2_fn hard, soft_f32_op2_fn soft, 346 f32_check_fn pre, f32_check_fn post, 347 f32_check_fn fast_test, soft_f32_op2_fn fast_op) 348 { 349 union_float32 ua, ub, ur; 350 351 ua.s = xa; 352 ub.s = xb; 353 354 if (unlikely(!can_use_fpu(s))) { 355 goto soft; 356 } 357 358 float32_input_flush2(&ua.s, &ub.s, s); 359 if (unlikely(!pre(ua, ub))) { 360 goto soft; 361 } 362 if (fast_test && fast_test(ua, ub)) { 363 return fast_op(ua.s, ub.s, s); 364 } 365 366 ur.h = hard(ua.h, ub.h); 367 if (unlikely(f32_is_inf(ur))) { 368 s->float_exception_flags |= float_flag_overflow; 369 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) { 370 if (post == NULL || post(ua, ub)) { 371 goto soft; 372 } 373 } 374 return ur.s; 375 376 soft: 377 return soft(ua.s, ub.s, s); 378 } 379 380 static inline float64 381 float64_gen2(float64 xa, float64 xb, float_status *s, 382 hard_f64_op2_fn hard, soft_f64_op2_fn soft, 383 f64_check_fn pre, f64_check_fn post, 384 f64_check_fn fast_test, soft_f64_op2_fn fast_op) 385 { 386 union_float64 ua, ub, ur; 387 388 ua.s = xa; 389 ub.s = xb; 390 391 if (unlikely(!can_use_fpu(s))) { 392 goto soft; 393 } 394 395 float64_input_flush2(&ua.s, &ub.s, s); 396 if (unlikely(!pre(ua, ub))) { 397 goto soft; 398 } 399 if (fast_test && fast_test(ua, ub)) { 400 return fast_op(ua.s, ub.s, s); 401 } 402 403 ur.h = hard(ua.h, ub.h); 404 if (unlikely(f64_is_inf(ur))) { 405 s->float_exception_flags |= float_flag_overflow; 406 } else if (unlikely(fabs(ur.h) <= DBL_MIN)) { 407 if (post == NULL || post(ua, ub)) { 408 goto soft; 409 } 410 } 411 return ur.s; 412 413 soft: 414 return soft(ua.s, ub.s, s); 415 } 416 417 /*---------------------------------------------------------------------------- 418 | Returns the fraction bits of the half-precision floating-point value `a'. 
419 *----------------------------------------------------------------------------*/ 420 421 static inline uint32_t extractFloat16Frac(float16 a) 422 { 423 return float16_val(a) & 0x3ff; 424 } 425 426 /*---------------------------------------------------------------------------- 427 | Returns the exponent bits of the half-precision floating-point value `a'. 428 *----------------------------------------------------------------------------*/ 429 430 static inline int extractFloat16Exp(float16 a) 431 { 432 return (float16_val(a) >> 10) & 0x1f; 433 } 434 435 /*---------------------------------------------------------------------------- 436 | Returns the fraction bits of the single-precision floating-point value `a'. 437 *----------------------------------------------------------------------------*/ 438 439 static inline uint32_t extractFloat32Frac(float32 a) 440 { 441 return float32_val(a) & 0x007FFFFF; 442 } 443 444 /*---------------------------------------------------------------------------- 445 | Returns the exponent bits of the single-precision floating-point value `a'. 446 *----------------------------------------------------------------------------*/ 447 448 static inline int extractFloat32Exp(float32 a) 449 { 450 return (float32_val(a) >> 23) & 0xFF; 451 } 452 453 /*---------------------------------------------------------------------------- 454 | Returns the sign bit of the single-precision floating-point value `a'. 455 *----------------------------------------------------------------------------*/ 456 457 static inline flag extractFloat32Sign(float32 a) 458 { 459 return float32_val(a) >> 31; 460 } 461 462 /*---------------------------------------------------------------------------- 463 | Returns the fraction bits of the double-precision floating-point value `a'. 
464 *----------------------------------------------------------------------------*/ 465 466 static inline uint64_t extractFloat64Frac(float64 a) 467 { 468 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF); 469 } 470 471 /*---------------------------------------------------------------------------- 472 | Returns the exponent bits of the double-precision floating-point value `a'. 473 *----------------------------------------------------------------------------*/ 474 475 static inline int extractFloat64Exp(float64 a) 476 { 477 return (float64_val(a) >> 52) & 0x7FF; 478 } 479 480 /*---------------------------------------------------------------------------- 481 | Returns the sign bit of the double-precision floating-point value `a'. 482 *----------------------------------------------------------------------------*/ 483 484 static inline flag extractFloat64Sign(float64 a) 485 { 486 return float64_val(a) >> 63; 487 } 488 489 /* 490 * Classify a floating point number. Everything above float_class_qnan 491 * is a NaN so cls >= float_class_qnan is any NaN. 492 */ 493 494 typedef enum __attribute__ ((__packed__)) { 495 float_class_unclassified, 496 float_class_zero, 497 float_class_normal, 498 float_class_inf, 499 float_class_qnan, /* all NaNs from here */ 500 float_class_snan, 501 } FloatClass; 502 503 /* Simple helpers for checking if, or what kind of, NaN we have */ 504 static inline __attribute__((unused)) bool is_nan(FloatClass c) 505 { 506 return unlikely(c >= float_class_qnan); 507 } 508 509 static inline __attribute__((unused)) bool is_snan(FloatClass c) 510 { 511 return c == float_class_snan; 512 } 513 514 static inline __attribute__((unused)) bool is_qnan(FloatClass c) 515 { 516 return c == float_class_qnan; 517 } 518 519 /* 520 * Structure holding all of the decomposed parts of a float. The 521 * exponent is unbiased and the fraction is normalized. All 522 * calculations are done with a 64 bit fraction and then rounded as 523 * appropriate for the final format. 
524 * 525 * Thanks to the packed FloatClass a decent compiler should be able to 526 * fit the whole structure into registers and avoid using the stack 527 * for parameter passing. 528 */ 529 530 typedef struct { 531 uint64_t frac; 532 int32_t exp; 533 FloatClass cls; 534 bool sign; 535 } FloatParts; 536 537 #define DECOMPOSED_BINARY_POINT (64 - 2) 538 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) 539 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1) 540 541 /* Structure holding all of the relevant parameters for a format. 542 * exp_size: the size of the exponent field 543 * exp_bias: the offset applied to the exponent field 544 * exp_max: the maximum normalised exponent 545 * frac_size: the size of the fraction field 546 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 547 * The following are computed based the size of fraction 548 * frac_lsb: least significant bit of fraction 549 * frac_lsbm1: the bit below the least significant bit (for rounding) 550 * round_mask/roundeven_mask: masks used for rounding 551 * The following optional modifiers are available: 552 * arm_althp: handle ARM Alternative Half Precision 553 */ 554 typedef struct { 555 int exp_size; 556 int exp_bias; 557 int exp_max; 558 int frac_size; 559 int frac_shift; 560 uint64_t frac_lsb; 561 uint64_t frac_lsbm1; 562 uint64_t round_mask; 563 uint64_t roundeven_mask; 564 bool arm_althp; 565 } FloatFmt; 566 567 /* Expand fields based on the size of exponent and fraction */ 568 #define FLOAT_PARAMS(E, F) \ 569 .exp_size = E, \ 570 .exp_bias = ((1 << E) - 1) >> 1, \ 571 .exp_max = (1 << E) - 1, \ 572 .frac_size = F, \ 573 .frac_shift = DECOMPOSED_BINARY_POINT - F, \ 574 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \ 575 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \ 576 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \ 577 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1 578 579 static const FloatFmt 
float16_params = { 580 FLOAT_PARAMS(5, 10) 581 }; 582 583 static const FloatFmt float16_params_ahp = { 584 FLOAT_PARAMS(5, 10), 585 .arm_althp = true 586 }; 587 588 static const FloatFmt float32_params = { 589 FLOAT_PARAMS(8, 23) 590 }; 591 592 static const FloatFmt float64_params = { 593 FLOAT_PARAMS(11, 52) 594 }; 595 596 /* Unpack a float to parts, but do not canonicalize. */ 597 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) 598 { 599 const int sign_pos = fmt.frac_size + fmt.exp_size; 600 601 return (FloatParts) { 602 .cls = float_class_unclassified, 603 .sign = extract64(raw, sign_pos, 1), 604 .exp = extract64(raw, fmt.frac_size, fmt.exp_size), 605 .frac = extract64(raw, 0, fmt.frac_size), 606 }; 607 } 608 609 static inline FloatParts float16_unpack_raw(float16 f) 610 { 611 return unpack_raw(float16_params, f); 612 } 613 614 static inline FloatParts float32_unpack_raw(float32 f) 615 { 616 return unpack_raw(float32_params, f); 617 } 618 619 static inline FloatParts float64_unpack_raw(float64 f) 620 { 621 return unpack_raw(float64_params, f); 622 } 623 624 /* Pack a float from parts, but do not canonicalize. 
 */
/*
 * pack_raw() starts from p.frac, deposits p.exp into the exp_size-bit field
 * that begins at bit frac_size, then deposits p.sign into the single bit at
 * frac_size + exp_size.  Any bits of p.frac outside those two fields are
 * passed through unchanged.
 */
static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
{
    const int sign_pos = fmt.frac_size + fmt.exp_size;
    uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
    return deposit64(ret, sign_pos, 1, p.sign);
}

/* Per-format wrappers around pack_raw(). */
static inline float16 float16_pack_raw(FloatParts p)
{
    return make_float16(pack_raw(float16_params, p));
}

static inline float32 float32_pack_raw(FloatParts p)
{
    return make_float32(pack_raw(float32_params, p));
}

static inline float64 float64_pack_raw(FloatParts p)
{
    return make_float64(pack_raw(float64_params, p));
}

/*----------------------------------------------------------------------------
| Functions and definitions to determine:  (1) whether tininess for underflow
| is detected before or after rounding by default, (2) what (if anything)
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
| are propagated from function inputs to output.  These details are target-
| specific.
*----------------------------------------------------------------------------*/
#include "softfloat-specialize.h"

/*
 * Canonicalize EXP and FRAC, setting CLS.
 *
 * @part is expected to come straight from unpack_raw(): biased exponent,
 * fraction in the low frac_size bits.  The returned copy has:
 *   - cls set to inf, qnan/snan, zero or normal;
 *   - for normals, exp unbiased and frac holding the implicit bit at
 *     DECOMPOSED_BINARY_POINT;
 *   - for NaNs, frac shifted left by frac_shift (exp left untouched).
 * A denormal input is either flushed to zero (raising
 * float_flag_input_denormal) when status->flush_inputs_to_zero is set, or
 * normalized into a float_class_normal value.
 */
static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
                                  float_status *status)
{
    /* arm_althp formats have no Inf/NaN: exp_max is an ordinary exponent. */
    if (part.exp == parm->exp_max && !parm->arm_althp) {
        if (part.frac == 0) {
            part.cls = float_class_inf;
        } else {
            part.frac <<= parm->frac_shift;
            part.cls = (parts_is_snan_frac(part.frac, status)
                        ? float_class_snan : float_class_qnan);
        }
    } else if (part.exp == 0) {
        if (likely(part.frac == 0)) {
            part.cls = float_class_zero;
        } else if (status->flush_inputs_to_zero) {
            float_raise(float_flag_input_denormal, status);
            part.cls = float_class_zero;
            part.frac = 0;
        } else {
            /*
             * Denormal: shift the leading set bit up to bit 62
             * (DECOMPOSED_BINARY_POINT), hence clz64() - 1, and fold that
             * shift into the unbiased exponent.
             */
            int shift = clz64(part.frac) - 1;
            part.cls = float_class_normal;
            part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
            part.frac <<= shift;
        }
    } else {
        part.cls = float_class_normal;
        part.exp -= parm->exp_bias;
        part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
    }
    return part;
}

/* Round and uncanonicalize a floating-point number by parts. There
 * are FRAC_SHIFT bits that may require rounding at the bottom of the
 * fraction; these bits will be removed. The exponent will be biased
 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 *
 * Exception flags are accumulated in a local and raised once, through
 * float_raise(), just before returning.  On return p.exp and p.frac hold
 * the biased exponent and the right-shifted fraction, ready for pack_raw();
 * p.cls is updated when rounding turns a normal into zero or infinity.
 */

static FloatParts round_canonical(FloatParts p, float_status *s,
                                  const FloatFmt *parm)
{
    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
    const uint64_t round_mask = parm->round_mask;
    const uint64_t roundeven_mask = parm->roundeven_mask;
    const int exp_max = parm->exp_max;
    const int frac_shift = parm->frac_shift;
    uint64_t frac, inc;
    int exp, flags = 0;
    bool overflow_norm;

    frac = p.frac;
    exp = p.exp;

    switch (p.cls) {
    case float_class_normal:
        /*
         * Pick the rounding increment `inc' and, via `overflow_norm',
         * whether an overflow yields the largest normal (true) or
         * infinity (false).
         */
        switch (s->float_rounding_mode) {
        case float_round_nearest_even:
            overflow_norm = false;
            /*
             * Add half an lsb, except on an exact tie whose lsb is
             * already 0 (lsb plus discarded bits == frac_lsbm1).
             */
            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
            break;
        case float_round_ties_away:
            overflow_norm = false;
            inc = frac_lsbm1;
            break;
        case float_round_to_zero:
            overflow_norm = true;
            inc = 0;
            break;
        case float_round_up:
            /* Adding round_mask carries into the lsb iff any bit is lost. */
            inc = p.sign ? 0 : round_mask;
            overflow_norm = p.sign;
            break;
        case float_round_down:
            inc = p.sign ? round_mask : 0;
            overflow_norm = !p.sign;
            break;
        default:
            g_assert_not_reached();
        }

        exp += parm->exp_bias;
        if (likely(exp > 0)) {
            /* Biased exponent is positive: result is normal or overflows. */
            if (frac & round_mask) {
                flags |= float_flag_inexact;
                frac += inc;
                /* Rounding carried out of the implicit bit: renormalize. */
                if (frac & DECOMPOSED_OVERFLOW_BIT) {
                    frac >>= 1;
                    exp++;
                }
            }
            frac >>= frac_shift;

            if (parm->arm_althp) {
                /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
                if (unlikely(exp > exp_max)) {
                    /* Overflow.  Return the maximum normal.  */
                    /* Note: assignment, so any inexact flag is dropped. */
                    flags = float_flag_invalid;
                    exp = exp_max;
                    frac = -1;
                }
            } else if (unlikely(exp >= exp_max)) {
                flags |= float_flag_overflow | float_flag_inexact;
                if (overflow_norm) {
                    /* Largest finite value: max exponent, all-ones frac. */
                    exp = exp_max - 1;
                    frac = -1;
                } else {
                    p.cls = float_class_inf;
                    goto do_inf;
                }
            }
        } else if (s->flush_to_zero) {
            flags |= float_flag_output_denormal;
            p.cls = float_class_zero;
            goto do_zero;
        } else {
            /*
             * Denormal result.  Tininess is decided before the fraction
             * is shifted: always tiny when detected before rounding or
             * when exp < 0; otherwise (exp == 0) tiny unless adding `inc'
             * to the unshifted fraction carries into
             * DECOMPOSED_OVERFLOW_BIT.
             */
            bool is_tiny = (s->float_detect_tininess
                            == float_tininess_before_rounding)
                        || (exp < 0)
                        || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);

            /* NOTE(review): the "jamming" helper presumably keeps lost
             * bits as a sticky lsb -- confirm in softfloat-macros.h. */
            shift64RightJamming(frac, 1 - exp, &frac);
            if (frac & round_mask) {
                /* Need to recompute round-to-even.  */
                if (s->float_rounding_mode == float_round_nearest_even) {
                    inc = ((frac & roundeven_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                }
                flags |= float_flag_inexact;
                frac += inc;
            }

            /* Rounding may have carried into the implicit bit, in which
             * case the biased exponent becomes 1. */
            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
            frac >>= frac_shift;

            if (is_tiny && (flags & float_flag_inexact)) {
                flags |= float_flag_underflow;
            }
            if (exp == 0 && frac == 0) {
                p.cls = float_class_zero;
            }
        }
        break;

    case float_class_zero:
    do_zero:
        exp = 0;
        frac = 0;
        break;

    case float_class_inf:
    do_inf:
        assert(!parm->arm_althp);
        exp = exp_max;
        frac = 0;
        break;

    case float_class_qnan:
    case float_class_snan:
        assert(!parm->arm_althp);
        exp = exp_max;
        /* Undo the left shift applied to NaN fractions on unpacking. */
        frac >>= parm->frac_shift;
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    p.exp = exp;
    p.frac = frac;
    return p;
}

/* Explicit FloatFmt version */
static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
                                            const FloatFmt *params)
{
    return sf_canonicalize(float16_unpack_raw(f), params, s);
}

/* float16 unpack using the default float16_params. */
static FloatParts float16_unpack_canonical(float16 f, float_status *s)
{
    return float16a_unpack_canonical(f, s, &float16_params);
}

/* Round with an explicit FloatFmt, then pack with the float16 field layout. */
static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
                                             const FloatFmt *params)
{
    return float16_pack_raw(round_canonical(p, s, params));
}

/* float16 round-and-pack using the default float16_params. */
static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
{
    return float16a_round_pack_canonical(p, s, &float16_params);
}

/* float32: raw unpack followed by canonicalization. */
static FloatParts float32_unpack_canonical(float32 f, float_status *s)
{
    return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
}

/* float32: round to the format, then pack. */
static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
{
    return float32_pack_raw(round_canonical(p, s, &float32_params));
}

/* float64: raw unpack followed by canonicalization. */
static FloatParts float64_unpack_canonical(float64 f, float_status *s)
{
    return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
}

/* float64: round to the format, then pack. */
static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
{
    return float64_pack_raw(round_canonical(p, s, &float64_params));
}

/*
 * Produce the NaN result for a single NaN operand @a.
 *
 * A signaling NaN sets float_flag_invalid and is silenced first.  In
 * default_nan_mode the default NaN is returned in place of @a.  @a must be
 * a NaN; any other class hits g_assert_not_reached().
 */
static FloatParts return_nan(FloatParts a, float_status *s)
{
    switch (a.cls) {
    case float_class_snan:
        s->float_exception_flags |= float_flag_invalid;
        a = parts_silence_nan(a, s);
        /* fall through */
    case float_class_qnan:
        if (s->default_nan_mode) {
            return parts_default_nan(s);
        }
        break;

    default:
        g_assert_not_reached();
    }
    return a;
}

/*
 * Choose the NaN result of a two-operand operation.
 *
 * float_flag_invalid is set if either operand is a signaling NaN.  In
 * default_nan_mode the default NaN is returned.  Otherwise pickNaN()
 * decides: a nonzero return selects @b, zero keeps @a.  Its third argument
 * is true when a.frac > b.frac, or when the fractions are equal and a.sign
 * is less than b.sign.  The selected operand is silenced if signaling.
 */
static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
{
    if (is_snan(a.cls) || is_snan(b.cls)) {
        s->float_exception_flags |= float_flag_invalid;
    }

    if (s->default_nan_mode) {
        return parts_default_nan(s);
    } else {
        if (pickNaN(a.cls, b.cls,
                    a.frac > b.frac ||
                    (a.frac == b.frac && a.sign < b.sign))) {
            a = b;
        }
        if (is_snan(a.cls)) {
            return parts_silence_nan(a, s);
        }
    }
    return a;
}

/*
 * Choose the NaN result of a three-operand (multiply-add) operation.
 *
 * float_flag_invalid is set if any operand is a signaling NaN.
 * pickNaNMulAdd() returns a selector: 0 keeps @a, 1 picks @b, 2 picks @c,
 * 3 requests the default NaN.  default_nan_mode forces selector 3.  The
 * selected operand is silenced if it is signaling.
 */
static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
                                  bool inf_zero, float_status *s)
{
    int which;

    if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
        s->float_exception_flags |= float_flag_invalid;
    }

    which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);

    if (s->default_nan_mode) {
        /* Note that this check is after pickNaNMulAdd so that function
         * has an opportunity to set the Invalid flag.
         */
        which = 3;
    }

    switch (which) {
    case 0:
        break;
    case 1:
        a = b;
        break;
    case 2:
        a = c;
        break;
    case 3:
        return parts_default_nan(s);
    default:
        g_assert_not_reached();
    }

    if (is_snan(a.cls)) {
        return parts_silence_nan(a, s);
    }
    return a;
}

/*
 * Returns the result of adding or subtracting the values of the
 * floating-point values `a' and `b'.
The operation is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic.
 *
 * `subtract' flips the effective sign of `b'.  When the effective
 * signs differ the magnitudes are subtracted, otherwise they are
 * added.  NaN operands are delegated to pick_nan(); Inf - Inf raises
 * Invalid and returns the default NaN.
 */

static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
                                float_status *s)
{
    bool a_sign = a.sign;
    bool b_sign = b.sign ^ subtract;

    if (a_sign != b_sign) {
        /* Subtraction */

        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /*
             * Subtract the smaller magnitude from the larger one after
             * aligning the smaller operand to the larger exponent.
             */
            if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
                a.frac = a.frac - b.frac;
            } else {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.frac = b.frac - a.frac;
                a.exp = b.exp;
                /* |b| was larger, so the result takes the opposite sign */
                a_sign ^= 1;
            }

            if (a.frac == 0) {
                /* Exact cancellation: the zero is negative only when
                 * rounding down.
                 */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
            } else {
                /* Renormalize: shift left until exactly one leading
                 * zero bit remains above the most significant set bit.
                 */
                int shift = clz64(a.frac) - 1;
                a.frac = a.frac << shift;
                a.exp = a.exp - shift;
                a.sign = a_sign;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf) {
            if (b.cls == float_class_inf) {
                /* Inf - Inf */
                float_raise(float_flag_invalid, s);
                return parts_default_nan(s);
            }
            return a;
        }
        if (a.cls == float_class_zero && b.cls == float_class_zero) {
            /* Zeros of opposite effective sign: sign depends on rounding */
            a.sign = s->float_rounding_mode == float_round_down;
            return a;
        }
        if (a.cls == float_class_zero || b.cls == float_class_inf) {
            /* Result is b carrying its effective sign (opposite of a's) */
            b.sign = a_sign ^ 1;
            return b;
        }
        if (b.cls == float_class_zero) {
            return a;
        }
    } else {
        /* Addition */
        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Align the operand with the smaller exponent, then add */
            if (a.exp > b.exp) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
            } else if (a.exp < b.exp) {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.exp = b.exp;
            }
            a.frac += b.frac;
            if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
                /* Carry out of the fraction: shift back and bump exp */
                shift64RightJamming(a.frac, 1, &a.frac);
                a.exp += 1;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf || b.cls == float_class_zero) {
            return a;
        }
        if (b.cls == float_class_inf || a.cls == float_class_zero) {
            b.sign = b_sign;
            return b;
        }
    }
    /* Every class combination returns above */
    g_assert_not_reached();
}

/*
 * Returns the result of adding or subtracting the floating-point
 * values `a' and `b'. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 *
 * Each wrapper below unpacks its operands, calls addsub_floats() and
 * rounds/packs the result back into the source format.
 */

float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
{
    FloatParts pa = float16_unpack_canonical(a, status);
    FloatParts pb = float16_unpack_canonical(b, status);
    FloatParts pr = addsub_floats(pa, pb, false, status);

    return float16_round_pack_canonical(pr, status);
}

float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
{
    FloatParts pa = float16_unpack_canonical(a, status);
    FloatParts pb = float16_unpack_canonical(b, status);
    FloatParts pr = addsub_floats(pa, pb, true, status);

    return float16_round_pack_canonical(pr, status);
}

/* Software float32 add (subtract == false) or subtract (subtract == true). */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
{
    FloatParts pa = float32_unpack_canonical(a, status);
    FloatParts pb = float32_unpack_canonical(b, status);
    FloatParts pr = addsub_floats(pa, pb, subtract, status);

    return float32_round_pack_canonical(pr, status);
}

static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, false, status);
}

static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, true, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_addsub(float64 a,
float64 b, bool subtract, float_status *status) 1088 { 1089 FloatParts pa = float64_unpack_canonical(a, status); 1090 FloatParts pb = float64_unpack_canonical(b, status); 1091 FloatParts pr = addsub_floats(pa, pb, subtract, status); 1092 1093 return float64_round_pack_canonical(pr, status); 1094 } 1095 1096 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status) 1097 { 1098 return soft_f64_addsub(a, b, false, status); 1099 } 1100 1101 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status) 1102 { 1103 return soft_f64_addsub(a, b, true, status); 1104 } 1105 1106 static float hard_f32_add(float a, float b) 1107 { 1108 return a + b; 1109 } 1110 1111 static float hard_f32_sub(float a, float b) 1112 { 1113 return a - b; 1114 } 1115 1116 static double hard_f64_add(double a, double b) 1117 { 1118 return a + b; 1119 } 1120 1121 static double hard_f64_sub(double a, double b) 1122 { 1123 return a - b; 1124 } 1125 1126 static bool f32_addsub_post(union_float32 a, union_float32 b) 1127 { 1128 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1129 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1130 } 1131 return !(float32_is_zero(a.s) && float32_is_zero(b.s)); 1132 } 1133 1134 static bool f64_addsub_post(union_float64 a, union_float64 b) 1135 { 1136 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1137 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1138 } else { 1139 return !(float64_is_zero(a.s) && float64_is_zero(b.s)); 1140 } 1141 } 1142 1143 static float32 float32_addsub(float32 a, float32 b, float_status *s, 1144 hard_f32_op2_fn hard, soft_f32_op2_fn soft) 1145 { 1146 return float32_gen2(a, b, s, hard, soft, 1147 f32_is_zon2, f32_addsub_post, NULL, NULL); 1148 } 1149 1150 static float64 float64_addsub(float64 a, float64 b, float_status *s, 1151 hard_f64_op2_fn hard, soft_f64_op2_fn soft) 1152 { 1153 return float64_gen2(a, b, s, hard, soft, 1154 f64_is_zon2, f64_addsub_post, NULL, NULL); 1155 } 1156 1157 float32 
QEMU_FLATTEN
float32_add(float32 a, float32 b, float_status *s)
{
    /* Public float32 addition: dispatches through float32_addsub() */
    return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
}

/* Public float32 subtraction. */
float32 QEMU_FLATTEN
float32_sub(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
}

/* Public float64 addition. */
float64 QEMU_FLATTEN
float64_add(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
}

/* Public float64 subtraction. */
float64 QEMU_FLATTEN
float64_sub(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
}

/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b'. The operation is performed according to the IEC/IEEE Standard
 * for Binary Floating-Point Arithmetic.
 *
 * The result sign is always the XOR of the operand signs.  NaN operands
 * are delegated to pick_nan(); Inf * 0 raises Invalid and returns the
 * default NaN.
 */

static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t hi, lo;
        int exp = a.exp + b.exp;

        /* Full 128-bit product, then shift back down so that the
         * fraction fits in the low word again.
         */
        mul64To128(a.frac, b.frac, &hi, &lo);
        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
        if (lo & DECOMPOSED_OVERFLOW_BIT) {
            /* Product overflowed by one bit: renormalize */
            shift64RightJamming(lo, 1, &lo);
            exp += 1;
        }

        /* Re-use a */
        a.exp = exp;
        a.sign = sign;
        a.frac = lo;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return pick_nan(a, b, s);
    }
    /* Inf * Zero == NaN */
    if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
        (a.cls == float_class_zero && b.cls == float_class_inf)) {
        s->float_exception_flags |= float_flag_invalid;
        return parts_default_nan(s);
    }
    /* Multiply by 0 or Inf */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    if (b.cls == float_class_inf || b.cls == float_class_zero) {
        b.sign = sign;
        return b;
    }
    g_assert_not_reached();
}

/* float16 multiply: unpack, mul_floats(), round and repack. */
float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
{
    FloatParts pa = float16_unpack_canonical(a, status);
    FloatParts pb = float16_unpack_canonical(b, status);
    FloatParts pr = mul_floats(pa, pb, status);

    return float16_round_pack_canonical(pr, status);
}

/* Software float32 multiply. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_mul(float32 a, float32 b, float_status *status)
{
    FloatParts pa = float32_unpack_canonical(a, status);
    FloatParts pb = float32_unpack_canonical(b, status);
    FloatParts pr = mul_floats(pa, pb, status);

    return float32_round_pack_canonical(pr, status);
}

/* Software float64 multiply. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_mul(float64 a, float64 b, float_status *status)
{
    FloatParts pa = float64_unpack_canonical(a, status);
    FloatParts pb = float64_unpack_canonical(b, status);
    FloatParts pr = mul_floats(pa, pb, status);

    return float64_round_pack_canonical(pr, status);
}

/* Host floating-point multiply primitives. */
static float hard_f32_mul(float a, float b)
{
    return a * b;
}

static double hard_f64_mul(double a, double b)
{
    return a * b;
}

/* Fast-path predicate: true when either float32 operand is zero. */
static bool f32_mul_fast_test(union_float32 a, union_float32 b)
{
    return float32_is_zero(a.s) || float32_is_zero(b.s);
}

/* Fast-path predicate: true when either float64 operand is zero. */
static bool f64_mul_fast_test(union_float64 a, union_float64 b)
{
    return float64_is_zero(a.s) || float64_is_zero(b.s);
}

/* Fast-path result: a zero whose sign is the XOR of the operand signs. */
static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s)
{
    bool signbit = float32_is_neg(a) ^ float32_is_neg(b);

    return float32_set_sign(float32_zero, signbit);
}

/* float64 counterpart of f32_mul_fast_op(). */
static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s)
{
    bool signbit = float64_is_neg(a) ^ float64_is_neg(b);

    return float64_set_sign(float64_zero, signbit);
}

/* Public float32 multiply: float32_gen2() with the zero fast path. */
float32 QEMU_FLATTEN
float32_mul(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
                        f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op);
}

/* Public float64 multiply: float64_gen2() with the zero fast path. */
float64 QEMU_FLATTEN
float64_mul(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
                        f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op);
}

/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b' then adding 'c', with no intermediate rounding step after the
 * multiplication. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 * The flags argument allows the caller to select negation of the
 * addend, the intermediate product, or the final result. (The
 * difference between this and having the caller do a separate
 * negation is that negating externally will flip the sign bit on
 * NaNs.)
 *
 * float_muladd_halve_result additionally decrements the exponent of a
 * non-zero result by one.
 */

static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
                                int flags, float_status *s)
{
    /* True when one of a/b is an infinity and the other is a zero */
    bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
        ((1 << float_class_inf) | (1 << float_class_zero));
    bool p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
        return pick_nan_muladd(a, b, c, inf_zero, s);
    }

    if (inf_zero) {
        s->float_exception_flags |= float_flag_invalid;
        return parts_default_nan(s);
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Classify the (not yet computed) product a * b */
    if (a.cls == float_class_inf || b.cls == float_class_inf) {
        p_class = float_class_inf;
    } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    if (c.cls == float_class_inf) {
        if (p_class == float_class_inf && p_sign != c.sign) {
            /* Inf + (-Inf) */
            s->float_exception_flags |= float_flag_invalid;
            return parts_default_nan(s);
        } else {
            a.cls = float_class_inf;
            a.sign = c.sign ^ sign_flip;
            return a;
        }
    }

    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    if (p_class == float_class_zero) {
        /* Zero product: the result is c, or a suitably signed zero */
        if (c.cls == float_class_zero) {
            if (p_sign != c.sign) {
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
     * result.
     */
    mul64To128(a.frac, b.frac, &hi, &lo);
    /* binary point now at bit 124 */

    /* check for overflow */
    if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
        shift128RightJamming(hi, lo, 1, &hi, &lo);
        p_exp += 1;
    }

    /* + add/sub */
    if (c.cls == float_class_zero) {
        /* move binary point back to 62 */
        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
    } else {
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                /* c has the larger (or equal) exponent: bring the
                 * product down to c's scale and binary point in one
                 * shift, then add in 64 bits.
                 */
                shift128RightJamming(hi, lo,
                                     DECOMPOSED_BINARY_POINT - exp_diff,
                                     &hi, &lo);
                lo += c.frac;
                p_exp = c.exp;
            } else {
                uint64_t c_hi, c_lo;
                /* shift c to the same binary point as the product (124) */
                c_hi = c.frac >> 2;
                c_lo = 0;
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                add128(hi, lo, c_hi, c_lo, &hi, &lo);
                /* move binary point back to 62 */
                shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
            }

            if (lo & DECOMPOSED_OVERFLOW_BIT) {
                shift64RightJamming(lo, 1, &lo);
                p_exp += 1;
            }

        } else {
            /* Subtraction */
            uint64_t c_hi, c_lo;
            /* make C binary point match product at bit 124 */
            c_hi = c.frac >> 2;
            c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    /* Same exponent and product >= c */
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    /* c is larger: compute c - product and flip sign */
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            if (hi == 0 && lo == 0) {
                /* Exact cancellation: sign depends on rounding mode */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                int shift;
                /* Count leading zeros of the 128-bit difference */
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Normalizing to a binary point of 124 is the
                   correct adjust for the exponent.  However since we're
                   shifting, we might as well put the binary point back
                   at 62 where we really want it.  Therefore shift as
                   if we're leaving 1 bit at the top of the word, but
                   adjust the exponent as if we're leaving 3 bits.  */
                shift -= 1;
                if (shift >= 64) {
                    lo = lo << (shift - 64);
                } else {
                    hi = (hi << shift) | (lo >> (64 - shift));
                    /* Keep the high word as the fraction and fold any
                     * remaining low bits into its least significant bit.
                     */
                    lo = hi | ((lo << shift) != 0);
                }
                p_exp -= shift - 2;
            }
        }
    }

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = lo;

    return a;
}

/* float16 fused multiply-add: unpack, muladd_floats(), round and repack. */
float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
                                    int flags, float_status *status)
{
    FloatParts pa = float16_unpack_canonical(a, status);
    FloatParts pb = float16_unpack_canonical(b, status);
    FloatParts pc = float16_unpack_canonical(c, status);
    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);

    return float16_round_pack_canonical(pr, status);
}

/* float32 fused multiply-add. */
float32 QEMU_FLATTEN float32_muladd(float32 a, float32 b, float32 c,
                                    int flags, float_status *status)
{
    FloatParts pa = float32_unpack_canonical(a, status);
    FloatParts pb = float32_unpack_canonical(b, status);
    FloatParts pc = float32_unpack_canonical(c, status);
    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);

    return float32_round_pack_canonical(pr, status);
}

/* float64 fused multiply-add. */
float64 QEMU_FLATTEN float64_muladd(float64 a, float64 b, float64 c,
                                    int flags, float_status *status)
{
    FloatParts pa = float64_unpack_canonical(a, status);
    FloatParts pb = float64_unpack_canonical(b, status);
    FloatParts
pc = float64_unpack_canonical(c, status);
    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);

    return float64_round_pack_canonical(pr, status);
}

/*
 * Returns the result of dividing the floating-point value `a' by the
 * corresponding value `b'. The operation is performed according to
 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 *
 * The result sign is the XOR of the operand signs.  0/0 and Inf/Inf
 * raise Invalid and return the default NaN; a finite non-zero value
 * divided by zero raises DivByZero and returns an infinity.
 */

static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t n0, n1, q, r;
        int exp = a.exp - b.exp;

        /*
         * We want a 2*N / N-bit division to produce exactly an N-bit
         * result, so that we do not lose any precision and so that we
         * do not have to renormalize afterward.  If A.frac < B.frac,
         * then division would produce an (N-1)-bit result; shift A left
         * by one to produce an N-bit result, and decrement the
         * exponent to match.
         *
         * The udiv_qrnnd algorithm that we're using requires normalization,
         * i.e. the msb of the denominator must be set.  Since we know that
         * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
         * by one (more), and the remainder must be shifted right by one.
         */
        if (a.frac < b.frac) {
            exp -= 1;
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
        } else {
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
        }
        q = udiv_qrnnd(&r, n1, n0, b.frac << 1);

        /*
         * Set lsb if there is a remainder, to set inexact.
         * As mentioned above, to find the actual value of the remainder we
         * would need to shift right, but (1) we are only concerned about
         * non-zero-ness, and (2) the remainder will always be even because
         * both inputs to the division primitive are even.
         */
        a.frac = q | (r != 0);
        a.sign = sign;
        a.exp = exp;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return pick_nan(a, b, s);
    }
    /* 0/0 or Inf/Inf */
    if (a.cls == b.cls
        &&
        (a.cls == float_class_inf || a.cls == float_class_zero)) {
        s->float_exception_flags |= float_flag_invalid;
        return parts_default_nan(s);
    }
    /* Inf / x or 0 / x */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    /* Div 0 => Inf */
    if (b.cls == float_class_zero) {
        s->float_exception_flags |= float_flag_divbyzero;
        a.cls = float_class_inf;
        a.sign = sign;
        return a;
    }
    /* Div by Inf */
    if (b.cls == float_class_inf) {
        a.cls = float_class_zero;
        a.sign = sign;
        return a;
    }
    g_assert_not_reached();
}

/* float16 divide: unpack, div_floats(), round and repack. */
float16 float16_div(float16 a, float16 b, float_status *status)
{
    FloatParts pa = float16_unpack_canonical(a, status);
    FloatParts pb = float16_unpack_canonical(b, status);
    FloatParts pr = div_floats(pa, pb, status);

    return float16_round_pack_canonical(pr, status);
}

/* Software float32 divide. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_div(float32 a, float32 b, float_status *status)
{
    FloatParts pa = float32_unpack_canonical(a, status);
    FloatParts pb = float32_unpack_canonical(b, status);
    FloatParts pr = div_floats(pa, pb, status);

    return float32_round_pack_canonical(pr, status);
}

/* Software float64 divide. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_div(float64 a, float64 b, float_status *status)
{
    FloatParts pa = float64_unpack_canonical(a, status);
    FloatParts pb = float64_unpack_canonical(b, status);
    FloatParts pr = div_floats(pa, pb, status);

    return float64_round_pack_canonical(pr, status);
}

static float hard_f32_div(float a, float b)
{
    return a
/ b; 1654 } 1655 1656 static double hard_f64_div(double a, double b) 1657 { 1658 return a / b; 1659 } 1660 1661 static bool f32_div_pre(union_float32 a, union_float32 b) 1662 { 1663 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1664 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 1665 fpclassify(b.h) == FP_NORMAL; 1666 } 1667 return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s); 1668 } 1669 1670 static bool f64_div_pre(union_float64 a, union_float64 b) 1671 { 1672 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1673 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 1674 fpclassify(b.h) == FP_NORMAL; 1675 } 1676 return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s); 1677 } 1678 1679 static bool f32_div_post(union_float32 a, union_float32 b) 1680 { 1681 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1682 return fpclassify(a.h) != FP_ZERO; 1683 } 1684 return !float32_is_zero(a.s); 1685 } 1686 1687 static bool f64_div_post(union_float64 a, union_float64 b) 1688 { 1689 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1690 return fpclassify(a.h) != FP_ZERO; 1691 } 1692 return !float64_is_zero(a.s); 1693 } 1694 1695 float32 QEMU_FLATTEN 1696 float32_div(float32 a, float32 b, float_status *s) 1697 { 1698 return float32_gen2(a, b, s, hard_f32_div, soft_f32_div, 1699 f32_div_pre, f32_div_post, NULL, NULL); 1700 } 1701 1702 float64 QEMU_FLATTEN 1703 float64_div(float64 a, float64 b, float_status *s) 1704 { 1705 return float64_gen2(a, b, s, hard_f64_div, soft_f64_div, 1706 f64_div_pre, f64_div_post, NULL, NULL); 1707 } 1708 1709 /* 1710 * Float to Float conversions 1711 * 1712 * Returns the result of converting one float format to another. The 1713 * conversion is performed according to the IEC/IEEE Standard for 1714 * Binary Floating-Point Arithmetic. 1715 * 1716 * The float_to_float helper only needs to take care of raising 1717 * invalid exceptions and handling the conversion on NaNs. 
*/

/*
 * Adjusts the unpacked value `a' for conversion to the format `dstf'.
 *
 * When dstf->arm_althp is set the destination has no NaN or Inf
 * encodings: NaNs become a zero that keeps the input sign, infinities
 * become the largest normal value, and both raise Invalid.  For any
 * other destination a signaling NaN raises Invalid and is silenced,
 * and default_nan_mode replaces any NaN with the default NaN.  All
 * remaining values pass through unchanged.
 */
static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
                                 float_status *s)
{
    if (dstf->arm_althp) {
        switch (a.cls) {
        case float_class_qnan:
        case float_class_snan:
            /* There is no NaN in the destination format.  Raise Invalid
             * and return a zero with the sign of the input NaN.
             */
            s->float_exception_flags |= float_flag_invalid;
            a.cls = float_class_zero;
            a.frac = 0;
            a.exp = 0;
            break;

        case float_class_inf:
            /* There is no Inf in the destination format.  Raise Invalid
             * and return the maximum normal with the correct sign.
             */
            s->float_exception_flags |= float_flag_invalid;
            a.cls = float_class_normal;
            a.exp = dstf->exp_max;
            /* All frac_size fraction bits set, at the format's position */
            a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
            break;

        default:
            break;
        }
    } else if (is_nan(a.cls)) {
        if (is_snan(a.cls)) {
            s->float_exception_flags |= float_flag_invalid;
            a = parts_silence_nan(a, s);
        }
        if (s->default_nan_mode) {
            return parts_default_nan(s);
        }
    }
    return a;
}

/*
 * Half to single precision.  `ieee' selects float16_params; otherwise
 * float16_params_ahp is used for the source format.
 */
float32 float16_to_float32(float16 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts p = float16a_unpack_canonical(a, s, fmt16);
    FloatParts pr = float_to_float(p, &float32_params, s);
    return float32_round_pack_canonical(pr, s);
}

/* Half to double precision; `ieee' as for float16_to_float32(). */
float64 float16_to_float64(float16 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts p = float16a_unpack_canonical(a, s, fmt16);
    FloatParts pr = float_to_float(p, &float64_params, s);
    return float64_round_pack_canonical(pr, s);
}

/* Single to half precision; `ieee' selects the destination format. */
float16 float32_to_float16(float32 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ?
&float16_params : &float16_params_ahp; 1780 FloatParts p = float32_unpack_canonical(a, s); 1781 FloatParts pr = float_to_float(p, fmt16, s); 1782 return float16a_round_pack_canonical(pr, s, fmt16); 1783 } 1784 1785 float64 float32_to_float64(float32 a, float_status *s) 1786 { 1787 FloatParts p = float32_unpack_canonical(a, s); 1788 FloatParts pr = float_to_float(p, &float64_params, s); 1789 return float64_round_pack_canonical(pr, s); 1790 } 1791 1792 float16 float64_to_float16(float64 a, bool ieee, float_status *s) 1793 { 1794 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1795 FloatParts p = float64_unpack_canonical(a, s); 1796 FloatParts pr = float_to_float(p, fmt16, s); 1797 return float16a_round_pack_canonical(pr, s, fmt16); 1798 } 1799 1800 float32 float64_to_float32(float64 a, float_status *s) 1801 { 1802 FloatParts p = float64_unpack_canonical(a, s); 1803 FloatParts pr = float_to_float(p, &float32_params, s); 1804 return float32_round_pack_canonical(pr, s); 1805 } 1806 1807 /* 1808 * Rounds the floating-point value `a' to an integer, and returns the 1809 * result as a floating-point value. The operation is performed 1810 * according to the IEC/IEEE Standard for Binary Floating-Point 1811 * Arithmetic. 
*/

/*
 * `rmode' is the rounding mode to apply.  `scale' is clamped to
 * [-0x10000, 0x10000] and added to the exponent before rounding, i.e.
 * the value is multiplied by 2**scale.  NaNs are handled by
 * return_nan(); zeros and infinities are returned unchanged.  Inexact
 * is raised whenever fraction bits are discarded.
 */
static FloatParts round_to_int(FloatParts a, int rmode,
                               int scale, float_status *s)
{
    switch (a.cls) {
    case float_class_qnan:
    case float_class_snan:
        return return_nan(a, s);

    case float_class_zero:
    case float_class_inf:
        /* already "integral" */
        break;

    case float_class_normal:
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        a.exp += scale;

        if (a.exp >= DECOMPOSED_BINARY_POINT) {
            /* already integral */
            break;
        }
        if (a.exp < 0) {
            bool one;
            /* all fractional */
            s->float_exception_flags |= float_flag_inexact;
            /* Decide whether |a| < 1 rounds to magnitude 1 or to 0 */
            switch (rmode) {
            case float_round_nearest_even:
                /* strictly above one half */
                one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_ties_away:
                /* one half or above */
                one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_to_zero:
                one = false;
                break;
            case float_round_up:
                one = !a.sign;
                break;
            case float_round_down:
                one = a.sign;
                break;
            default:
                g_assert_not_reached();
            }

            if (one) {
                a.frac = DECOMPOSED_IMPLICIT_BIT;
                a.exp = 0;
            } else {
                a.cls = float_class_zero;
            }
        } else {
            /* frac_lsb: fraction bit with weight one (the integer lsb) */
            uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
            /* frac_lsbm1: the bit just below it (weight one half) */
            uint64_t frac_lsbm1 = frac_lsb >> 1;
            /* rnd_even_mask: integer lsb plus every bit below it */
            uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
            /* rnd_mask: only the bits below the integer lsb */
            uint64_t rnd_mask = rnd_even_mask >> 1;
            uint64_t inc;

            switch (rmode) {
            case float_round_nearest_even:
                /* Add one half unless the value is an exact tie with
                 * an even integer part.
                 */
                inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
                break;
            case float_round_ties_away:
                inc = frac_lsbm1;
                break;
            case float_round_to_zero:
                inc = 0;
                break;
            case float_round_up:
                inc = a.sign ? 0 : rnd_mask;
                break;
            case float_round_down:
                inc = a.sign ? rnd_mask : 0;
                break;
            default:
                g_assert_not_reached();
            }

            if (a.frac & rnd_mask) {
                /* There are fraction bits to discard */
                s->float_exception_flags |= float_flag_inexact;
                a.frac += inc;
                a.frac &= ~rnd_mask;
                if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
                    /* Rounding carried out of the fraction */
                    a.frac >>= 1;
                    a.exp++;
                }
            }
        }
        break;
    default:
        g_assert_not_reached();
    }
    return a;
}

/*
 * Public round-to-integral entry points: use the current rounding mode
 * from the status and a scale of zero.
 */
float16 float16_round_to_int(float16 a, float_status *s)
{
    FloatParts pa = float16_unpack_canonical(a, s);
    FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float16_round_pack_canonical(pr, s);
}

float32 float32_round_to_int(float32 a, float_status *s)
{
    FloatParts pa = float32_unpack_canonical(a, s);
    FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float32_round_pack_canonical(pr, s);
}

float64 float64_round_to_int(float64 a, float_status *s)
{
    FloatParts pa = float64_unpack_canonical(a, s);
    FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float64_round_pack_canonical(pr, s);
}

/*
 * Returns the result of converting the floating-point value `a' to
 * the two's complement integer format. The conversion is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic---which means in particular that the conversion is
 * rounded according to the current rounding mode. If `a' is a NaN,
 * the largest positive integer is returned. Otherwise, if the
 * conversion overflows, the largest integer with the same sign as `a'
 * is returned.
*/

/*
 * `min' and `max' bound the destination integer type.  Whenever Invalid
 * is raised the flags are rebuilt from the value saved on entry, so
 * any flag set by round_to_int() for that input is discarded.
 */
static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
                                     int64_t min, int64_t max,
                                     float_status *s)
{
    uint64_t r;
    int orig_flags = get_float_exception_flags(s);
    FloatParts p = round_to_int(in, rmode, scale, s);

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? min : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        /* Extract the magnitude of the rounded value into r */
        if (p.exp < DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
            r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
        } else {
            /* Too large to shift: force the overflow paths below */
            r = UINT64_MAX;
        }
        if (p.sign) {
            /* -(uint64_t) min is the magnitude of the most negative value */
            if (r <= -(uint64_t) min) {
                return -r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return min;
            }
        } else {
            if (r <= max) {
                return r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return max;
            }
        }
    default:
        g_assert_not_reached();
    }
}

/*
 * Float to signed integer conversions with an explicit rounding mode
 * and scale; each supplies the limits of its destination type.
 */
int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
                                float_status *s)
{
    return round_to_int_and_pack(float16_unpack_canonical(a, s),
                                 rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
                                float_status *s)
{
    return round_to_int_and_pack(float16_unpack_canonical(a, s),
                                 rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
                                float_status *s)
{
    return round_to_int_and_pack(float16_unpack_canonical(a, s),
                                 rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t float32_to_int16_scalbn(float32 a, int
rmode, int scale, 2009 float_status *s) 2010 { 2011 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2012 rmode, scale, INT16_MIN, INT16_MAX, s); 2013 } 2014 2015 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale, 2016 float_status *s) 2017 { 2018 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2019 rmode, scale, INT32_MIN, INT32_MAX, s); 2020 } 2021 2022 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale, 2023 float_status *s) 2024 { 2025 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2026 rmode, scale, INT64_MIN, INT64_MAX, s); 2027 } 2028 2029 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale, 2030 float_status *s) 2031 { 2032 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2033 rmode, scale, INT16_MIN, INT16_MAX, s); 2034 } 2035 2036 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale, 2037 float_status *s) 2038 { 2039 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2040 rmode, scale, INT32_MIN, INT32_MAX, s); 2041 } 2042 2043 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale, 2044 float_status *s) 2045 { 2046 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2047 rmode, scale, INT64_MIN, INT64_MAX, s); 2048 } 2049 2050 int16_t float16_to_int16(float16 a, float_status *s) 2051 { 2052 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2053 } 2054 2055 int32_t float16_to_int32(float16 a, float_status *s) 2056 { 2057 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2058 } 2059 2060 int64_t float16_to_int64(float16 a, float_status *s) 2061 { 2062 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2063 } 2064 2065 int16_t float32_to_int16(float32 a, float_status *s) 2066 { 2067 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2068 } 2069 2070 int32_t float32_to_int32(float32 a, float_status *s) 2071 { 2072 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, 
s); 2073 } 2074 2075 int64_t float32_to_int64(float32 a, float_status *s) 2076 { 2077 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2078 } 2079 2080 int16_t float64_to_int16(float64 a, float_status *s) 2081 { 2082 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2083 } 2084 2085 int32_t float64_to_int32(float64 a, float_status *s) 2086 { 2087 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2088 } 2089 2090 int64_t float64_to_int64(float64 a, float_status *s) 2091 { 2092 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2093 } 2094 2095 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s) 2096 { 2097 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s); 2098 } 2099 2100 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s) 2101 { 2102 return float16_to_int32_scalbn(a, float_round_to_zero, 0, s); 2103 } 2104 2105 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s) 2106 { 2107 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s); 2108 } 2109 2110 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s) 2111 { 2112 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s); 2113 } 2114 2115 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s) 2116 { 2117 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s); 2118 } 2119 2120 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s) 2121 { 2122 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s); 2123 } 2124 2125 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s) 2126 { 2127 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s); 2128 } 2129 2130 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s) 2131 { 2132 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s); 2133 } 2134 2135 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s) 2136 { 2137 return float64_to_int64_scalbn(a, 
float_round_to_zero, 0, s); 2138 } 2139 2140 /* 2141 * Returns the result of converting the floating-point value `a' to 2142 * the unsigned integer format. The conversion is performed according 2143 * to the IEC/IEEE Standard for Binary Floating-Point 2144 * Arithmetic---which means in particular that the conversion is 2145 * rounded according to the current rounding mode. If `a' is a NaN, 2146 * the largest unsigned integer is returned. Otherwise, if the 2147 * conversion overflows, the largest unsigned integer is returned. If 2148 * the 'a' is negative, the result is rounded and zero is returned; 2149 * values that do not round to zero will raise the inexact exception 2150 * flag. 2151 */ 2152 2153 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale, 2154 uint64_t max, float_status *s) 2155 { 2156 int orig_flags = get_float_exception_flags(s); 2157 FloatParts p = round_to_int(in, rmode, scale, s); 2158 uint64_t r; 2159 2160 switch (p.cls) { 2161 case float_class_snan: 2162 case float_class_qnan: 2163 s->float_exception_flags = orig_flags | float_flag_invalid; 2164 return max; 2165 case float_class_inf: 2166 s->float_exception_flags = orig_flags | float_flag_invalid; 2167 return p.sign ? 0 : max; 2168 case float_class_zero: 2169 return 0; 2170 case float_class_normal: 2171 if (p.sign) { 2172 s->float_exception_flags = orig_flags | float_flag_invalid; 2173 return 0; 2174 } 2175 2176 if (p.exp < DECOMPOSED_BINARY_POINT) { 2177 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2178 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 2179 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 2180 } else { 2181 s->float_exception_flags = orig_flags | float_flag_invalid; 2182 return max; 2183 } 2184 2185 /* For uint64 this will never trip, but if p.exp is too large 2186 * to shift a decomposed fraction we shall have exited via the 2187 * 3rd leg above. 
2188 */ 2189 if (r > max) { 2190 s->float_exception_flags = orig_flags | float_flag_invalid; 2191 return max; 2192 } 2193 return r; 2194 default: 2195 g_assert_not_reached(); 2196 } 2197 } 2198 2199 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale, 2200 float_status *s) 2201 { 2202 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2203 rmode, scale, UINT16_MAX, s); 2204 } 2205 2206 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale, 2207 float_status *s) 2208 { 2209 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2210 rmode, scale, UINT32_MAX, s); 2211 } 2212 2213 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale, 2214 float_status *s) 2215 { 2216 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2217 rmode, scale, UINT64_MAX, s); 2218 } 2219 2220 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale, 2221 float_status *s) 2222 { 2223 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2224 rmode, scale, UINT16_MAX, s); 2225 } 2226 2227 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale, 2228 float_status *s) 2229 { 2230 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2231 rmode, scale, UINT32_MAX, s); 2232 } 2233 2234 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale, 2235 float_status *s) 2236 { 2237 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2238 rmode, scale, UINT64_MAX, s); 2239 } 2240 2241 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale, 2242 float_status *s) 2243 { 2244 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2245 rmode, scale, UINT16_MAX, s); 2246 } 2247 2248 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale, 2249 float_status *s) 2250 { 2251 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2252 rmode, scale, UINT32_MAX, s); 2253 } 2254 2255 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale, 2256 
float_status *s) 2257 { 2258 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2259 rmode, scale, UINT64_MAX, s); 2260 } 2261 2262 uint16_t float16_to_uint16(float16 a, float_status *s) 2263 { 2264 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2265 } 2266 2267 uint32_t float16_to_uint32(float16 a, float_status *s) 2268 { 2269 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2270 } 2271 2272 uint64_t float16_to_uint64(float16 a, float_status *s) 2273 { 2274 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2275 } 2276 2277 uint16_t float32_to_uint16(float32 a, float_status *s) 2278 { 2279 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2280 } 2281 2282 uint32_t float32_to_uint32(float32 a, float_status *s) 2283 { 2284 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2285 } 2286 2287 uint64_t float32_to_uint64(float32 a, float_status *s) 2288 { 2289 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2290 } 2291 2292 uint16_t float64_to_uint16(float64 a, float_status *s) 2293 { 2294 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2295 } 2296 2297 uint32_t float64_to_uint32(float64 a, float_status *s) 2298 { 2299 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2300 } 2301 2302 uint64_t float64_to_uint64(float64 a, float_status *s) 2303 { 2304 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2305 } 2306 2307 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s) 2308 { 2309 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2310 } 2311 2312 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s) 2313 { 2314 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2315 } 2316 2317 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s) 2318 { 2319 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2320 } 2321 2322 uint16_t 
float32_to_uint16_round_to_zero(float32 a, float_status *s) 2323 { 2324 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2325 } 2326 2327 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s) 2328 { 2329 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2330 } 2331 2332 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s) 2333 { 2334 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2335 } 2336 2337 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s) 2338 { 2339 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2340 } 2341 2342 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s) 2343 { 2344 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2345 } 2346 2347 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s) 2348 { 2349 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2350 } 2351 2352 /* 2353 * Integer to float conversions 2354 * 2355 * Returns the result of converting the two's complement integer `a' 2356 * to the floating-point format. The conversion is performed according 2357 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2358 */ 2359 2360 static FloatParts int_to_float(int64_t a, int scale, float_status *status) 2361 { 2362 FloatParts r = { .sign = false }; 2363 2364 if (a == 0) { 2365 r.cls = float_class_zero; 2366 } else { 2367 uint64_t f = a; 2368 int shift; 2369 2370 r.cls = float_class_normal; 2371 if (a < 0) { 2372 f = -f; 2373 r.sign = true; 2374 } 2375 shift = clz64(f) - 1; 2376 scale = MIN(MAX(scale, -0x10000), 0x10000); 2377 2378 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2379 r.frac = (shift < 0 ? 
DECOMPOSED_IMPLICIT_BIT : f << shift); 2380 } 2381 2382 return r; 2383 } 2384 2385 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status) 2386 { 2387 FloatParts pa = int_to_float(a, scale, status); 2388 return float16_round_pack_canonical(pa, status); 2389 } 2390 2391 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status) 2392 { 2393 return int64_to_float16_scalbn(a, scale, status); 2394 } 2395 2396 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status) 2397 { 2398 return int64_to_float16_scalbn(a, scale, status); 2399 } 2400 2401 float16 int64_to_float16(int64_t a, float_status *status) 2402 { 2403 return int64_to_float16_scalbn(a, 0, status); 2404 } 2405 2406 float16 int32_to_float16(int32_t a, float_status *status) 2407 { 2408 return int64_to_float16_scalbn(a, 0, status); 2409 } 2410 2411 float16 int16_to_float16(int16_t a, float_status *status) 2412 { 2413 return int64_to_float16_scalbn(a, 0, status); 2414 } 2415 2416 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status) 2417 { 2418 FloatParts pa = int_to_float(a, scale, status); 2419 return float32_round_pack_canonical(pa, status); 2420 } 2421 2422 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status) 2423 { 2424 return int64_to_float32_scalbn(a, scale, status); 2425 } 2426 2427 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status) 2428 { 2429 return int64_to_float32_scalbn(a, scale, status); 2430 } 2431 2432 float32 int64_to_float32(int64_t a, float_status *status) 2433 { 2434 return int64_to_float32_scalbn(a, 0, status); 2435 } 2436 2437 float32 int32_to_float32(int32_t a, float_status *status) 2438 { 2439 return int64_to_float32_scalbn(a, 0, status); 2440 } 2441 2442 float32 int16_to_float32(int16_t a, float_status *status) 2443 { 2444 return int64_to_float32_scalbn(a, 0, status); 2445 } 2446 2447 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status) 2448 { 
2449 FloatParts pa = int_to_float(a, scale, status); 2450 return float64_round_pack_canonical(pa, status); 2451 } 2452 2453 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status) 2454 { 2455 return int64_to_float64_scalbn(a, scale, status); 2456 } 2457 2458 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status) 2459 { 2460 return int64_to_float64_scalbn(a, scale, status); 2461 } 2462 2463 float64 int64_to_float64(int64_t a, float_status *status) 2464 { 2465 return int64_to_float64_scalbn(a, 0, status); 2466 } 2467 2468 float64 int32_to_float64(int32_t a, float_status *status) 2469 { 2470 return int64_to_float64_scalbn(a, 0, status); 2471 } 2472 2473 float64 int16_to_float64(int16_t a, float_status *status) 2474 { 2475 return int64_to_float64_scalbn(a, 0, status); 2476 } 2477 2478 2479 /* 2480 * Unsigned Integer to float conversions 2481 * 2482 * Returns the result of converting the unsigned integer `a' to the 2483 * floating-point format. The conversion is performed according to the 2484 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2485 */ 2486 2487 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status) 2488 { 2489 FloatParts r = { .sign = false }; 2490 2491 if (a == 0) { 2492 r.cls = float_class_zero; 2493 } else { 2494 scale = MIN(MAX(scale, -0x10000), 0x10000); 2495 r.cls = float_class_normal; 2496 if ((int64_t)a < 0) { 2497 r.exp = DECOMPOSED_BINARY_POINT + 1 + scale; 2498 shift64RightJamming(a, 1, &a); 2499 r.frac = a; 2500 } else { 2501 int shift = clz64(a) - 1; 2502 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2503 r.frac = a << shift; 2504 } 2505 } 2506 2507 return r; 2508 } 2509 2510 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status) 2511 { 2512 FloatParts pa = uint_to_float(a, scale, status); 2513 return float16_round_pack_canonical(pa, status); 2514 } 2515 2516 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status) 2517 { 2518 return uint64_to_float16_scalbn(a, scale, status); 2519 } 2520 2521 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status) 2522 { 2523 return uint64_to_float16_scalbn(a, scale, status); 2524 } 2525 2526 float16 uint64_to_float16(uint64_t a, float_status *status) 2527 { 2528 return uint64_to_float16_scalbn(a, 0, status); 2529 } 2530 2531 float16 uint32_to_float16(uint32_t a, float_status *status) 2532 { 2533 return uint64_to_float16_scalbn(a, 0, status); 2534 } 2535 2536 float16 uint16_to_float16(uint16_t a, float_status *status) 2537 { 2538 return uint64_to_float16_scalbn(a, 0, status); 2539 } 2540 2541 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status) 2542 { 2543 FloatParts pa = uint_to_float(a, scale, status); 2544 return float32_round_pack_canonical(pa, status); 2545 } 2546 2547 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status) 2548 { 2549 return uint64_to_float32_scalbn(a, scale, status); 2550 } 2551 2552 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status) 2553 { 2554 return 
uint64_to_float32_scalbn(a, scale, status); 2555 } 2556 2557 float32 uint64_to_float32(uint64_t a, float_status *status) 2558 { 2559 return uint64_to_float32_scalbn(a, 0, status); 2560 } 2561 2562 float32 uint32_to_float32(uint32_t a, float_status *status) 2563 { 2564 return uint64_to_float32_scalbn(a, 0, status); 2565 } 2566 2567 float32 uint16_to_float32(uint16_t a, float_status *status) 2568 { 2569 return uint64_to_float32_scalbn(a, 0, status); 2570 } 2571 2572 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status) 2573 { 2574 FloatParts pa = uint_to_float(a, scale, status); 2575 return float64_round_pack_canonical(pa, status); 2576 } 2577 2578 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status) 2579 { 2580 return uint64_to_float64_scalbn(a, scale, status); 2581 } 2582 2583 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status) 2584 { 2585 return uint64_to_float64_scalbn(a, scale, status); 2586 } 2587 2588 float64 uint64_to_float64(uint64_t a, float_status *status) 2589 { 2590 return uint64_to_float64_scalbn(a, 0, status); 2591 } 2592 2593 float64 uint32_to_float64(uint32_t a, float_status *status) 2594 { 2595 return uint64_to_float64_scalbn(a, 0, status); 2596 } 2597 2598 float64 uint16_to_float64(uint16_t a, float_status *status) 2599 { 2600 return uint64_to_float64_scalbn(a, 0, status); 2601 } 2602 2603 /* Float Min/Max */ 2604 /* min() and max() functions. These can't be implemented as 2605 * 'compare and pick one input' because that would mishandle 2606 * NaNs and +0 vs -0. 2607 * 2608 * minnum() and maxnum() functions. These are similar to the min() 2609 * and max() functions but if one of the arguments is a QNaN and 2610 * the other is numerical then the numerical argument is returned. 2611 * SNaNs will get quietened before being returned. 2612 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 2613 * and maxNum() operations. 
min() and max() are the typical min/max 2614 * semantics provided by many CPUs which predate that specification. 2615 * 2616 * minnummag() and maxnummag() functions correspond to minNumMag() 2617 * and minNumMag() from the IEEE-754 2008. 2618 */ 2619 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin, 2620 bool ieee, bool ismag, float_status *s) 2621 { 2622 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 2623 if (ieee) { 2624 /* Takes two floating-point values `a' and `b', one of 2625 * which is a NaN, and returns the appropriate NaN 2626 * result. If either `a' or `b' is a signaling NaN, 2627 * the invalid exception is raised. 2628 */ 2629 if (is_snan(a.cls) || is_snan(b.cls)) { 2630 return pick_nan(a, b, s); 2631 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 2632 return b; 2633 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 2634 return a; 2635 } 2636 } 2637 return pick_nan(a, b, s); 2638 } else { 2639 int a_exp, b_exp; 2640 2641 switch (a.cls) { 2642 case float_class_normal: 2643 a_exp = a.exp; 2644 break; 2645 case float_class_inf: 2646 a_exp = INT_MAX; 2647 break; 2648 case float_class_zero: 2649 a_exp = INT_MIN; 2650 break; 2651 default: 2652 g_assert_not_reached(); 2653 break; 2654 } 2655 switch (b.cls) { 2656 case float_class_normal: 2657 b_exp = b.exp; 2658 break; 2659 case float_class_inf: 2660 b_exp = INT_MAX; 2661 break; 2662 case float_class_zero: 2663 b_exp = INT_MIN; 2664 break; 2665 default: 2666 g_assert_not_reached(); 2667 break; 2668 } 2669 2670 if (ismag && (a_exp != b_exp || a.frac != b.frac)) { 2671 bool a_less = a_exp < b_exp; 2672 if (a_exp == b_exp) { 2673 a_less = a.frac < b.frac; 2674 } 2675 return a_less ^ ismin ? b : a; 2676 } 2677 2678 if (a.sign == b.sign) { 2679 bool a_less = a_exp < b_exp; 2680 if (a_exp == b_exp) { 2681 a_less = a.frac < b.frac; 2682 } 2683 return a.sign ^ a_less ^ ismin ? b : a; 2684 } else { 2685 return a.sign ^ ismin ? 
b : a; 2686 } 2687 } 2688 } 2689 2690 #define MINMAX(sz, name, ismin, isiee, ismag) \ 2691 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 2692 float_status *s) \ 2693 { \ 2694 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2695 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2696 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 2697 \ 2698 return float ## sz ## _round_pack_canonical(pr, s); \ 2699 } 2700 2701 MINMAX(16, min, true, false, false) 2702 MINMAX(16, minnum, true, true, false) 2703 MINMAX(16, minnummag, true, true, true) 2704 MINMAX(16, max, false, false, false) 2705 MINMAX(16, maxnum, false, true, false) 2706 MINMAX(16, maxnummag, false, true, true) 2707 2708 MINMAX(32, min, true, false, false) 2709 MINMAX(32, minnum, true, true, false) 2710 MINMAX(32, minnummag, true, true, true) 2711 MINMAX(32, max, false, false, false) 2712 MINMAX(32, maxnum, false, true, false) 2713 MINMAX(32, maxnummag, false, true, true) 2714 2715 MINMAX(64, min, true, false, false) 2716 MINMAX(64, minnum, true, true, false) 2717 MINMAX(64, minnummag, true, true, true) 2718 MINMAX(64, max, false, false, false) 2719 MINMAX(64, maxnum, false, true, false) 2720 MINMAX(64, maxnummag, false, true, true) 2721 2722 #undef MINMAX 2723 2724 /* Floating point compare */ 2725 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet, 2726 float_status *s) 2727 { 2728 if (is_nan(a.cls) || is_nan(b.cls)) { 2729 if (!is_quiet || 2730 a.cls == float_class_snan || 2731 b.cls == float_class_snan) { 2732 s->float_exception_flags |= float_flag_invalid; 2733 } 2734 return float_relation_unordered; 2735 } 2736 2737 if (a.cls == float_class_zero) { 2738 if (b.cls == float_class_zero) { 2739 return float_relation_equal; 2740 } 2741 return b.sign ? float_relation_greater : float_relation_less; 2742 } else if (b.cls == float_class_zero) { 2743 return a.sign ? 
float_relation_less : float_relation_greater; 2744 } 2745 2746 /* The only really important thing about infinity is its sign. If 2747 * both are infinities the sign marks the smallest of the two. 2748 */ 2749 if (a.cls == float_class_inf) { 2750 if ((b.cls == float_class_inf) && (a.sign == b.sign)) { 2751 return float_relation_equal; 2752 } 2753 return a.sign ? float_relation_less : float_relation_greater; 2754 } else if (b.cls == float_class_inf) { 2755 return b.sign ? float_relation_greater : float_relation_less; 2756 } 2757 2758 if (a.sign != b.sign) { 2759 return a.sign ? float_relation_less : float_relation_greater; 2760 } 2761 2762 if (a.exp == b.exp) { 2763 if (a.frac == b.frac) { 2764 return float_relation_equal; 2765 } 2766 if (a.sign) { 2767 return a.frac > b.frac ? 2768 float_relation_less : float_relation_greater; 2769 } else { 2770 return a.frac > b.frac ? 2771 float_relation_greater : float_relation_less; 2772 } 2773 } else { 2774 if (a.sign) { 2775 return a.exp > b.exp ? float_relation_less : float_relation_greater; 2776 } else { 2777 return a.exp > b.exp ? float_relation_greater : float_relation_less; 2778 } 2779 } 2780 } 2781 2782 #define COMPARE(sz) \ 2783 int float ## sz ## _compare(float ## sz a, float ## sz b, \ 2784 float_status *s) \ 2785 { \ 2786 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2787 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2788 return compare_floats(pa, pb, false, s); \ 2789 } \ 2790 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \ 2791 float_status *s) \ 2792 { \ 2793 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2794 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2795 return compare_floats(pa, pb, true, s); \ 2796 } 2797 2798 COMPARE(16) 2799 COMPARE(32) 2800 COMPARE(64) 2801 2802 #undef COMPARE 2803 2804 /* Multiply A by 2 raised to the power N. 
*/ 2805 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s) 2806 { 2807 if (unlikely(is_nan(a.cls))) { 2808 return return_nan(a, s); 2809 } 2810 if (a.cls == float_class_normal) { 2811 /* The largest float type (even though not supported by FloatParts) 2812 * is float128, which has a 15 bit exponent. Bounding N to 16 bits 2813 * still allows rounding to infinity, without allowing overflow 2814 * within the int32_t that backs FloatParts.exp. 2815 */ 2816 n = MIN(MAX(n, -0x10000), 0x10000); 2817 a.exp += n; 2818 } 2819 return a; 2820 } 2821 2822 float16 float16_scalbn(float16 a, int n, float_status *status) 2823 { 2824 FloatParts pa = float16_unpack_canonical(a, status); 2825 FloatParts pr = scalbn_decomposed(pa, n, status); 2826 return float16_round_pack_canonical(pr, status); 2827 } 2828 2829 float32 float32_scalbn(float32 a, int n, float_status *status) 2830 { 2831 FloatParts pa = float32_unpack_canonical(a, status); 2832 FloatParts pr = scalbn_decomposed(pa, n, status); 2833 return float32_round_pack_canonical(pr, status); 2834 } 2835 2836 float64 float64_scalbn(float64 a, int n, float_status *status) 2837 { 2838 FloatParts pa = float64_unpack_canonical(a, status); 2839 FloatParts pr = scalbn_decomposed(pa, n, status); 2840 return float64_round_pack_canonical(pr, status); 2841 } 2842 2843 /* 2844 * Square Root 2845 * 2846 * The old softfloat code did an approximation step before zeroing in 2847 * on the final result. However for simpleness we just compute the 2848 * square root by iterating down from the implicit bit to enough extra 2849 * bits to ensure we get a correctly rounded result. 2850 * 2851 * This does mean however the calculation is slower than before, 2852 * especially for 64 bit floats. 
2853 */ 2854 2855 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p) 2856 { 2857 uint64_t a_frac, r_frac, s_frac; 2858 int bit, last_bit; 2859 2860 if (is_nan(a.cls)) { 2861 return return_nan(a, s); 2862 } 2863 if (a.cls == float_class_zero) { 2864 return a; /* sqrt(+-0) = +-0 */ 2865 } 2866 if (a.sign) { 2867 s->float_exception_flags |= float_flag_invalid; 2868 return parts_default_nan(s); 2869 } 2870 if (a.cls == float_class_inf) { 2871 return a; /* sqrt(+inf) = +inf */ 2872 } 2873 2874 assert(a.cls == float_class_normal); 2875 2876 /* We need two overflow bits at the top. Adding room for that is a 2877 * right shift. If the exponent is odd, we can discard the low bit 2878 * by multiplying the fraction by 2; that's a left shift. Combine 2879 * those and we shift right if the exponent is even. 2880 */ 2881 a_frac = a.frac; 2882 if (!(a.exp & 1)) { 2883 a_frac >>= 1; 2884 } 2885 a.exp >>= 1; 2886 2887 /* Bit-by-bit computation of sqrt. */ 2888 r_frac = 0; 2889 s_frac = 0; 2890 2891 /* Iterate from implicit bit down to the 3 extra bits to compute a 2892 * properly rounded result. Remember we've inserted one more bit 2893 * at the top, so these positions are one less. 2894 */ 2895 bit = DECOMPOSED_BINARY_POINT - 1; 2896 last_bit = MAX(p->frac_shift - 4, 0); 2897 do { 2898 uint64_t q = 1ULL << bit; 2899 uint64_t t_frac = s_frac + q; 2900 if (t_frac <= a_frac) { 2901 s_frac = t_frac + q; 2902 a_frac -= t_frac; 2903 r_frac += q; 2904 } 2905 a_frac <<= 1; 2906 } while (--bit >= last_bit); 2907 2908 /* Undo the right shift done above. If there is any remaining 2909 * fraction, the result is inexact. Set the sticky bit. 
2910 */ 2911 a.frac = (r_frac << 1) + (a_frac != 0); 2912 2913 return a; 2914 } 2915 2916 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status) 2917 { 2918 FloatParts pa = float16_unpack_canonical(a, status); 2919 FloatParts pr = sqrt_float(pa, status, &float16_params); 2920 return float16_round_pack_canonical(pr, status); 2921 } 2922 2923 float32 QEMU_FLATTEN float32_sqrt(float32 a, float_status *status) 2924 { 2925 FloatParts pa = float32_unpack_canonical(a, status); 2926 FloatParts pr = sqrt_float(pa, status, &float32_params); 2927 return float32_round_pack_canonical(pr, status); 2928 } 2929 2930 float64 QEMU_FLATTEN float64_sqrt(float64 a, float_status *status) 2931 { 2932 FloatParts pa = float64_unpack_canonical(a, status); 2933 FloatParts pr = sqrt_float(pa, status, &float64_params); 2934 return float64_round_pack_canonical(pr, status); 2935 } 2936 2937 /*---------------------------------------------------------------------------- 2938 | The pattern for a default generated NaN. 2939 *----------------------------------------------------------------------------*/ 2940 2941 float16 float16_default_nan(float_status *status) 2942 { 2943 FloatParts p = parts_default_nan(status); 2944 p.frac >>= float16_params.frac_shift; 2945 return float16_pack_raw(p); 2946 } 2947 2948 float32 float32_default_nan(float_status *status) 2949 { 2950 FloatParts p = parts_default_nan(status); 2951 p.frac >>= float32_params.frac_shift; 2952 return float32_pack_raw(p); 2953 } 2954 2955 float64 float64_default_nan(float_status *status) 2956 { 2957 FloatParts p = parts_default_nan(status); 2958 p.frac >>= float64_params.frac_shift; 2959 return float64_pack_raw(p); 2960 } 2961 2962 float128 float128_default_nan(float_status *status) 2963 { 2964 FloatParts p = parts_default_nan(status); 2965 float128 r; 2966 2967 /* Extrapolate from the choices made by parts_default_nan to fill 2968 * in the quad-floating format. 
If the low bit is set, assume we 2969 * want to set all non-snan bits. 2970 */ 2971 r.low = -(p.frac & 1); 2972 r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48); 2973 r.high |= LIT64(0x7FFF000000000000); 2974 r.high |= (uint64_t)p.sign << 63; 2975 2976 return r; 2977 } 2978 2979 /*---------------------------------------------------------------------------- 2980 | Returns a quiet NaN from a signalling NaN for the floating point value `a'. 2981 *----------------------------------------------------------------------------*/ 2982 2983 float16 float16_silence_nan(float16 a, float_status *status) 2984 { 2985 FloatParts p = float16_unpack_raw(a); 2986 p.frac <<= float16_params.frac_shift; 2987 p = parts_silence_nan(p, status); 2988 p.frac >>= float16_params.frac_shift; 2989 return float16_pack_raw(p); 2990 } 2991 2992 float32 float32_silence_nan(float32 a, float_status *status) 2993 { 2994 FloatParts p = float32_unpack_raw(a); 2995 p.frac <<= float32_params.frac_shift; 2996 p = parts_silence_nan(p, status); 2997 p.frac >>= float32_params.frac_shift; 2998 return float32_pack_raw(p); 2999 } 3000 3001 float64 float64_silence_nan(float64 a, float_status *status) 3002 { 3003 FloatParts p = float64_unpack_raw(a); 3004 p.frac <<= float64_params.frac_shift; 3005 p = parts_silence_nan(p, status); 3006 p.frac >>= float64_params.frac_shift; 3007 return float64_pack_raw(p); 3008 } 3009 3010 /*---------------------------------------------------------------------------- 3011 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 3012 | and 7, and returns the properly rounded 32-bit integer corresponding to the 3013 | input. If `zSign' is 1, the input is negated before being converted to an 3014 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 3015 | is simply rounded to an integer, with the inexact exception raised if the 3016 | input cannot be represented exactly as an integer. 
However, if the fixed- 3017 | point input is too large, the invalid exception is raised and the largest 3018 | positive or negative integer is returned. 3019 *----------------------------------------------------------------------------*/ 3020 3021 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status) 3022 { 3023 int8_t roundingMode; 3024 flag roundNearestEven; 3025 int8_t roundIncrement, roundBits; 3026 int32_t z; 3027 3028 roundingMode = status->float_rounding_mode; 3029 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3030 switch (roundingMode) { 3031 case float_round_nearest_even: 3032 case float_round_ties_away: 3033 roundIncrement = 0x40; 3034 break; 3035 case float_round_to_zero: 3036 roundIncrement = 0; 3037 break; 3038 case float_round_up: 3039 roundIncrement = zSign ? 0 : 0x7f; 3040 break; 3041 case float_round_down: 3042 roundIncrement = zSign ? 0x7f : 0; 3043 break; 3044 default: 3045 abort(); 3046 } 3047 roundBits = absZ & 0x7F; 3048 absZ = ( absZ + roundIncrement )>>7; 3049 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 3050 z = absZ; 3051 if ( zSign ) z = - z; 3052 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 3053 float_raise(float_flag_invalid, status); 3054 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 3055 } 3056 if (roundBits) { 3057 status->float_exception_flags |= float_flag_inexact; 3058 } 3059 return z; 3060 3061 } 3062 3063 /*---------------------------------------------------------------------------- 3064 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 3065 | `absZ1', with binary point between bits 63 and 64 (between the input words), 3066 | and returns the properly rounded 64-bit integer corresponding to the input. 3067 | If `zSign' is 1, the input is negated before being converted to an integer. 
3068 | Ordinarily, the fixed-point input is simply rounded to an integer, with 3069 | the inexact exception raised if the input cannot be represented exactly as 3070 | an integer. However, if the fixed-point input is too large, the invalid 3071 | exception is raised and the largest positive or negative integer is 3072 | returned. 3073 *----------------------------------------------------------------------------*/ 3074 3075 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 3076 float_status *status) 3077 { 3078 int8_t roundingMode; 3079 flag roundNearestEven, increment; 3080 int64_t z; 3081 3082 roundingMode = status->float_rounding_mode; 3083 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3084 switch (roundingMode) { 3085 case float_round_nearest_even: 3086 case float_round_ties_away: 3087 increment = ((int64_t) absZ1 < 0); 3088 break; 3089 case float_round_to_zero: 3090 increment = 0; 3091 break; 3092 case float_round_up: 3093 increment = !zSign && absZ1; 3094 break; 3095 case float_round_down: 3096 increment = zSign && absZ1; 3097 break; 3098 default: 3099 abort(); 3100 } 3101 if ( increment ) { 3102 ++absZ0; 3103 if ( absZ0 == 0 ) goto overflow; 3104 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 3105 } 3106 z = absZ0; 3107 if ( zSign ) z = - z; 3108 if ( z && ( ( z < 0 ) ^ zSign ) ) { 3109 overflow: 3110 float_raise(float_flag_invalid, status); 3111 return 3112 zSign ? 
(int64_t) LIT64( 0x8000000000000000 ) 3113 : LIT64( 0x7FFFFFFFFFFFFFFF ); 3114 } 3115 if (absZ1) { 3116 status->float_exception_flags |= float_flag_inexact; 3117 } 3118 return z; 3119 3120 } 3121 3122 /*---------------------------------------------------------------------------- 3123 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 3124 | `absZ1', with binary point between bits 63 and 64 (between the input words), 3125 | and returns the properly rounded 64-bit unsigned integer corresponding to the 3126 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 3127 | with the inexact exception raised if the input cannot be represented exactly 3128 | as an integer. However, if the fixed-point input is too large, the invalid 3129 | exception is raised and the largest unsigned integer is returned. 3130 *----------------------------------------------------------------------------*/ 3131 3132 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0, 3133 uint64_t absZ1, float_status *status) 3134 { 3135 int8_t roundingMode; 3136 flag roundNearestEven, increment; 3137 3138 roundingMode = status->float_rounding_mode; 3139 roundNearestEven = (roundingMode == float_round_nearest_even); 3140 switch (roundingMode) { 3141 case float_round_nearest_even: 3142 case float_round_ties_away: 3143 increment = ((int64_t)absZ1 < 0); 3144 break; 3145 case float_round_to_zero: 3146 increment = 0; 3147 break; 3148 case float_round_up: 3149 increment = !zSign && absZ1; 3150 break; 3151 case float_round_down: 3152 increment = zSign && absZ1; 3153 break; 3154 default: 3155 abort(); 3156 } 3157 if (increment) { 3158 ++absZ0; 3159 if (absZ0 == 0) { 3160 float_raise(float_flag_invalid, status); 3161 return LIT64(0xFFFFFFFFFFFFFFFF); 3162 } 3163 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 3164 } 3165 3166 if (zSign && absZ0) { 3167 float_raise(float_flag_invalid, status); 3168 return 0; 3169 } 3170 3171 if (absZ1) { 3172 
status->float_exception_flags |= float_flag_inexact; 3173 } 3174 return absZ0; 3175 } 3176 3177 /*---------------------------------------------------------------------------- 3178 | If `a' is denormal and we are in flush-to-zero mode then set the 3179 | input-denormal exception and return zero. Otherwise just return the value. 3180 *----------------------------------------------------------------------------*/ 3181 float32 float32_squash_input_denormal(float32 a, float_status *status) 3182 { 3183 if (status->flush_inputs_to_zero) { 3184 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 3185 float_raise(float_flag_input_denormal, status); 3186 return make_float32(float32_val(a) & 0x80000000); 3187 } 3188 } 3189 return a; 3190 } 3191 3192 /*---------------------------------------------------------------------------- 3193 | Normalizes the subnormal single-precision floating-point value represented 3194 | by the denormalized significand `aSig'. The normalized exponent and 3195 | significand are stored at the locations pointed to by `zExpPtr' and 3196 | `zSigPtr', respectively. 3197 *----------------------------------------------------------------------------*/ 3198 3199 static void 3200 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 3201 { 3202 int8_t shiftCount; 3203 3204 shiftCount = clz32(aSig) - 8; 3205 *zSigPtr = aSig<<shiftCount; 3206 *zExpPtr = 1 - shiftCount; 3207 3208 } 3209 3210 /*---------------------------------------------------------------------------- 3211 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3212 | and significand `zSig', and returns the proper single-precision floating- 3213 | point value corresponding to the abstract input. Ordinarily, the abstract 3214 | value is simply rounded and packed into the single-precision format, with 3215 | the inexact exception raised if the abstract input cannot be represented 3216 | exactly. 
However, if the abstract value is too large, the overflow and 3217 | inexact exceptions are raised and an infinity or maximal finite value is 3218 | returned. If the abstract value is too small, the input value is rounded to 3219 | a subnormal number, and the underflow and inexact exceptions are raised if 3220 | the abstract input cannot be represented exactly as a subnormal single- 3221 | precision floating-point number. 3222 | The input significand `zSig' has its binary point between bits 30 3223 | and 29, which is 7 bits to the left of the usual location. This shifted 3224 | significand must be normalized or smaller. If `zSig' is not normalized, 3225 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3226 | and it must not require rounding. In the usual case that `zSig' is 3227 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3228 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3229 | Binary Floating-Point Arithmetic. 3230 *----------------------------------------------------------------------------*/ 3231 3232 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 3233 float_status *status) 3234 { 3235 int8_t roundingMode; 3236 flag roundNearestEven; 3237 int8_t roundIncrement, roundBits; 3238 flag isTiny; 3239 3240 roundingMode = status->float_rounding_mode; 3241 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3242 switch (roundingMode) { 3243 case float_round_nearest_even: 3244 case float_round_ties_away: 3245 roundIncrement = 0x40; 3246 break; 3247 case float_round_to_zero: 3248 roundIncrement = 0; 3249 break; 3250 case float_round_up: 3251 roundIncrement = zSign ? 0 : 0x7f; 3252 break; 3253 case float_round_down: 3254 roundIncrement = zSign ? 
0x7f : 0; 3255 break; 3256 default: 3257 abort(); 3258 break; 3259 } 3260 roundBits = zSig & 0x7F; 3261 if ( 0xFD <= (uint16_t) zExp ) { 3262 if ( ( 0xFD < zExp ) 3263 || ( ( zExp == 0xFD ) 3264 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 3265 ) { 3266 float_raise(float_flag_overflow | float_flag_inexact, status); 3267 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 3268 } 3269 if ( zExp < 0 ) { 3270 if (status->flush_to_zero) { 3271 float_raise(float_flag_output_denormal, status); 3272 return packFloat32(zSign, 0, 0); 3273 } 3274 isTiny = 3275 (status->float_detect_tininess 3276 == float_tininess_before_rounding) 3277 || ( zExp < -1 ) 3278 || ( zSig + roundIncrement < 0x80000000 ); 3279 shift32RightJamming( zSig, - zExp, &zSig ); 3280 zExp = 0; 3281 roundBits = zSig & 0x7F; 3282 if (isTiny && roundBits) { 3283 float_raise(float_flag_underflow, status); 3284 } 3285 } 3286 } 3287 if (roundBits) { 3288 status->float_exception_flags |= float_flag_inexact; 3289 } 3290 zSig = ( zSig + roundIncrement )>>7; 3291 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 3292 if ( zSig == 0 ) zExp = 0; 3293 return packFloat32( zSign, zExp, zSig ); 3294 3295 } 3296 3297 /*---------------------------------------------------------------------------- 3298 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3299 | and significand `zSig', and returns the proper single-precision floating- 3300 | point value corresponding to the abstract input. This routine is just like 3301 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 3302 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 3303 | floating-point exponent. 
3304 *----------------------------------------------------------------------------*/ 3305 3306 static float32 3307 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 3308 float_status *status) 3309 { 3310 int8_t shiftCount; 3311 3312 shiftCount = clz32(zSig) - 1; 3313 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 3314 status); 3315 3316 } 3317 3318 /*---------------------------------------------------------------------------- 3319 | If `a' is denormal and we are in flush-to-zero mode then set the 3320 | input-denormal exception and return zero. Otherwise just return the value. 3321 *----------------------------------------------------------------------------*/ 3322 float64 float64_squash_input_denormal(float64 a, float_status *status) 3323 { 3324 if (status->flush_inputs_to_zero) { 3325 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 3326 float_raise(float_flag_input_denormal, status); 3327 return make_float64(float64_val(a) & (1ULL << 63)); 3328 } 3329 } 3330 return a; 3331 } 3332 3333 /*---------------------------------------------------------------------------- 3334 | Normalizes the subnormal double-precision floating-point value represented 3335 | by the denormalized significand `aSig'. The normalized exponent and 3336 | significand are stored at the locations pointed to by `zExpPtr' and 3337 | `zSigPtr', respectively. 3338 *----------------------------------------------------------------------------*/ 3339 3340 static void 3341 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 3342 { 3343 int8_t shiftCount; 3344 3345 shiftCount = clz64(aSig) - 11; 3346 *zSigPtr = aSig<<shiftCount; 3347 *zExpPtr = 1 - shiftCount; 3348 3349 } 3350 3351 /*---------------------------------------------------------------------------- 3352 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3353 | double-precision floating-point value, returning the result. 
After being 3354 | shifted into the proper positions, the three fields are simply added 3355 | together to form the result. This means that any integer portion of `zSig' 3356 | will be added into the exponent. Since a properly normalized significand 3357 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3358 | than the desired result exponent whenever `zSig' is a complete, normalized 3359 | significand. 3360 *----------------------------------------------------------------------------*/ 3361 3362 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 3363 { 3364 3365 return make_float64( 3366 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 3367 3368 } 3369 3370 /*---------------------------------------------------------------------------- 3371 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3372 | and significand `zSig', and returns the proper double-precision floating- 3373 | point value corresponding to the abstract input. Ordinarily, the abstract 3374 | value is simply rounded and packed into the double-precision format, with 3375 | the inexact exception raised if the abstract input cannot be represented 3376 | exactly. However, if the abstract value is too large, the overflow and 3377 | inexact exceptions are raised and an infinity or maximal finite value is 3378 | returned. If the abstract value is too small, the input value is rounded to 3379 | a subnormal number, and the underflow and inexact exceptions are raised if 3380 | the abstract input cannot be represented exactly as a subnormal double- 3381 | precision floating-point number. 3382 | The input significand `zSig' has its binary point between bits 62 3383 | and 61, which is 10 bits to the left of the usual location. This shifted 3384 | significand must be normalized or smaller. 
If `zSig' is not normalized, 3385 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3386 | and it must not require rounding. In the usual case that `zSig' is 3387 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3388 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3389 | Binary Floating-Point Arithmetic. 3390 *----------------------------------------------------------------------------*/ 3391 3392 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 3393 float_status *status) 3394 { 3395 int8_t roundingMode; 3396 flag roundNearestEven; 3397 int roundIncrement, roundBits; 3398 flag isTiny; 3399 3400 roundingMode = status->float_rounding_mode; 3401 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3402 switch (roundingMode) { 3403 case float_round_nearest_even: 3404 case float_round_ties_away: 3405 roundIncrement = 0x200; 3406 break; 3407 case float_round_to_zero: 3408 roundIncrement = 0; 3409 break; 3410 case float_round_up: 3411 roundIncrement = zSign ? 0 : 0x3ff; 3412 break; 3413 case float_round_down: 3414 roundIncrement = zSign ? 0x3ff : 0; 3415 break; 3416 case float_round_to_odd: 3417 roundIncrement = (zSig & 0x400) ? 
0 : 0x3ff; 3418 break; 3419 default: 3420 abort(); 3421 } 3422 roundBits = zSig & 0x3FF; 3423 if ( 0x7FD <= (uint16_t) zExp ) { 3424 if ( ( 0x7FD < zExp ) 3425 || ( ( zExp == 0x7FD ) 3426 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 3427 ) { 3428 bool overflow_to_inf = roundingMode != float_round_to_odd && 3429 roundIncrement != 0; 3430 float_raise(float_flag_overflow | float_flag_inexact, status); 3431 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 3432 } 3433 if ( zExp < 0 ) { 3434 if (status->flush_to_zero) { 3435 float_raise(float_flag_output_denormal, status); 3436 return packFloat64(zSign, 0, 0); 3437 } 3438 isTiny = 3439 (status->float_detect_tininess 3440 == float_tininess_before_rounding) 3441 || ( zExp < -1 ) 3442 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 3443 shift64RightJamming( zSig, - zExp, &zSig ); 3444 zExp = 0; 3445 roundBits = zSig & 0x3FF; 3446 if (isTiny && roundBits) { 3447 float_raise(float_flag_underflow, status); 3448 } 3449 if (roundingMode == float_round_to_odd) { 3450 /* 3451 * For round-to-odd case, the roundIncrement depends on 3452 * zSig which just changed. 3453 */ 3454 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 3455 } 3456 } 3457 } 3458 if (roundBits) { 3459 status->float_exception_flags |= float_flag_inexact; 3460 } 3461 zSig = ( zSig + roundIncrement )>>10; 3462 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 3463 if ( zSig == 0 ) zExp = 0; 3464 return packFloat64( zSign, zExp, zSig ); 3465 3466 } 3467 3468 /*---------------------------------------------------------------------------- 3469 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3470 | and significand `zSig', and returns the proper double-precision floating- 3471 | point value corresponding to the abstract input. This routine is just like 3472 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 
3473 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 3474 | floating-point exponent. 3475 *----------------------------------------------------------------------------*/ 3476 3477 static float64 3478 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 3479 float_status *status) 3480 { 3481 int8_t shiftCount; 3482 3483 shiftCount = clz64(zSig) - 1; 3484 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 3485 status); 3486 3487 } 3488 3489 /*---------------------------------------------------------------------------- 3490 | Normalizes the subnormal extended double-precision floating-point value 3491 | represented by the denormalized significand `aSig'. The normalized exponent 3492 | and significand are stored at the locations pointed to by `zExpPtr' and 3493 | `zSigPtr', respectively. 3494 *----------------------------------------------------------------------------*/ 3495 3496 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 3497 uint64_t *zSigPtr) 3498 { 3499 int8_t shiftCount; 3500 3501 shiftCount = clz64(aSig); 3502 *zSigPtr = aSig<<shiftCount; 3503 *zExpPtr = 1 - shiftCount; 3504 } 3505 3506 /*---------------------------------------------------------------------------- 3507 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3508 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 3509 | and returns the proper extended double-precision floating-point value 3510 | corresponding to the abstract input. Ordinarily, the abstract value is 3511 | rounded and packed into the extended double-precision format, with the 3512 | inexact exception raised if the abstract input cannot be represented 3513 | exactly. However, if the abstract value is too large, the overflow and 3514 | inexact exceptions are raised and an infinity or maximal finite value is 3515 | returned. 
If the abstract value is too small, the input value is rounded to 3516 | a subnormal number, and the underflow and inexact exceptions are raised if 3517 | the abstract input cannot be represented exactly as a subnormal extended 3518 | double-precision floating-point number. 3519 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 3520 | number of bits as single or double precision, respectively. Otherwise, the 3521 | result is rounded to the full precision of the extended double-precision 3522 | format. 3523 | The input significand must be normalized or smaller. If the input 3524 | significand is not normalized, `zExp' must be 0; in that case, the result 3525 | returned is a subnormal number, and it must not require rounding. The 3526 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 3527 | Floating-Point Arithmetic. 3528 *----------------------------------------------------------------------------*/ 3529 3530 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign, 3531 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 3532 float_status *status) 3533 { 3534 int8_t roundingMode; 3535 flag roundNearestEven, increment, isTiny; 3536 int64_t roundIncrement, roundMask, roundBits; 3537 3538 roundingMode = status->float_rounding_mode; 3539 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3540 if ( roundingPrecision == 80 ) goto precision80; 3541 if ( roundingPrecision == 64 ) { 3542 roundIncrement = LIT64( 0x0000000000000400 ); 3543 roundMask = LIT64( 0x00000000000007FF ); 3544 } 3545 else if ( roundingPrecision == 32 ) { 3546 roundIncrement = LIT64( 0x0000008000000000 ); 3547 roundMask = LIT64( 0x000000FFFFFFFFFF ); 3548 } 3549 else { 3550 goto precision80; 3551 } 3552 zSig0 |= ( zSig1 != 0 ); 3553 switch (roundingMode) { 3554 case float_round_nearest_even: 3555 case float_round_ties_away: 3556 break; 3557 case float_round_to_zero: 3558 roundIncrement = 0; 3559 break; 3560 case float_round_up: 3561 
roundIncrement = zSign ? 0 : roundMask; 3562 break; 3563 case float_round_down: 3564 roundIncrement = zSign ? roundMask : 0; 3565 break; 3566 default: 3567 abort(); 3568 } 3569 roundBits = zSig0 & roundMask; 3570 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 3571 if ( ( 0x7FFE < zExp ) 3572 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 3573 ) { 3574 goto overflow; 3575 } 3576 if ( zExp <= 0 ) { 3577 if (status->flush_to_zero) { 3578 float_raise(float_flag_output_denormal, status); 3579 return packFloatx80(zSign, 0, 0); 3580 } 3581 isTiny = 3582 (status->float_detect_tininess 3583 == float_tininess_before_rounding) 3584 || ( zExp < 0 ) 3585 || ( zSig0 <= zSig0 + roundIncrement ); 3586 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 3587 zExp = 0; 3588 roundBits = zSig0 & roundMask; 3589 if (isTiny && roundBits) { 3590 float_raise(float_flag_underflow, status); 3591 } 3592 if (roundBits) { 3593 status->float_exception_flags |= float_flag_inexact; 3594 } 3595 zSig0 += roundIncrement; 3596 if ( (int64_t) zSig0 < 0 ) zExp = 1; 3597 roundIncrement = roundMask + 1; 3598 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 3599 roundMask |= roundIncrement; 3600 } 3601 zSig0 &= ~ roundMask; 3602 return packFloatx80( zSign, zExp, zSig0 ); 3603 } 3604 } 3605 if (roundBits) { 3606 status->float_exception_flags |= float_flag_inexact; 3607 } 3608 zSig0 += roundIncrement; 3609 if ( zSig0 < roundIncrement ) { 3610 ++zExp; 3611 zSig0 = LIT64( 0x8000000000000000 ); 3612 } 3613 roundIncrement = roundMask + 1; 3614 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 3615 roundMask |= roundIncrement; 3616 } 3617 zSig0 &= ~ roundMask; 3618 if ( zSig0 == 0 ) zExp = 0; 3619 return packFloatx80( zSign, zExp, zSig0 ); 3620 precision80: 3621 switch (roundingMode) { 3622 case float_round_nearest_even: 3623 case float_round_ties_away: 3624 increment = ((int64_t)zSig1 < 0); 3625 break; 3626 case float_round_to_zero: 3627 increment = 0; 3628 break; 3629 case 
float_round_up: 3630 increment = !zSign && zSig1; 3631 break; 3632 case float_round_down: 3633 increment = zSign && zSig1; 3634 break; 3635 default: 3636 abort(); 3637 } 3638 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 3639 if ( ( 0x7FFE < zExp ) 3640 || ( ( zExp == 0x7FFE ) 3641 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 3642 && increment 3643 ) 3644 ) { 3645 roundMask = 0; 3646 overflow: 3647 float_raise(float_flag_overflow | float_flag_inexact, status); 3648 if ( ( roundingMode == float_round_to_zero ) 3649 || ( zSign && ( roundingMode == float_round_up ) ) 3650 || ( ! zSign && ( roundingMode == float_round_down ) ) 3651 ) { 3652 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 3653 } 3654 return packFloatx80(zSign, 3655 floatx80_infinity_high, 3656 floatx80_infinity_low); 3657 } 3658 if ( zExp <= 0 ) { 3659 isTiny = 3660 (status->float_detect_tininess 3661 == float_tininess_before_rounding) 3662 || ( zExp < 0 ) 3663 || ! increment 3664 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 3665 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 3666 zExp = 0; 3667 if (isTiny && zSig1) { 3668 float_raise(float_flag_underflow, status); 3669 } 3670 if (zSig1) { 3671 status->float_exception_flags |= float_flag_inexact; 3672 } 3673 switch (roundingMode) { 3674 case float_round_nearest_even: 3675 case float_round_ties_away: 3676 increment = ((int64_t)zSig1 < 0); 3677 break; 3678 case float_round_to_zero: 3679 increment = 0; 3680 break; 3681 case float_round_up: 3682 increment = !zSign && zSig1; 3683 break; 3684 case float_round_down: 3685 increment = zSign && zSig1; 3686 break; 3687 default: 3688 abort(); 3689 } 3690 if ( increment ) { 3691 ++zSig0; 3692 zSig0 &= 3693 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 3694 if ( (int64_t) zSig0 < 0 ) zExp = 1; 3695 } 3696 return packFloatx80( zSign, zExp, zSig0 ); 3697 } 3698 } 3699 if (zSig1) { 3700 status->float_exception_flags |= float_flag_inexact; 3701 } 3702 if ( increment ) { 3703 ++zSig0; 3704 
if ( zSig0 == 0 ) { 3705 ++zExp; 3706 zSig0 = LIT64( 0x8000000000000000 ); 3707 } 3708 else { 3709 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 3710 } 3711 } 3712 else { 3713 if ( zSig0 == 0 ) zExp = 0; 3714 } 3715 return packFloatx80( zSign, zExp, zSig0 ); 3716 3717 } 3718 3719 /*---------------------------------------------------------------------------- 3720 | Takes an abstract floating-point value having sign `zSign', exponent 3721 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 3722 | and returns the proper extended double-precision floating-point value 3723 | corresponding to the abstract input. This routine is just like 3724 | `roundAndPackFloatx80' except that the input significand does not have to be 3725 | normalized. 3726 *----------------------------------------------------------------------------*/ 3727 3728 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 3729 flag zSign, int32_t zExp, 3730 uint64_t zSig0, uint64_t zSig1, 3731 float_status *status) 3732 { 3733 int8_t shiftCount; 3734 3735 if ( zSig0 == 0 ) { 3736 zSig0 = zSig1; 3737 zSig1 = 0; 3738 zExp -= 64; 3739 } 3740 shiftCount = clz64(zSig0); 3741 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 3742 zExp -= shiftCount; 3743 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 3744 zSig0, zSig1, status); 3745 3746 } 3747 3748 /*---------------------------------------------------------------------------- 3749 | Returns the least-significant 64 fraction bits of the quadruple-precision 3750 | floating-point value `a'. 3751 *----------------------------------------------------------------------------*/ 3752 3753 static inline uint64_t extractFloat128Frac1( float128 a ) 3754 { 3755 3756 return a.low; 3757 3758 } 3759 3760 /*---------------------------------------------------------------------------- 3761 | Returns the most-significant 48 fraction bits of the quadruple-precision 3762 | floating-point value `a'. 
3763 *----------------------------------------------------------------------------*/ 3764 3765 static inline uint64_t extractFloat128Frac0( float128 a ) 3766 { 3767 3768 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 3769 3770 } 3771 3772 /*---------------------------------------------------------------------------- 3773 | Returns the exponent bits of the quadruple-precision floating-point value 3774 | `a'. 3775 *----------------------------------------------------------------------------*/ 3776 3777 static inline int32_t extractFloat128Exp( float128 a ) 3778 { 3779 3780 return ( a.high>>48 ) & 0x7FFF; 3781 3782 } 3783 3784 /*---------------------------------------------------------------------------- 3785 | Returns the sign bit of the quadruple-precision floating-point value `a'. 3786 *----------------------------------------------------------------------------*/ 3787 3788 static inline flag extractFloat128Sign( float128 a ) 3789 { 3790 3791 return a.high>>63; 3792 3793 } 3794 3795 /*---------------------------------------------------------------------------- 3796 | Normalizes the subnormal quadruple-precision floating-point value 3797 | represented by the denormalized significand formed by the concatenation of 3798 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 3799 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 3800 | significand are stored at the location pointed to by `zSig0Ptr', and the 3801 | least significant 64 bits of the normalized significand are stored at the 3802 | location pointed to by `zSig1Ptr'. 
3803 *----------------------------------------------------------------------------*/ 3804 3805 static void 3806 normalizeFloat128Subnormal( 3807 uint64_t aSig0, 3808 uint64_t aSig1, 3809 int32_t *zExpPtr, 3810 uint64_t *zSig0Ptr, 3811 uint64_t *zSig1Ptr 3812 ) 3813 { 3814 int8_t shiftCount; 3815 3816 if ( aSig0 == 0 ) { 3817 shiftCount = clz64(aSig1) - 15; 3818 if ( shiftCount < 0 ) { 3819 *zSig0Ptr = aSig1>>( - shiftCount ); 3820 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 3821 } 3822 else { 3823 *zSig0Ptr = aSig1<<shiftCount; 3824 *zSig1Ptr = 0; 3825 } 3826 *zExpPtr = - shiftCount - 63; 3827 } 3828 else { 3829 shiftCount = clz64(aSig0) - 15; 3830 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 3831 *zExpPtr = 1 - shiftCount; 3832 } 3833 3834 } 3835 3836 /*---------------------------------------------------------------------------- 3837 | Packs the sign `zSign', the exponent `zExp', and the significand formed 3838 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 3839 | floating-point value, returning the result. After being shifted into the 3840 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 3841 | added together to form the most significant 32 bits of the result. This 3842 | means that any integer portion of `zSig0' will be added into the exponent. 3843 | Since a properly normalized significand will have an integer portion equal 3844 | to 1, the `zExp' input should be 1 less than the desired result exponent 3845 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 3846 | significand. 
3847 *----------------------------------------------------------------------------*/ 3848 3849 static inline float128 3850 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 3851 { 3852 float128 z; 3853 3854 z.low = zSig1; 3855 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 3856 return z; 3857 3858 } 3859 3860 /*---------------------------------------------------------------------------- 3861 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3862 | and extended significand formed by the concatenation of `zSig0', `zSig1', 3863 | and `zSig2', and returns the proper quadruple-precision floating-point value 3864 | corresponding to the abstract input. Ordinarily, the abstract value is 3865 | simply rounded and packed into the quadruple-precision format, with the 3866 | inexact exception raised if the abstract input cannot be represented 3867 | exactly. However, if the abstract value is too large, the overflow and 3868 | inexact exceptions are raised and an infinity or maximal finite value is 3869 | returned. If the abstract value is too small, the input value is rounded to 3870 | a subnormal number, and the underflow and inexact exceptions are raised if 3871 | the abstract input cannot be represented exactly as a subnormal quadruple- 3872 | precision floating-point number. 3873 | The input significand must be normalized or smaller. If the input 3874 | significand is not normalized, `zExp' must be 0; in that case, the result 3875 | returned is a subnormal number, and it must not require rounding. In the 3876 | usual case that the input significand is normalized, `zExp' must be 1 less 3877 | than the ``true'' floating-point exponent. The handling of underflow and 3878 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3879 *----------------------------------------------------------------------------*/ 3880 3881 static float128 roundAndPackFloat128(flag zSign, int32_t zExp, 3882 uint64_t zSig0, uint64_t zSig1, 3883 uint64_t zSig2, float_status *status) 3884 { 3885 int8_t roundingMode; 3886 flag roundNearestEven, increment, isTiny; 3887 3888 roundingMode = status->float_rounding_mode; 3889 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3890 switch (roundingMode) { 3891 case float_round_nearest_even: 3892 case float_round_ties_away: 3893 increment = ((int64_t)zSig2 < 0); 3894 break; 3895 case float_round_to_zero: 3896 increment = 0; 3897 break; 3898 case float_round_up: 3899 increment = !zSign && zSig2; 3900 break; 3901 case float_round_down: 3902 increment = zSign && zSig2; 3903 break; 3904 case float_round_to_odd: 3905 increment = !(zSig1 & 0x1) && zSig2; 3906 break; 3907 default: 3908 abort(); 3909 } 3910 if ( 0x7FFD <= (uint32_t) zExp ) { 3911 if ( ( 0x7FFD < zExp ) 3912 || ( ( zExp == 0x7FFD ) 3913 && eq128( 3914 LIT64( 0x0001FFFFFFFFFFFF ), 3915 LIT64( 0xFFFFFFFFFFFFFFFF ), 3916 zSig0, 3917 zSig1 3918 ) 3919 && increment 3920 ) 3921 ) { 3922 float_raise(float_flag_overflow | float_flag_inexact, status); 3923 if ( ( roundingMode == float_round_to_zero ) 3924 || ( zSign && ( roundingMode == float_round_up ) ) 3925 || ( ! zSign && ( roundingMode == float_round_down ) ) 3926 || (roundingMode == float_round_to_odd) 3927 ) { 3928 return 3929 packFloat128( 3930 zSign, 3931 0x7FFE, 3932 LIT64( 0x0000FFFFFFFFFFFF ), 3933 LIT64( 0xFFFFFFFFFFFFFFFF ) 3934 ); 3935 } 3936 return packFloat128( zSign, 0x7FFF, 0, 0 ); 3937 } 3938 if ( zExp < 0 ) { 3939 if (status->flush_to_zero) { 3940 float_raise(float_flag_output_denormal, status); 3941 return packFloat128(zSign, 0, 0, 0); 3942 } 3943 isTiny = 3944 (status->float_detect_tininess 3945 == float_tininess_before_rounding) 3946 || ( zExp < -1 ) 3947 || ! 
increment 3948 || lt128( 3949 zSig0, 3950 zSig1, 3951 LIT64( 0x0001FFFFFFFFFFFF ), 3952 LIT64( 0xFFFFFFFFFFFFFFFF ) 3953 ); 3954 shift128ExtraRightJamming( 3955 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 3956 zExp = 0; 3957 if (isTiny && zSig2) { 3958 float_raise(float_flag_underflow, status); 3959 } 3960 switch (roundingMode) { 3961 case float_round_nearest_even: 3962 case float_round_ties_away: 3963 increment = ((int64_t)zSig2 < 0); 3964 break; 3965 case float_round_to_zero: 3966 increment = 0; 3967 break; 3968 case float_round_up: 3969 increment = !zSign && zSig2; 3970 break; 3971 case float_round_down: 3972 increment = zSign && zSig2; 3973 break; 3974 case float_round_to_odd: 3975 increment = !(zSig1 & 0x1) && zSig2; 3976 break; 3977 default: 3978 abort(); 3979 } 3980 } 3981 } 3982 if (zSig2) { 3983 status->float_exception_flags |= float_flag_inexact; 3984 } 3985 if ( increment ) { 3986 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 3987 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 3988 } 3989 else { 3990 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 3991 } 3992 return packFloat128( zSign, zExp, zSig0, zSig1 ); 3993 3994 } 3995 3996 /*---------------------------------------------------------------------------- 3997 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3998 | and significand formed by the concatenation of `zSig0' and `zSig1', and 3999 | returns the proper quadruple-precision floating-point value corresponding 4000 | to the abstract input. This routine is just like `roundAndPackFloat128' 4001 | except that the input significand has fewer bits and does not have to be 4002 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 4003 | point exponent. 
4004 *----------------------------------------------------------------------------*/ 4005 4006 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 4007 uint64_t zSig0, uint64_t zSig1, 4008 float_status *status) 4009 { 4010 int8_t shiftCount; 4011 uint64_t zSig2; 4012 4013 if ( zSig0 == 0 ) { 4014 zSig0 = zSig1; 4015 zSig1 = 0; 4016 zExp -= 64; 4017 } 4018 shiftCount = clz64(zSig0) - 15; 4019 if ( 0 <= shiftCount ) { 4020 zSig2 = 0; 4021 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4022 } 4023 else { 4024 shift128ExtraRightJamming( 4025 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 4026 } 4027 zExp -= shiftCount; 4028 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 4029 4030 } 4031 4032 4033 /*---------------------------------------------------------------------------- 4034 | Returns the result of converting the 32-bit two's complement integer `a' 4035 | to the extended double-precision floating-point format. The conversion 4036 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4037 | Arithmetic. 4038 *----------------------------------------------------------------------------*/ 4039 4040 floatx80 int32_to_floatx80(int32_t a, float_status *status) 4041 { 4042 flag zSign; 4043 uint32_t absA; 4044 int8_t shiftCount; 4045 uint64_t zSig; 4046 4047 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4048 zSign = ( a < 0 ); 4049 absA = zSign ? - a : a; 4050 shiftCount = clz32(absA) + 32; 4051 zSig = absA; 4052 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 4053 4054 } 4055 4056 /*---------------------------------------------------------------------------- 4057 | Returns the result of converting the 32-bit two's complement integer `a' to 4058 | the quadruple-precision floating-point format. The conversion is performed 4059 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4060 *----------------------------------------------------------------------------*/ 4061 4062 float128 int32_to_float128(int32_t a, float_status *status) 4063 { 4064 flag zSign; 4065 uint32_t absA; 4066 int8_t shiftCount; 4067 uint64_t zSig0; 4068 4069 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4070 zSign = ( a < 0 ); 4071 absA = zSign ? - a : a; 4072 shiftCount = clz32(absA) + 17; 4073 zSig0 = absA; 4074 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 4075 4076 } 4077 4078 /*---------------------------------------------------------------------------- 4079 | Returns the result of converting the 64-bit two's complement integer `a' 4080 | to the extended double-precision floating-point format. The conversion 4081 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4082 | Arithmetic. 4083 *----------------------------------------------------------------------------*/ 4084 4085 floatx80 int64_to_floatx80(int64_t a, float_status *status) 4086 { 4087 flag zSign; 4088 uint64_t absA; 4089 int8_t shiftCount; 4090 4091 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4092 zSign = ( a < 0 ); 4093 absA = zSign ? - a : a; 4094 shiftCount = clz64(absA); 4095 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 4096 4097 } 4098 4099 /*---------------------------------------------------------------------------- 4100 | Returns the result of converting the 64-bit two's complement integer `a' to 4101 | the quadruple-precision floating-point format. The conversion is performed 4102 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4103 *----------------------------------------------------------------------------*/ 4104 4105 float128 int64_to_float128(int64_t a, float_status *status) 4106 { 4107 flag zSign; 4108 uint64_t absA; 4109 int8_t shiftCount; 4110 int32_t zExp; 4111 uint64_t zSig0, zSig1; 4112 4113 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4114 zSign = ( a < 0 ); 4115 absA = zSign ? - a : a; 4116 shiftCount = clz64(absA) + 49; 4117 zExp = 0x406E - shiftCount; 4118 if ( 64 <= shiftCount ) { 4119 zSig1 = 0; 4120 zSig0 = absA; 4121 shiftCount -= 64; 4122 } 4123 else { 4124 zSig1 = absA; 4125 zSig0 = 0; 4126 } 4127 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4128 return packFloat128( zSign, zExp, zSig0, zSig1 ); 4129 4130 } 4131 4132 /*---------------------------------------------------------------------------- 4133 | Returns the result of converting the 64-bit unsigned integer `a' 4134 | to the quadruple-precision floating-point format. The conversion is performed 4135 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4136 *----------------------------------------------------------------------------*/ 4137 4138 float128 uint64_to_float128(uint64_t a, float_status *status) 4139 { 4140 if (a == 0) { 4141 return float128_zero; 4142 } 4143 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status); 4144 } 4145 4146 /*---------------------------------------------------------------------------- 4147 | Returns the result of converting the single-precision floating-point value 4148 | `a' to the extended double-precision floating-point format. The conversion 4149 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4150 | Arithmetic. 
4151 *----------------------------------------------------------------------------*/ 4152 4153 floatx80 float32_to_floatx80(float32 a, float_status *status) 4154 { 4155 flag aSign; 4156 int aExp; 4157 uint32_t aSig; 4158 4159 a = float32_squash_input_denormal(a, status); 4160 aSig = extractFloat32Frac( a ); 4161 aExp = extractFloat32Exp( a ); 4162 aSign = extractFloat32Sign( a ); 4163 if ( aExp == 0xFF ) { 4164 if (aSig) { 4165 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 4166 } 4167 return packFloatx80(aSign, 4168 floatx80_infinity_high, 4169 floatx80_infinity_low); 4170 } 4171 if ( aExp == 0 ) { 4172 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4173 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4174 } 4175 aSig |= 0x00800000; 4176 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 4177 4178 } 4179 4180 /*---------------------------------------------------------------------------- 4181 | Returns the result of converting the single-precision floating-point value 4182 | `a' to the double-precision floating-point format. The conversion is 4183 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4184 | Arithmetic. 
4185 *----------------------------------------------------------------------------*/ 4186 4187 float128 float32_to_float128(float32 a, float_status *status) 4188 { 4189 flag aSign; 4190 int aExp; 4191 uint32_t aSig; 4192 4193 a = float32_squash_input_denormal(a, status); 4194 aSig = extractFloat32Frac( a ); 4195 aExp = extractFloat32Exp( a ); 4196 aSign = extractFloat32Sign( a ); 4197 if ( aExp == 0xFF ) { 4198 if (aSig) { 4199 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 4200 } 4201 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4202 } 4203 if ( aExp == 0 ) { 4204 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4205 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4206 --aExp; 4207 } 4208 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 4209 4210 } 4211 4212 /*---------------------------------------------------------------------------- 4213 | Returns the remainder of the single-precision floating-point value `a' 4214 | with respect to the corresponding value `b'. The operation is performed 4215 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4216 *----------------------------------------------------------------------------*/ 4217 4218 float32 float32_rem(float32 a, float32 b, float_status *status) 4219 { 4220 flag aSign, zSign; 4221 int aExp, bExp, expDiff; 4222 uint32_t aSig, bSig; 4223 uint32_t q; 4224 uint64_t aSig64, bSig64, q64; 4225 uint32_t alternateASig; 4226 int32_t sigMean; 4227 a = float32_squash_input_denormal(a, status); 4228 b = float32_squash_input_denormal(b, status); 4229 4230 aSig = extractFloat32Frac( a ); 4231 aExp = extractFloat32Exp( a ); 4232 aSign = extractFloat32Sign( a ); 4233 bSig = extractFloat32Frac( b ); 4234 bExp = extractFloat32Exp( b ); 4235 if ( aExp == 0xFF ) { 4236 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 4237 return propagateFloat32NaN(a, b, status); 4238 } 4239 float_raise(float_flag_invalid, status); 4240 return float32_default_nan(status); 4241 } 4242 if ( bExp == 0xFF ) { 4243 if (bSig) { 4244 return propagateFloat32NaN(a, b, status); 4245 } 4246 return a; 4247 } 4248 if ( bExp == 0 ) { 4249 if ( bSig == 0 ) { 4250 float_raise(float_flag_invalid, status); 4251 return float32_default_nan(status); 4252 } 4253 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 4254 } 4255 if ( aExp == 0 ) { 4256 if ( aSig == 0 ) return a; 4257 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4258 } 4259 expDiff = aExp - bExp; 4260 aSig |= 0x00800000; 4261 bSig |= 0x00800000; 4262 if ( expDiff < 32 ) { 4263 aSig <<= 8; 4264 bSig <<= 8; 4265 if ( expDiff < 0 ) { 4266 if ( expDiff < -1 ) return a; 4267 aSig >>= 1; 4268 } 4269 q = ( bSig <= aSig ); 4270 if ( q ) aSig -= bSig; 4271 if ( 0 < expDiff ) { 4272 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 4273 q >>= 32 - expDiff; 4274 bSig >>= 2; 4275 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4276 } 4277 else { 4278 aSig >>= 2; 4279 bSig >>= 2; 4280 } 4281 } 4282 else { 4283 if ( bSig <= aSig ) aSig -= bSig; 4284 aSig64 = ( (uint64_t) aSig )<<40; 4285 bSig64 = ( (uint64_t) bSig )<<40; 4286 expDiff -= 64; 4287 while ( 0 < expDiff ) { 
4288 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 4289 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 4290 aSig64 = - ( ( bSig * q64 )<<38 ); 4291 expDiff -= 62; 4292 } 4293 expDiff += 64; 4294 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 4295 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 4296 q = q64>>( 64 - expDiff ); 4297 bSig <<= 6; 4298 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 4299 } 4300 do { 4301 alternateASig = aSig; 4302 ++q; 4303 aSig -= bSig; 4304 } while ( 0 <= (int32_t) aSig ); 4305 sigMean = aSig + alternateASig; 4306 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4307 aSig = alternateASig; 4308 } 4309 zSign = ( (int32_t) aSig < 0 ); 4310 if ( zSign ) aSig = - aSig; 4311 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 4312 } 4313 4314 4315 4316 /*---------------------------------------------------------------------------- 4317 | Returns the binary exponential of the single-precision floating-point value 4318 | `a'. The operation is performed according to the IEC/IEEE Standard for 4319 | Binary Floating-Point Arithmetic. 4320 | 4321 | Uses the following identities: 4322 | 4323 | 1. ------------------------------------------------------------------------- 4324 | x x*ln(2) 4325 | 2 = e 4326 | 4327 | 2. ------------------------------------------------------------------------- 4328 | 2 3 4 5 n 4329 | x x x x x x x 4330 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 4331 | 1! 2! 3! 4! 5! n! 
4332 *----------------------------------------------------------------------------*/ 4333 4334 static const float64 float32_exp2_coefficients[15] = 4335 { 4336 const_float64( 0x3ff0000000000000ll ), /* 1 */ 4337 const_float64( 0x3fe0000000000000ll ), /* 2 */ 4338 const_float64( 0x3fc5555555555555ll ), /* 3 */ 4339 const_float64( 0x3fa5555555555555ll ), /* 4 */ 4340 const_float64( 0x3f81111111111111ll ), /* 5 */ 4341 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 4342 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 4343 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 4344 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 4345 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 4346 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 4347 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 4348 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 4349 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 4350 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 4351 }; 4352 4353 float32 float32_exp2(float32 a, float_status *status) 4354 { 4355 flag aSign; 4356 int aExp; 4357 uint32_t aSig; 4358 float64 r, x, xn; 4359 int i; 4360 a = float32_squash_input_denormal(a, status); 4361 4362 aSig = extractFloat32Frac( a ); 4363 aExp = extractFloat32Exp( a ); 4364 aSign = extractFloat32Sign( a ); 4365 4366 if ( aExp == 0xFF) { 4367 if (aSig) { 4368 return propagateFloat32NaN(a, float32_zero, status); 4369 } 4370 return (aSign) ? 
float32_zero : a; 4371 } 4372 if (aExp == 0) { 4373 if (aSig == 0) return float32_one; 4374 } 4375 4376 float_raise(float_flag_inexact, status); 4377 4378 /* ******************************* */ 4379 /* using float64 for approximation */ 4380 /* ******************************* */ 4381 x = float32_to_float64(a, status); 4382 x = float64_mul(x, float64_ln2, status); 4383 4384 xn = x; 4385 r = float64_one; 4386 for (i = 0 ; i < 15 ; i++) { 4387 float64 f; 4388 4389 f = float64_mul(xn, float32_exp2_coefficients[i], status); 4390 r = float64_add(r, f, status); 4391 4392 xn = float64_mul(xn, x, status); 4393 } 4394 4395 return float64_to_float32(r, status); 4396 } 4397 4398 /*---------------------------------------------------------------------------- 4399 | Returns the binary log of the single-precision floating-point value `a'. 4400 | The operation is performed according to the IEC/IEEE Standard for Binary 4401 | Floating-Point Arithmetic. 4402 *----------------------------------------------------------------------------*/ 4403 float32 float32_log2(float32 a, float_status *status) 4404 { 4405 flag aSign, zSign; 4406 int aExp; 4407 uint32_t aSig, zSig, i; 4408 4409 a = float32_squash_input_denormal(a, status); 4410 aSig = extractFloat32Frac( a ); 4411 aExp = extractFloat32Exp( a ); 4412 aSign = extractFloat32Sign( a ); 4413 4414 if ( aExp == 0 ) { 4415 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 4416 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4417 } 4418 if ( aSign ) { 4419 float_raise(float_flag_invalid, status); 4420 return float32_default_nan(status); 4421 } 4422 if ( aExp == 0xFF ) { 4423 if (aSig) { 4424 return propagateFloat32NaN(a, float32_zero, status); 4425 } 4426 return a; 4427 } 4428 4429 aExp -= 0x7F; 4430 aSig |= 0x00800000; 4431 zSign = aExp < 0; 4432 zSig = aExp << 23; 4433 4434 for (i = 1 << 22; i > 0; i >>= 1) { 4435 aSig = ( (uint64_t)aSig * aSig ) >> 23; 4436 if ( aSig & 0x01000000 ) { 4437 aSig >>= 1; 4438 zSig |= i; 4439 } 4440 } 4441 
4442 if ( zSign ) 4443 zSig = -zSig; 4444 4445 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 4446 } 4447 4448 /*---------------------------------------------------------------------------- 4449 | Returns 1 if the single-precision floating-point value `a' is equal to 4450 | the corresponding value `b', and 0 otherwise. The invalid exception is 4451 | raised if either operand is a NaN. Otherwise, the comparison is performed 4452 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4453 *----------------------------------------------------------------------------*/ 4454 4455 int float32_eq(float32 a, float32 b, float_status *status) 4456 { 4457 uint32_t av, bv; 4458 a = float32_squash_input_denormal(a, status); 4459 b = float32_squash_input_denormal(b, status); 4460 4461 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4462 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4463 ) { 4464 float_raise(float_flag_invalid, status); 4465 return 0; 4466 } 4467 av = float32_val(a); 4468 bv = float32_val(b); 4469 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4470 } 4471 4472 /*---------------------------------------------------------------------------- 4473 | Returns 1 if the single-precision floating-point value `a' is less than 4474 | or equal to the corresponding value `b', and 0 otherwise. The invalid 4475 | exception is raised if either operand is a NaN. The comparison is performed 4476 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4477 *----------------------------------------------------------------------------*/ 4478 4479 int float32_le(float32 a, float32 b, float_status *status) 4480 { 4481 flag aSign, bSign; 4482 uint32_t av, bv; 4483 a = float32_squash_input_denormal(a, status); 4484 b = float32_squash_input_denormal(b, status); 4485 4486 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4487 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4488 ) { 4489 float_raise(float_flag_invalid, status); 4490 return 0; 4491 } 4492 aSign = extractFloat32Sign( a ); 4493 bSign = extractFloat32Sign( b ); 4494 av = float32_val(a); 4495 bv = float32_val(b); 4496 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4497 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4498 4499 } 4500 4501 /*---------------------------------------------------------------------------- 4502 | Returns 1 if the single-precision floating-point value `a' is less than 4503 | the corresponding value `b', and 0 otherwise. The invalid exception is 4504 | raised if either operand is a NaN. The comparison is performed according 4505 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4506 *----------------------------------------------------------------------------*/ 4507 4508 int float32_lt(float32 a, float32 b, float_status *status) 4509 { 4510 flag aSign, bSign; 4511 uint32_t av, bv; 4512 a = float32_squash_input_denormal(a, status); 4513 b = float32_squash_input_denormal(b, status); 4514 4515 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4516 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4517 ) { 4518 float_raise(float_flag_invalid, status); 4519 return 0; 4520 } 4521 aSign = extractFloat32Sign( a ); 4522 bSign = extractFloat32Sign( b ); 4523 av = float32_val(a); 4524 bv = float32_val(b); 4525 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 4526 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4527 4528 } 4529 4530 /*---------------------------------------------------------------------------- 4531 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 4532 | be compared, and 0 otherwise. The invalid exception is raised if either 4533 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4534 | Standard for Binary Floating-Point Arithmetic. 4535 *----------------------------------------------------------------------------*/ 4536 4537 int float32_unordered(float32 a, float32 b, float_status *status) 4538 { 4539 a = float32_squash_input_denormal(a, status); 4540 b = float32_squash_input_denormal(b, status); 4541 4542 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4543 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4544 ) { 4545 float_raise(float_flag_invalid, status); 4546 return 1; 4547 } 4548 return 0; 4549 } 4550 4551 /*---------------------------------------------------------------------------- 4552 | Returns 1 if the single-precision floating-point value `a' is equal to 4553 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4554 | exception. 
The comparison is performed according to the IEC/IEEE Standard 4555 | for Binary Floating-Point Arithmetic. 4556 *----------------------------------------------------------------------------*/ 4557 4558 int float32_eq_quiet(float32 a, float32 b, float_status *status) 4559 { 4560 a = float32_squash_input_denormal(a, status); 4561 b = float32_squash_input_denormal(b, status); 4562 4563 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4564 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4565 ) { 4566 if (float32_is_signaling_nan(a, status) 4567 || float32_is_signaling_nan(b, status)) { 4568 float_raise(float_flag_invalid, status); 4569 } 4570 return 0; 4571 } 4572 return ( float32_val(a) == float32_val(b) ) || 4573 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 4574 } 4575 4576 /*---------------------------------------------------------------------------- 4577 | Returns 1 if the single-precision floating-point value `a' is less than or 4578 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4579 | cause an exception. Otherwise, the comparison is performed according to the 4580 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4581 *----------------------------------------------------------------------------*/ 4582 4583 int float32_le_quiet(float32 a, float32 b, float_status *status) 4584 { 4585 flag aSign, bSign; 4586 uint32_t av, bv; 4587 a = float32_squash_input_denormal(a, status); 4588 b = float32_squash_input_denormal(b, status); 4589 4590 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4591 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4592 ) { 4593 if (float32_is_signaling_nan(a, status) 4594 || float32_is_signaling_nan(b, status)) { 4595 float_raise(float_flag_invalid, status); 4596 } 4597 return 0; 4598 } 4599 aSign = extractFloat32Sign( a ); 4600 bSign = extractFloat32Sign( b ); 4601 av = float32_val(a); 4602 bv = float32_val(b); 4603 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4604 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4605 4606 } 4607 4608 /*---------------------------------------------------------------------------- 4609 | Returns 1 if the single-precision floating-point value `a' is less than 4610 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4611 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4612 | Standard for Binary Floating-Point Arithmetic. 
4613 *----------------------------------------------------------------------------*/ 4614 4615 int float32_lt_quiet(float32 a, float32 b, float_status *status) 4616 { 4617 flag aSign, bSign; 4618 uint32_t av, bv; 4619 a = float32_squash_input_denormal(a, status); 4620 b = float32_squash_input_denormal(b, status); 4621 4622 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4623 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4624 ) { 4625 if (float32_is_signaling_nan(a, status) 4626 || float32_is_signaling_nan(b, status)) { 4627 float_raise(float_flag_invalid, status); 4628 } 4629 return 0; 4630 } 4631 aSign = extractFloat32Sign( a ); 4632 bSign = extractFloat32Sign( b ); 4633 av = float32_val(a); 4634 bv = float32_val(b); 4635 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 4636 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4637 4638 } 4639 4640 /*---------------------------------------------------------------------------- 4641 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 4642 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4643 | comparison is performed according to the IEC/IEEE Standard for Binary 4644 | Floating-Point Arithmetic. 
4645 *----------------------------------------------------------------------------*/ 4646 4647 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 4648 { 4649 a = float32_squash_input_denormal(a, status); 4650 b = float32_squash_input_denormal(b, status); 4651 4652 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4653 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4654 ) { 4655 if (float32_is_signaling_nan(a, status) 4656 || float32_is_signaling_nan(b, status)) { 4657 float_raise(float_flag_invalid, status); 4658 } 4659 return 1; 4660 } 4661 return 0; 4662 } 4663 4664 /*---------------------------------------------------------------------------- 4665 | If `a' is denormal and we are in flush-to-zero mode then set the 4666 | input-denormal exception and return zero. Otherwise just return the value. 4667 *----------------------------------------------------------------------------*/ 4668 float16 float16_squash_input_denormal(float16 a, float_status *status) 4669 { 4670 if (status->flush_inputs_to_zero) { 4671 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) { 4672 float_raise(float_flag_input_denormal, status); 4673 return make_float16(float16_val(a) & 0x8000); 4674 } 4675 } 4676 return a; 4677 } 4678 4679 /*---------------------------------------------------------------------------- 4680 | Returns the result of converting the double-precision floating-point value 4681 | `a' to the extended double-precision floating-point format. The conversion 4682 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4683 | Arithmetic. 
4684 *----------------------------------------------------------------------------*/ 4685 4686 floatx80 float64_to_floatx80(float64 a, float_status *status) 4687 { 4688 flag aSign; 4689 int aExp; 4690 uint64_t aSig; 4691 4692 a = float64_squash_input_denormal(a, status); 4693 aSig = extractFloat64Frac( a ); 4694 aExp = extractFloat64Exp( a ); 4695 aSign = extractFloat64Sign( a ); 4696 if ( aExp == 0x7FF ) { 4697 if (aSig) { 4698 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 4699 } 4700 return packFloatx80(aSign, 4701 floatx80_infinity_high, 4702 floatx80_infinity_low); 4703 } 4704 if ( aExp == 0 ) { 4705 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4706 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4707 } 4708 return 4709 packFloatx80( 4710 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 4711 4712 } 4713 4714 /*---------------------------------------------------------------------------- 4715 | Returns the result of converting the double-precision floating-point value 4716 | `a' to the quadruple-precision floating-point format. The conversion is 4717 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4718 | Arithmetic. 
4719 *----------------------------------------------------------------------------*/ 4720 4721 float128 float64_to_float128(float64 a, float_status *status) 4722 { 4723 flag aSign; 4724 int aExp; 4725 uint64_t aSig, zSig0, zSig1; 4726 4727 a = float64_squash_input_denormal(a, status); 4728 aSig = extractFloat64Frac( a ); 4729 aExp = extractFloat64Exp( a ); 4730 aSign = extractFloat64Sign( a ); 4731 if ( aExp == 0x7FF ) { 4732 if (aSig) { 4733 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 4734 } 4735 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4736 } 4737 if ( aExp == 0 ) { 4738 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4739 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4740 --aExp; 4741 } 4742 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 4743 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 4744 4745 } 4746 4747 4748 /*---------------------------------------------------------------------------- 4749 | Returns the remainder of the double-precision floating-point value `a' 4750 | with respect to the corresponding value `b'. The operation is performed 4751 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4752 *----------------------------------------------------------------------------*/ 4753 4754 float64 float64_rem(float64 a, float64 b, float_status *status) 4755 { 4756 flag aSign, zSign; 4757 int aExp, bExp, expDiff; 4758 uint64_t aSig, bSig; 4759 uint64_t q, alternateASig; 4760 int64_t sigMean; 4761 4762 a = float64_squash_input_denormal(a, status); 4763 b = float64_squash_input_denormal(b, status); 4764 aSig = extractFloat64Frac( a ); 4765 aExp = extractFloat64Exp( a ); 4766 aSign = extractFloat64Sign( a ); 4767 bSig = extractFloat64Frac( b ); 4768 bExp = extractFloat64Exp( b ); 4769 if ( aExp == 0x7FF ) { 4770 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4771 return propagateFloat64NaN(a, b, status); 4772 } 4773 float_raise(float_flag_invalid, status); 4774 return float64_default_nan(status); 4775 } 4776 if ( bExp == 0x7FF ) { 4777 if (bSig) { 4778 return propagateFloat64NaN(a, b, status); 4779 } 4780 return a; 4781 } 4782 if ( bExp == 0 ) { 4783 if ( bSig == 0 ) { 4784 float_raise(float_flag_invalid, status); 4785 return float64_default_nan(status); 4786 } 4787 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4788 } 4789 if ( aExp == 0 ) { 4790 if ( aSig == 0 ) return a; 4791 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4792 } 4793 expDiff = aExp - bExp; 4794 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 4795 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4796 if ( expDiff < 0 ) { 4797 if ( expDiff < -1 ) return a; 4798 aSig >>= 1; 4799 } 4800 q = ( bSig <= aSig ); 4801 if ( q ) aSig -= bSig; 4802 expDiff -= 64; 4803 while ( 0 < expDiff ) { 4804 q = estimateDiv128To64( aSig, 0, bSig ); 4805 q = ( 2 < q ) ? q - 2 : 0; 4806 aSig = - ( ( bSig>>2 ) * q ); 4807 expDiff -= 62; 4808 } 4809 expDiff += 64; 4810 if ( 0 < expDiff ) { 4811 q = estimateDiv128To64( aSig, 0, bSig ); 4812 q = ( 2 < q ) ? 
q - 2 : 0; 4813 q >>= 64 - expDiff; 4814 bSig >>= 2; 4815 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4816 } 4817 else { 4818 aSig >>= 2; 4819 bSig >>= 2; 4820 } 4821 do { 4822 alternateASig = aSig; 4823 ++q; 4824 aSig -= bSig; 4825 } while ( 0 <= (int64_t) aSig ); 4826 sigMean = aSig + alternateASig; 4827 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4828 aSig = alternateASig; 4829 } 4830 zSign = ( (int64_t) aSig < 0 ); 4831 if ( zSign ) aSig = - aSig; 4832 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 4833 4834 } 4835 4836 /*---------------------------------------------------------------------------- 4837 | Returns the binary log of the double-precision floating-point value `a'. 4838 | The operation is performed according to the IEC/IEEE Standard for Binary 4839 | Floating-Point Arithmetic. 4840 *----------------------------------------------------------------------------*/ 4841 float64 float64_log2(float64 a, float_status *status) 4842 { 4843 flag aSign, zSign; 4844 int aExp; 4845 uint64_t aSig, aSig0, aSig1, zSig, i; 4846 a = float64_squash_input_denormal(a, status); 4847 4848 aSig = extractFloat64Frac( a ); 4849 aExp = extractFloat64Exp( a ); 4850 aSign = extractFloat64Sign( a ); 4851 4852 if ( aExp == 0 ) { 4853 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4854 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4855 } 4856 if ( aSign ) { 4857 float_raise(float_flag_invalid, status); 4858 return float64_default_nan(status); 4859 } 4860 if ( aExp == 0x7FF ) { 4861 if (aSig) { 4862 return propagateFloat64NaN(a, float64_zero, status); 4863 } 4864 return a; 4865 } 4866 4867 aExp -= 0x3FF; 4868 aSig |= LIT64( 0x0010000000000000 ); 4869 zSign = aExp < 0; 4870 zSig = (uint64_t)aExp << 52; 4871 for (i = 1LL << 51; i > 0; i >>= 1) { 4872 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4873 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4874 if ( aSig & LIT64( 0x0020000000000000 ) ) { 4875 aSig >>= 1; 4876 zSig |= i; 4877 } 
4878 } 4879 4880 if ( zSign ) 4881 zSig = -zSig; 4882 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 4883 } 4884 4885 /*---------------------------------------------------------------------------- 4886 | Returns 1 if the double-precision floating-point value `a' is equal to the 4887 | corresponding value `b', and 0 otherwise. The invalid exception is raised 4888 | if either operand is a NaN. Otherwise, the comparison is performed 4889 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4890 *----------------------------------------------------------------------------*/ 4891 4892 int float64_eq(float64 a, float64 b, float_status *status) 4893 { 4894 uint64_t av, bv; 4895 a = float64_squash_input_denormal(a, status); 4896 b = float64_squash_input_denormal(b, status); 4897 4898 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4899 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4900 ) { 4901 float_raise(float_flag_invalid, status); 4902 return 0; 4903 } 4904 av = float64_val(a); 4905 bv = float64_val(b); 4906 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4907 4908 } 4909 4910 /*---------------------------------------------------------------------------- 4911 | Returns 1 if the double-precision floating-point value `a' is less than or 4912 | equal to the corresponding value `b', and 0 otherwise. The invalid 4913 | exception is raised if either operand is a NaN. The comparison is performed 4914 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4915 *----------------------------------------------------------------------------*/ 4916 4917 int float64_le(float64 a, float64 b, float_status *status) 4918 { 4919 flag aSign, bSign; 4920 uint64_t av, bv; 4921 a = float64_squash_input_denormal(a, status); 4922 b = float64_squash_input_denormal(b, status); 4923 4924 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4925 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4926 ) { 4927 float_raise(float_flag_invalid, status); 4928 return 0; 4929 } 4930 aSign = extractFloat64Sign( a ); 4931 bSign = extractFloat64Sign( b ); 4932 av = float64_val(a); 4933 bv = float64_val(b); 4934 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4935 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4936 4937 } 4938 4939 /*---------------------------------------------------------------------------- 4940 | Returns 1 if the double-precision floating-point value `a' is less than 4941 | the corresponding value `b', and 0 otherwise. The invalid exception is 4942 | raised if either operand is a NaN. The comparison is performed according 4943 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4944 *----------------------------------------------------------------------------*/ 4945 4946 int float64_lt(float64 a, float64 b, float_status *status) 4947 { 4948 flag aSign, bSign; 4949 uint64_t av, bv; 4950 4951 a = float64_squash_input_denormal(a, status); 4952 b = float64_squash_input_denormal(b, status); 4953 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4954 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4955 ) { 4956 float_raise(float_flag_invalid, status); 4957 return 0; 4958 } 4959 aSign = extractFloat64Sign( a ); 4960 bSign = extractFloat64Sign( b ); 4961 av = float64_val(a); 4962 bv = float64_val(b); 4963 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4964 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4965 4966 } 4967 4968 /*---------------------------------------------------------------------------- 4969 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4970 | be compared, and 0 otherwise. The invalid exception is raised if either 4971 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4972 | Standard for Binary Floating-Point Arithmetic. 4973 *----------------------------------------------------------------------------*/ 4974 4975 int float64_unordered(float64 a, float64 b, float_status *status) 4976 { 4977 a = float64_squash_input_denormal(a, status); 4978 b = float64_squash_input_denormal(b, status); 4979 4980 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4981 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4982 ) { 4983 float_raise(float_flag_invalid, status); 4984 return 1; 4985 } 4986 return 0; 4987 } 4988 4989 /*---------------------------------------------------------------------------- 4990 | Returns 1 if the double-precision floating-point value `a' is equal to the 4991 | corresponding value `b', and 0 otherwise. 
Quiet NaNs do not cause an 4992 | exception.The comparison is performed according to the IEC/IEEE Standard 4993 | for Binary Floating-Point Arithmetic. 4994 *----------------------------------------------------------------------------*/ 4995 4996 int float64_eq_quiet(float64 a, float64 b, float_status *status) 4997 { 4998 uint64_t av, bv; 4999 a = float64_squash_input_denormal(a, status); 5000 b = float64_squash_input_denormal(b, status); 5001 5002 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5003 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5004 ) { 5005 if (float64_is_signaling_nan(a, status) 5006 || float64_is_signaling_nan(b, status)) { 5007 float_raise(float_flag_invalid, status); 5008 } 5009 return 0; 5010 } 5011 av = float64_val(a); 5012 bv = float64_val(b); 5013 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 5014 5015 } 5016 5017 /*---------------------------------------------------------------------------- 5018 | Returns 1 if the double-precision floating-point value `a' is less than or 5019 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5020 | cause an exception. Otherwise, the comparison is performed according to the 5021 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5022 *----------------------------------------------------------------------------*/ 5023 5024 int float64_le_quiet(float64 a, float64 b, float_status *status) 5025 { 5026 flag aSign, bSign; 5027 uint64_t av, bv; 5028 a = float64_squash_input_denormal(a, status); 5029 b = float64_squash_input_denormal(b, status); 5030 5031 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5032 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5033 ) { 5034 if (float64_is_signaling_nan(a, status) 5035 || float64_is_signaling_nan(b, status)) { 5036 float_raise(float_flag_invalid, status); 5037 } 5038 return 0; 5039 } 5040 aSign = extractFloat64Sign( a ); 5041 bSign = extractFloat64Sign( b ); 5042 av = float64_val(a); 5043 bv = float64_val(b); 5044 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 5045 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 5046 5047 } 5048 5049 /*---------------------------------------------------------------------------- 5050 | Returns 1 if the double-precision floating-point value `a' is less than 5051 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5052 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 5053 | Standard for Binary Floating-Point Arithmetic. 
5054 *----------------------------------------------------------------------------*/ 5055 5056 int float64_lt_quiet(float64 a, float64 b, float_status *status) 5057 { 5058 flag aSign, bSign; 5059 uint64_t av, bv; 5060 a = float64_squash_input_denormal(a, status); 5061 b = float64_squash_input_denormal(b, status); 5062 5063 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5064 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5065 ) { 5066 if (float64_is_signaling_nan(a, status) 5067 || float64_is_signaling_nan(b, status)) { 5068 float_raise(float_flag_invalid, status); 5069 } 5070 return 0; 5071 } 5072 aSign = extractFloat64Sign( a ); 5073 bSign = extractFloat64Sign( b ); 5074 av = float64_val(a); 5075 bv = float64_val(b); 5076 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 5077 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 5078 5079 } 5080 5081 /*---------------------------------------------------------------------------- 5082 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 5083 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 5084 | comparison is performed according to the IEC/IEEE Standard for Binary 5085 | Floating-Point Arithmetic. 
5086 *----------------------------------------------------------------------------*/ 5087 5088 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 5089 { 5090 a = float64_squash_input_denormal(a, status); 5091 b = float64_squash_input_denormal(b, status); 5092 5093 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5094 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5095 ) { 5096 if (float64_is_signaling_nan(a, status) 5097 || float64_is_signaling_nan(b, status)) { 5098 float_raise(float_flag_invalid, status); 5099 } 5100 return 1; 5101 } 5102 return 0; 5103 } 5104 5105 /*---------------------------------------------------------------------------- 5106 | Returns the result of converting the extended double-precision floating- 5107 | point value `a' to the 32-bit two's complement integer format. The 5108 | conversion is performed according to the IEC/IEEE Standard for Binary 5109 | Floating-Point Arithmetic---which means in particular that the conversion 5110 | is rounded according to the current rounding mode. If `a' is a NaN, the 5111 | largest positive integer is returned. Otherwise, if the conversion 5112 | overflows, the largest integer with the same sign as `a' is returned. 
5113 *----------------------------------------------------------------------------*/ 5114 5115 int32_t floatx80_to_int32(floatx80 a, float_status *status) 5116 { 5117 flag aSign; 5118 int32_t aExp, shiftCount; 5119 uint64_t aSig; 5120 5121 if (floatx80_invalid_encoding(a)) { 5122 float_raise(float_flag_invalid, status); 5123 return 1 << 31; 5124 } 5125 aSig = extractFloatx80Frac( a ); 5126 aExp = extractFloatx80Exp( a ); 5127 aSign = extractFloatx80Sign( a ); 5128 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5129 shiftCount = 0x4037 - aExp; 5130 if ( shiftCount <= 0 ) shiftCount = 1; 5131 shift64RightJamming( aSig, shiftCount, &aSig ); 5132 return roundAndPackInt32(aSign, aSig, status); 5133 5134 } 5135 5136 /*---------------------------------------------------------------------------- 5137 | Returns the result of converting the extended double-precision floating- 5138 | point value `a' to the 32-bit two's complement integer format. The 5139 | conversion is performed according to the IEC/IEEE Standard for Binary 5140 | Floating-Point Arithmetic, except that the conversion is always rounded 5141 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5142 | Otherwise, if the conversion overflows, the largest integer with the same 5143 | sign as `a' is returned. 
5144 *----------------------------------------------------------------------------*/ 5145 5146 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 5147 { 5148 flag aSign; 5149 int32_t aExp, shiftCount; 5150 uint64_t aSig, savedASig; 5151 int32_t z; 5152 5153 if (floatx80_invalid_encoding(a)) { 5154 float_raise(float_flag_invalid, status); 5155 return 1 << 31; 5156 } 5157 aSig = extractFloatx80Frac( a ); 5158 aExp = extractFloatx80Exp( a ); 5159 aSign = extractFloatx80Sign( a ); 5160 if ( 0x401E < aExp ) { 5161 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5162 goto invalid; 5163 } 5164 else if ( aExp < 0x3FFF ) { 5165 if (aExp || aSig) { 5166 status->float_exception_flags |= float_flag_inexact; 5167 } 5168 return 0; 5169 } 5170 shiftCount = 0x403E - aExp; 5171 savedASig = aSig; 5172 aSig >>= shiftCount; 5173 z = aSig; 5174 if ( aSign ) z = - z; 5175 if ( ( z < 0 ) ^ aSign ) { 5176 invalid: 5177 float_raise(float_flag_invalid, status); 5178 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5179 } 5180 if ( ( aSig<<shiftCount ) != savedASig ) { 5181 status->float_exception_flags |= float_flag_inexact; 5182 } 5183 return z; 5184 5185 } 5186 5187 /*---------------------------------------------------------------------------- 5188 | Returns the result of converting the extended double-precision floating- 5189 | point value `a' to the 64-bit two's complement integer format. The 5190 | conversion is performed according to the IEC/IEEE Standard for Binary 5191 | Floating-Point Arithmetic---which means in particular that the conversion 5192 | is rounded according to the current rounding mode. If `a' is a NaN, 5193 | the largest positive integer is returned. Otherwise, if the conversion 5194 | overflows, the largest integer with the same sign as `a' is returned. 
5195 *----------------------------------------------------------------------------*/ 5196 5197 int64_t floatx80_to_int64(floatx80 a, float_status *status) 5198 { 5199 flag aSign; 5200 int32_t aExp, shiftCount; 5201 uint64_t aSig, aSigExtra; 5202 5203 if (floatx80_invalid_encoding(a)) { 5204 float_raise(float_flag_invalid, status); 5205 return 1ULL << 63; 5206 } 5207 aSig = extractFloatx80Frac( a ); 5208 aExp = extractFloatx80Exp( a ); 5209 aSign = extractFloatx80Sign( a ); 5210 shiftCount = 0x403E - aExp; 5211 if ( shiftCount <= 0 ) { 5212 if ( shiftCount ) { 5213 float_raise(float_flag_invalid, status); 5214 if (!aSign || floatx80_is_any_nan(a)) { 5215 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5216 } 5217 return (int64_t) LIT64( 0x8000000000000000 ); 5218 } 5219 aSigExtra = 0; 5220 } 5221 else { 5222 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 5223 } 5224 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 5225 5226 } 5227 5228 /*---------------------------------------------------------------------------- 5229 | Returns the result of converting the extended double-precision floating- 5230 | point value `a' to the 64-bit two's complement integer format. The 5231 | conversion is performed according to the IEC/IEEE Standard for Binary 5232 | Floating-Point Arithmetic, except that the conversion is always rounded 5233 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5234 | Otherwise, if the conversion overflows, the largest integer with the same 5235 | sign as `a' is returned. 
5236 *----------------------------------------------------------------------------*/ 5237 5238 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 5239 { 5240 flag aSign; 5241 int32_t aExp, shiftCount; 5242 uint64_t aSig; 5243 int64_t z; 5244 5245 if (floatx80_invalid_encoding(a)) { 5246 float_raise(float_flag_invalid, status); 5247 return 1ULL << 63; 5248 } 5249 aSig = extractFloatx80Frac( a ); 5250 aExp = extractFloatx80Exp( a ); 5251 aSign = extractFloatx80Sign( a ); 5252 shiftCount = aExp - 0x403E; 5253 if ( 0 <= shiftCount ) { 5254 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 5255 if ( ( a.high != 0xC03E ) || aSig ) { 5256 float_raise(float_flag_invalid, status); 5257 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 5258 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5259 } 5260 } 5261 return (int64_t) LIT64( 0x8000000000000000 ); 5262 } 5263 else if ( aExp < 0x3FFF ) { 5264 if (aExp | aSig) { 5265 status->float_exception_flags |= float_flag_inexact; 5266 } 5267 return 0; 5268 } 5269 z = aSig>>( - shiftCount ); 5270 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 5271 status->float_exception_flags |= float_flag_inexact; 5272 } 5273 if ( aSign ) z = - z; 5274 return z; 5275 5276 } 5277 5278 /*---------------------------------------------------------------------------- 5279 | Returns the result of converting the extended double-precision floating- 5280 | point value `a' to the single-precision floating-point format. The 5281 | conversion is performed according to the IEC/IEEE Standard for Binary 5282 | Floating-Point Arithmetic. 
5283 *----------------------------------------------------------------------------*/ 5284 5285 float32 floatx80_to_float32(floatx80 a, float_status *status) 5286 { 5287 flag aSign; 5288 int32_t aExp; 5289 uint64_t aSig; 5290 5291 if (floatx80_invalid_encoding(a)) { 5292 float_raise(float_flag_invalid, status); 5293 return float32_default_nan(status); 5294 } 5295 aSig = extractFloatx80Frac( a ); 5296 aExp = extractFloatx80Exp( a ); 5297 aSign = extractFloatx80Sign( a ); 5298 if ( aExp == 0x7FFF ) { 5299 if ( (uint64_t) ( aSig<<1 ) ) { 5300 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 5301 } 5302 return packFloat32( aSign, 0xFF, 0 ); 5303 } 5304 shift64RightJamming( aSig, 33, &aSig ); 5305 if ( aExp || aSig ) aExp -= 0x3F81; 5306 return roundAndPackFloat32(aSign, aExp, aSig, status); 5307 5308 } 5309 5310 /*---------------------------------------------------------------------------- 5311 | Returns the result of converting the extended double-precision floating- 5312 | point value `a' to the double-precision floating-point format. The 5313 | conversion is performed according to the IEC/IEEE Standard for Binary 5314 | Floating-Point Arithmetic. 
5315 *----------------------------------------------------------------------------*/ 5316 5317 float64 floatx80_to_float64(floatx80 a, float_status *status) 5318 { 5319 flag aSign; 5320 int32_t aExp; 5321 uint64_t aSig, zSig; 5322 5323 if (floatx80_invalid_encoding(a)) { 5324 float_raise(float_flag_invalid, status); 5325 return float64_default_nan(status); 5326 } 5327 aSig = extractFloatx80Frac( a ); 5328 aExp = extractFloatx80Exp( a ); 5329 aSign = extractFloatx80Sign( a ); 5330 if ( aExp == 0x7FFF ) { 5331 if ( (uint64_t) ( aSig<<1 ) ) { 5332 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 5333 } 5334 return packFloat64( aSign, 0x7FF, 0 ); 5335 } 5336 shift64RightJamming( aSig, 1, &zSig ); 5337 if ( aExp || aSig ) aExp -= 0x3C01; 5338 return roundAndPackFloat64(aSign, aExp, zSig, status); 5339 5340 } 5341 5342 /*---------------------------------------------------------------------------- 5343 | Returns the result of converting the extended double-precision floating- 5344 | point value `a' to the quadruple-precision floating-point format. The 5345 | conversion is performed according to the IEC/IEEE Standard for Binary 5346 | Floating-Point Arithmetic. 
5347 *----------------------------------------------------------------------------*/ 5348 5349 float128 floatx80_to_float128(floatx80 a, float_status *status) 5350 { 5351 flag aSign; 5352 int aExp; 5353 uint64_t aSig, zSig0, zSig1; 5354 5355 if (floatx80_invalid_encoding(a)) { 5356 float_raise(float_flag_invalid, status); 5357 return float128_default_nan(status); 5358 } 5359 aSig = extractFloatx80Frac( a ); 5360 aExp = extractFloatx80Exp( a ); 5361 aSign = extractFloatx80Sign( a ); 5362 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5363 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 5364 } 5365 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5366 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5367 5368 } 5369 5370 /*---------------------------------------------------------------------------- 5371 | Rounds the extended double-precision floating-point value `a' 5372 | to the precision provided by floatx80_rounding_precision and returns the 5373 | result as an extended double-precision floating-point value. 5374 | The operation is performed according to the IEC/IEEE Standard for Binary 5375 | Floating-Point Arithmetic. 5376 *----------------------------------------------------------------------------*/ 5377 5378 floatx80 floatx80_round(floatx80 a, float_status *status) 5379 { 5380 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5381 extractFloatx80Sign(a), 5382 extractFloatx80Exp(a), 5383 extractFloatx80Frac(a), 0, status); 5384 } 5385 5386 /*---------------------------------------------------------------------------- 5387 | Rounds the extended double-precision floating-point value `a' to an integer, 5388 | and returns the result as an extended quadruple-precision floating-point 5389 | value. The operation is performed according to the IEC/IEEE Standard for 5390 | Binary Floating-Point Arithmetic. 
5391 *----------------------------------------------------------------------------*/ 5392 5393 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 5394 { 5395 flag aSign; 5396 int32_t aExp; 5397 uint64_t lastBitMask, roundBitsMask; 5398 floatx80 z; 5399 5400 if (floatx80_invalid_encoding(a)) { 5401 float_raise(float_flag_invalid, status); 5402 return floatx80_default_nan(status); 5403 } 5404 aExp = extractFloatx80Exp( a ); 5405 if ( 0x403E <= aExp ) { 5406 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 5407 return propagateFloatx80NaN(a, a, status); 5408 } 5409 return a; 5410 } 5411 if ( aExp < 0x3FFF ) { 5412 if ( ( aExp == 0 ) 5413 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 5414 return a; 5415 } 5416 status->float_exception_flags |= float_flag_inexact; 5417 aSign = extractFloatx80Sign( a ); 5418 switch (status->float_rounding_mode) { 5419 case float_round_nearest_even: 5420 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 5421 ) { 5422 return 5423 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5424 } 5425 break; 5426 case float_round_ties_away: 5427 if (aExp == 0x3FFE) { 5428 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 5429 } 5430 break; 5431 case float_round_down: 5432 return 5433 aSign ? 5434 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 5435 : packFloatx80( 0, 0, 0 ); 5436 case float_round_up: 5437 return 5438 aSign ? 
packFloatx80( 1, 0, 0 ) 5439 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5440 } 5441 return packFloatx80( aSign, 0, 0 ); 5442 } 5443 lastBitMask = 1; 5444 lastBitMask <<= 0x403E - aExp; 5445 roundBitsMask = lastBitMask - 1; 5446 z = a; 5447 switch (status->float_rounding_mode) { 5448 case float_round_nearest_even: 5449 z.low += lastBitMask>>1; 5450 if ((z.low & roundBitsMask) == 0) { 5451 z.low &= ~lastBitMask; 5452 } 5453 break; 5454 case float_round_ties_away: 5455 z.low += lastBitMask >> 1; 5456 break; 5457 case float_round_to_zero: 5458 break; 5459 case float_round_up: 5460 if (!extractFloatx80Sign(z)) { 5461 z.low += roundBitsMask; 5462 } 5463 break; 5464 case float_round_down: 5465 if (extractFloatx80Sign(z)) { 5466 z.low += roundBitsMask; 5467 } 5468 break; 5469 default: 5470 abort(); 5471 } 5472 z.low &= ~ roundBitsMask; 5473 if ( z.low == 0 ) { 5474 ++z.high; 5475 z.low = LIT64( 0x8000000000000000 ); 5476 } 5477 if (z.low != a.low) { 5478 status->float_exception_flags |= float_flag_inexact; 5479 } 5480 return z; 5481 5482 } 5483 5484 /*---------------------------------------------------------------------------- 5485 | Returns the result of adding the absolute values of the extended double- 5486 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 5487 | negated before being returned. `zSign' is ignored if the result is a NaN. 5488 | The addition is performed according to the IEC/IEEE Standard for Binary 5489 | Floating-Point Arithmetic. 
5490 *----------------------------------------------------------------------------*/ 5491 5492 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5493 float_status *status) 5494 { 5495 int32_t aExp, bExp, zExp; 5496 uint64_t aSig, bSig, zSig0, zSig1; 5497 int32_t expDiff; 5498 5499 aSig = extractFloatx80Frac( a ); 5500 aExp = extractFloatx80Exp( a ); 5501 bSig = extractFloatx80Frac( b ); 5502 bExp = extractFloatx80Exp( b ); 5503 expDiff = aExp - bExp; 5504 if ( 0 < expDiff ) { 5505 if ( aExp == 0x7FFF ) { 5506 if ((uint64_t)(aSig << 1)) { 5507 return propagateFloatx80NaN(a, b, status); 5508 } 5509 return a; 5510 } 5511 if ( bExp == 0 ) --expDiff; 5512 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5513 zExp = aExp; 5514 } 5515 else if ( expDiff < 0 ) { 5516 if ( bExp == 0x7FFF ) { 5517 if ((uint64_t)(bSig << 1)) { 5518 return propagateFloatx80NaN(a, b, status); 5519 } 5520 return packFloatx80(zSign, 5521 floatx80_infinity_high, 5522 floatx80_infinity_low); 5523 } 5524 if ( aExp == 0 ) ++expDiff; 5525 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5526 zExp = bExp; 5527 } 5528 else { 5529 if ( aExp == 0x7FFF ) { 5530 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5531 return propagateFloatx80NaN(a, b, status); 5532 } 5533 return a; 5534 } 5535 zSig1 = 0; 5536 zSig0 = aSig + bSig; 5537 if ( aExp == 0 ) { 5538 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 5539 goto roundAndPack; 5540 } 5541 zExp = aExp; 5542 goto shiftRight1; 5543 } 5544 zSig0 = aSig + bSig; 5545 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 5546 shiftRight1: 5547 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5548 zSig0 |= LIT64( 0x8000000000000000 ); 5549 ++zExp; 5550 roundAndPack: 5551 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5552 zSign, zExp, zSig0, zSig1, status); 5553 } 5554 5555 /*---------------------------------------------------------------------------- 5556 | Returns the result of subtracting the absolute 
values of the extended 5557 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 5558 | difference is negated before being returned. `zSign' is ignored if the 5559 | result is a NaN. The subtraction is performed according to the IEC/IEEE 5560 | Standard for Binary Floating-Point Arithmetic. 5561 *----------------------------------------------------------------------------*/ 5562 5563 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5564 float_status *status) 5565 { 5566 int32_t aExp, bExp, zExp; 5567 uint64_t aSig, bSig, zSig0, zSig1; 5568 int32_t expDiff; 5569 5570 aSig = extractFloatx80Frac( a ); 5571 aExp = extractFloatx80Exp( a ); 5572 bSig = extractFloatx80Frac( b ); 5573 bExp = extractFloatx80Exp( b ); 5574 expDiff = aExp - bExp; 5575 if ( 0 < expDiff ) goto aExpBigger; 5576 if ( expDiff < 0 ) goto bExpBigger; 5577 if ( aExp == 0x7FFF ) { 5578 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5579 return propagateFloatx80NaN(a, b, status); 5580 } 5581 float_raise(float_flag_invalid, status); 5582 return floatx80_default_nan(status); 5583 } 5584 if ( aExp == 0 ) { 5585 aExp = 1; 5586 bExp = 1; 5587 } 5588 zSig1 = 0; 5589 if ( bSig < aSig ) goto aBigger; 5590 if ( aSig < bSig ) goto bBigger; 5591 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 5592 bExpBigger: 5593 if ( bExp == 0x7FFF ) { 5594 if ((uint64_t)(bSig << 1)) { 5595 return propagateFloatx80NaN(a, b, status); 5596 } 5597 return packFloatx80(zSign ^ 1, floatx80_infinity_high, 5598 floatx80_infinity_low); 5599 } 5600 if ( aExp == 0 ) ++expDiff; 5601 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5602 bBigger: 5603 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 5604 zExp = bExp; 5605 zSign ^= 1; 5606 goto normalizeRoundAndPack; 5607 aExpBigger: 5608 if ( aExp == 0x7FFF ) { 5609 if ((uint64_t)(aSig << 1)) { 5610 return propagateFloatx80NaN(a, b, status); 5611 } 5612 return a; 5613 } 5614 if ( bExp == 0 ) --expDiff; 5615 
shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5616 aBigger: 5617 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 5618 zExp = aExp; 5619 normalizeRoundAndPack: 5620 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 5621 zSign, zExp, zSig0, zSig1, status); 5622 } 5623 5624 /*---------------------------------------------------------------------------- 5625 | Returns the result of adding the extended double-precision floating-point 5626 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5627 | Standard for Binary Floating-Point Arithmetic. 5628 *----------------------------------------------------------------------------*/ 5629 5630 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 5631 { 5632 flag aSign, bSign; 5633 5634 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5635 float_raise(float_flag_invalid, status); 5636 return floatx80_default_nan(status); 5637 } 5638 aSign = extractFloatx80Sign( a ); 5639 bSign = extractFloatx80Sign( b ); 5640 if ( aSign == bSign ) { 5641 return addFloatx80Sigs(a, b, aSign, status); 5642 } 5643 else { 5644 return subFloatx80Sigs(a, b, aSign, status); 5645 } 5646 5647 } 5648 5649 /*---------------------------------------------------------------------------- 5650 | Returns the result of subtracting the extended double-precision floating- 5651 | point values `a' and `b'. The operation is performed according to the 5652 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5653 *----------------------------------------------------------------------------*/ 5654 5655 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5656 { 5657 flag aSign, bSign; 5658 5659 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5660 float_raise(float_flag_invalid, status); 5661 return floatx80_default_nan(status); 5662 } 5663 aSign = extractFloatx80Sign( a ); 5664 bSign = extractFloatx80Sign( b ); 5665 if ( aSign == bSign ) { 5666 return subFloatx80Sigs(a, b, aSign, status); 5667 } 5668 else { 5669 return addFloatx80Sigs(a, b, aSign, status); 5670 } 5671 5672 } 5673 5674 /*---------------------------------------------------------------------------- 5675 | Returns the result of multiplying the extended double-precision floating- 5676 | point values `a' and `b'. The operation is performed according to the 5677 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5678 *----------------------------------------------------------------------------*/ 5679 5680 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5681 { 5682 flag aSign, bSign, zSign; 5683 int32_t aExp, bExp, zExp; 5684 uint64_t aSig, bSig, zSig0, zSig1; 5685 5686 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5687 float_raise(float_flag_invalid, status); 5688 return floatx80_default_nan(status); 5689 } 5690 aSig = extractFloatx80Frac( a ); 5691 aExp = extractFloatx80Exp( a ); 5692 aSign = extractFloatx80Sign( a ); 5693 bSig = extractFloatx80Frac( b ); 5694 bExp = extractFloatx80Exp( b ); 5695 bSign = extractFloatx80Sign( b ); 5696 zSign = aSign ^ bSign; 5697 if ( aExp == 0x7FFF ) { 5698 if ( (uint64_t) ( aSig<<1 ) 5699 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5700 return propagateFloatx80NaN(a, b, status); 5701 } 5702 if ( ( bExp | bSig ) == 0 ) goto invalid; 5703 return packFloatx80(zSign, floatx80_infinity_high, 5704 floatx80_infinity_low); 5705 } 5706 if ( bExp == 0x7FFF ) { 5707 if ((uint64_t)(bSig << 
1)) { 5708 return propagateFloatx80NaN(a, b, status); 5709 } 5710 if ( ( aExp | aSig ) == 0 ) { 5711 invalid: 5712 float_raise(float_flag_invalid, status); 5713 return floatx80_default_nan(status); 5714 } 5715 return packFloatx80(zSign, floatx80_infinity_high, 5716 floatx80_infinity_low); 5717 } 5718 if ( aExp == 0 ) { 5719 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5720 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5721 } 5722 if ( bExp == 0 ) { 5723 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5724 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5725 } 5726 zExp = aExp + bExp - 0x3FFE; 5727 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 5728 if ( 0 < (int64_t) zSig0 ) { 5729 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5730 --zExp; 5731 } 5732 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5733 zSign, zExp, zSig0, zSig1, status); 5734 } 5735 5736 /*---------------------------------------------------------------------------- 5737 | Returns the result of dividing the extended double-precision floating-point 5738 | value `a' by the corresponding value `b'. The operation is performed 5739 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5740 *----------------------------------------------------------------------------*/ 5741 5742 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 5743 { 5744 flag aSign, bSign, zSign; 5745 int32_t aExp, bExp, zExp; 5746 uint64_t aSig, bSig, zSig0, zSig1; 5747 uint64_t rem0, rem1, rem2, term0, term1, term2; 5748 5749 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5750 float_raise(float_flag_invalid, status); 5751 return floatx80_default_nan(status); 5752 } 5753 aSig = extractFloatx80Frac( a ); 5754 aExp = extractFloatx80Exp( a ); 5755 aSign = extractFloatx80Sign( a ); 5756 bSig = extractFloatx80Frac( b ); 5757 bExp = extractFloatx80Exp( b ); 5758 bSign = extractFloatx80Sign( b ); 5759 zSign = aSign ^ bSign; 5760 if ( aExp == 0x7FFF ) { 5761 if ((uint64_t)(aSig << 1)) { 5762 return propagateFloatx80NaN(a, b, status); 5763 } 5764 if ( bExp == 0x7FFF ) { 5765 if ((uint64_t)(bSig << 1)) { 5766 return propagateFloatx80NaN(a, b, status); 5767 } 5768 goto invalid; 5769 } 5770 return packFloatx80(zSign, floatx80_infinity_high, 5771 floatx80_infinity_low); 5772 } 5773 if ( bExp == 0x7FFF ) { 5774 if ((uint64_t)(bSig << 1)) { 5775 return propagateFloatx80NaN(a, b, status); 5776 } 5777 return packFloatx80( zSign, 0, 0 ); 5778 } 5779 if ( bExp == 0 ) { 5780 if ( bSig == 0 ) { 5781 if ( ( aExp | aSig ) == 0 ) { 5782 invalid: 5783 float_raise(float_flag_invalid, status); 5784 return floatx80_default_nan(status); 5785 } 5786 float_raise(float_flag_divbyzero, status); 5787 return packFloatx80(zSign, floatx80_infinity_high, 5788 floatx80_infinity_low); 5789 } 5790 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5791 } 5792 if ( aExp == 0 ) { 5793 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5794 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5795 } 5796 zExp = aExp - bExp + 0x3FFE; 5797 rem1 = 0; 5798 if ( bSig <= aSig ) { 5799 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 5800 ++zExp; 5801 } 5802 zSig0 = estimateDiv128To64( aSig, 
rem1, bSig ); 5803 mul64To128( bSig, zSig0, &term0, &term1 ); 5804 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 5805 while ( (int64_t) rem0 < 0 ) { 5806 --zSig0; 5807 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 5808 } 5809 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 5810 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 5811 mul64To128( bSig, zSig1, &term1, &term2 ); 5812 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5813 while ( (int64_t) rem1 < 0 ) { 5814 --zSig1; 5815 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5816 } 5817 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5818 } 5819 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5820 zSign, zExp, zSig0, zSig1, status); 5821 } 5822 5823 /*---------------------------------------------------------------------------- 5824 | Returns the remainder of the extended double-precision floating-point value 5825 | `a' with respect to the corresponding value `b'. The operation is performed 5826 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5827 *----------------------------------------------------------------------------*/ 5828 5829 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 5830 { 5831 flag aSign, zSign; 5832 int32_t aExp, bExp, expDiff; 5833 uint64_t aSig0, aSig1, bSig; 5834 uint64_t q, term0, term1, alternateASig0, alternateASig1; 5835 5836 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5837 float_raise(float_flag_invalid, status); 5838 return floatx80_default_nan(status); 5839 } 5840 aSig0 = extractFloatx80Frac( a ); 5841 aExp = extractFloatx80Exp( a ); 5842 aSign = extractFloatx80Sign( a ); 5843 bSig = extractFloatx80Frac( b ); 5844 bExp = extractFloatx80Exp( b ); 5845 if ( aExp == 0x7FFF ) { 5846 if ( (uint64_t) ( aSig0<<1 ) 5847 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5848 return propagateFloatx80NaN(a, b, status); 5849 } 5850 goto invalid; 5851 } 5852 if ( bExp == 0x7FFF ) { 5853 if ((uint64_t)(bSig << 1)) { 5854 return propagateFloatx80NaN(a, b, status); 5855 } 5856 return a; 5857 } 5858 if ( bExp == 0 ) { 5859 if ( bSig == 0 ) { 5860 invalid: 5861 float_raise(float_flag_invalid, status); 5862 return floatx80_default_nan(status); 5863 } 5864 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5865 } 5866 if ( aExp == 0 ) { 5867 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5868 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5869 } 5870 bSig |= LIT64( 0x8000000000000000 ); 5871 zSign = aSign; 5872 expDiff = aExp - bExp; 5873 aSig1 = 0; 5874 if ( expDiff < 0 ) { 5875 if ( expDiff < -1 ) return a; 5876 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 5877 expDiff = 0; 5878 } 5879 q = ( bSig <= aSig0 ); 5880 if ( q ) aSig0 -= bSig; 5881 expDiff -= 64; 5882 while ( 0 < expDiff ) { 5883 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5884 q = ( 2 < q ) ? 
q - 2 : 0; 5885 mul64To128( bSig, q, &term0, &term1 ); 5886 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5887 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 5888 expDiff -= 62; 5889 } 5890 expDiff += 64; 5891 if ( 0 < expDiff ) { 5892 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5893 q = ( 2 < q ) ? q - 2 : 0; 5894 q >>= 64 - expDiff; 5895 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 5896 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5897 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 5898 while ( le128( term0, term1, aSig0, aSig1 ) ) { 5899 ++q; 5900 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5901 } 5902 } 5903 else { 5904 term1 = 0; 5905 term0 = bSig; 5906 } 5907 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 5908 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5909 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5910 && ( q & 1 ) ) 5911 ) { 5912 aSig0 = alternateASig0; 5913 aSig1 = alternateASig1; 5914 zSign = ! zSign; 5915 } 5916 return 5917 normalizeRoundAndPackFloatx80( 5918 80, zSign, bExp + expDiff, aSig0, aSig1, status); 5919 5920 } 5921 5922 /*---------------------------------------------------------------------------- 5923 | Returns the square root of the extended double-precision floating-point 5924 | value `a'. The operation is performed according to the IEC/IEEE Standard 5925 | for Binary Floating-Point Arithmetic. 
5926 *----------------------------------------------------------------------------*/ 5927 5928 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 5929 { 5930 flag aSign; 5931 int32_t aExp, zExp; 5932 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 5933 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5934 5935 if (floatx80_invalid_encoding(a)) { 5936 float_raise(float_flag_invalid, status); 5937 return floatx80_default_nan(status); 5938 } 5939 aSig0 = extractFloatx80Frac( a ); 5940 aExp = extractFloatx80Exp( a ); 5941 aSign = extractFloatx80Sign( a ); 5942 if ( aExp == 0x7FFF ) { 5943 if ((uint64_t)(aSig0 << 1)) { 5944 return propagateFloatx80NaN(a, a, status); 5945 } 5946 if ( ! aSign ) return a; 5947 goto invalid; 5948 } 5949 if ( aSign ) { 5950 if ( ( aExp | aSig0 ) == 0 ) return a; 5951 invalid: 5952 float_raise(float_flag_invalid, status); 5953 return floatx80_default_nan(status); 5954 } 5955 if ( aExp == 0 ) { 5956 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 5957 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5958 } 5959 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 5960 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 5961 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 5962 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5963 doubleZSig0 = zSig0<<1; 5964 mul64To128( zSig0, zSig0, &term0, &term1 ); 5965 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5966 while ( (int64_t) rem0 < 0 ) { 5967 --zSig0; 5968 doubleZSig0 -= 2; 5969 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5970 } 5971 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5972 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 5973 if ( zSig1 == 0 ) zSig1 = 1; 5974 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5975 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5976 mul64To128( zSig1, zSig1, &term2, &term3 ); 5977 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5978 while ( (int64_t) rem1 < 0 ) { 
5979 --zSig1; 5980 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5981 term3 |= 1; 5982 term2 |= doubleZSig0; 5983 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5984 } 5985 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5986 } 5987 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 5988 zSig0 |= doubleZSig0; 5989 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5990 0, zExp, zSig0, zSig1, status); 5991 } 5992 5993 /*---------------------------------------------------------------------------- 5994 | Returns 1 if the extended double-precision floating-point value `a' is equal 5995 | to the corresponding value `b', and 0 otherwise. The invalid exception is 5996 | raised if either operand is a NaN. Otherwise, the comparison is performed 5997 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5998 *----------------------------------------------------------------------------*/ 5999 6000 int floatx80_eq(floatx80 a, floatx80 b, float_status *status) 6001 { 6002 6003 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6004 || (extractFloatx80Exp(a) == 0x7FFF 6005 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6006 || (extractFloatx80Exp(b) == 0x7FFF 6007 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6008 ) { 6009 float_raise(float_flag_invalid, status); 6010 return 0; 6011 } 6012 return 6013 ( a.low == b.low ) 6014 && ( ( a.high == b.high ) 6015 || ( ( a.low == 0 ) 6016 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6017 ); 6018 6019 } 6020 6021 /*---------------------------------------------------------------------------- 6022 | Returns 1 if the extended double-precision floating-point value `a' is 6023 | less than or equal to the corresponding value `b', and 0 otherwise. The 6024 | invalid exception is raised if either operand is a NaN. The comparison is 6025 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6026 | Arithmetic. 
6027 *----------------------------------------------------------------------------*/ 6028 6029 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 6030 { 6031 flag aSign, bSign; 6032 6033 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6034 || (extractFloatx80Exp(a) == 0x7FFF 6035 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6036 || (extractFloatx80Exp(b) == 0x7FFF 6037 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6038 ) { 6039 float_raise(float_flag_invalid, status); 6040 return 0; 6041 } 6042 aSign = extractFloatx80Sign( a ); 6043 bSign = extractFloatx80Sign( b ); 6044 if ( aSign != bSign ) { 6045 return 6046 aSign 6047 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6048 == 0 ); 6049 } 6050 return 6051 aSign ? le128( b.high, b.low, a.high, a.low ) 6052 : le128( a.high, a.low, b.high, b.low ); 6053 6054 } 6055 6056 /*---------------------------------------------------------------------------- 6057 | Returns 1 if the extended double-precision floating-point value `a' is 6058 | less than the corresponding value `b', and 0 otherwise. The invalid 6059 | exception is raised if either operand is a NaN. The comparison is performed 6060 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6061 *----------------------------------------------------------------------------*/ 6062 6063 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 6064 { 6065 flag aSign, bSign; 6066 6067 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6068 || (extractFloatx80Exp(a) == 0x7FFF 6069 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6070 || (extractFloatx80Exp(b) == 0x7FFF 6071 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6072 ) { 6073 float_raise(float_flag_invalid, status); 6074 return 0; 6075 } 6076 aSign = extractFloatx80Sign( a ); 6077 bSign = extractFloatx80Sign( b ); 6078 if ( aSign != bSign ) { 6079 return 6080 aSign 6081 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6082 != 0 ); 6083 } 6084 return 6085 aSign ? lt128( b.high, b.low, a.high, a.low ) 6086 : lt128( a.high, a.low, b.high, b.low ); 6087 6088 } 6089 6090 /*---------------------------------------------------------------------------- 6091 | Returns 1 if the extended double-precision floating-point values `a' and `b' 6092 | cannot be compared, and 0 otherwise. The invalid exception is raised if 6093 | either operand is a NaN. The comparison is performed according to the 6094 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6095 *----------------------------------------------------------------------------*/ 6096 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 6097 { 6098 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6099 || (extractFloatx80Exp(a) == 0x7FFF 6100 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6101 || (extractFloatx80Exp(b) == 0x7FFF 6102 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6103 ) { 6104 float_raise(float_flag_invalid, status); 6105 return 1; 6106 } 6107 return 0; 6108 } 6109 6110 /*---------------------------------------------------------------------------- 6111 | Returns 1 if the extended double-precision floating-point value `a' is 6112 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 6113 | cause an exception. The comparison is performed according to the IEC/IEEE 6114 | Standard for Binary Floating-Point Arithmetic. 6115 *----------------------------------------------------------------------------*/ 6116 6117 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 6118 { 6119 6120 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6121 float_raise(float_flag_invalid, status); 6122 return 0; 6123 } 6124 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6125 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6126 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6127 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6128 ) { 6129 if (floatx80_is_signaling_nan(a, status) 6130 || floatx80_is_signaling_nan(b, status)) { 6131 float_raise(float_flag_invalid, status); 6132 } 6133 return 0; 6134 } 6135 return 6136 ( a.low == b.low ) 6137 && ( ( a.high == b.high ) 6138 || ( ( a.low == 0 ) 6139 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6140 ); 6141 6142 } 6143 6144 /*---------------------------------------------------------------------------- 6145 | Returns 1 if the extended double-precision floating-point value `a' is less 6146 | than or equal to the corresponding value `b', and 0 
otherwise. Quiet NaNs
| do not cause an exception. Otherwise, the comparison is performed according
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
{
    flag aSign, bSign;
    int nanOperand;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    nanOperand = ((extractFloatx80Exp(a) == 0x7FFF)
                  && (uint64_t)(extractFloatx80Frac(a) << 1))
              || ((extractFloatx80Exp(b) == 0x7FFF)
                  && (uint64_t)(extractFloatx80Frac(b) << 1));
    if (nanOperand) {
        /* Only a signaling NaN raises the invalid exception here. */
        if (floatx80_is_signaling_nan(a, status)
            || floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return 0;
    }

    aSign = extractFloatx80Sign(a);
    bSign = extractFloatx80Sign(b);
    if (aSign != bSign) {
        if (aSign) {
            return 1;
        }
        /* Non-negative `a', negative `b': true only when both are zero. */
        return ((((uint16_t)((a.high | b.high) << 1)) | a.low | b.low) == 0);
    }
    if (aSign) {
        return le128(b.high, b.low, a.high, a.low);
    }
    return le128(a.high, a.low, b.high, b.low);
}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is less
| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
| an exception. Otherwise, the comparison is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6189 *----------------------------------------------------------------------------*/ 6190 6191 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 6192 { 6193 flag aSign, bSign; 6194 6195 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6196 float_raise(float_flag_invalid, status); 6197 return 0; 6198 } 6199 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6200 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6201 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6202 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6203 ) { 6204 if (floatx80_is_signaling_nan(a, status) 6205 || floatx80_is_signaling_nan(b, status)) { 6206 float_raise(float_flag_invalid, status); 6207 } 6208 return 0; 6209 } 6210 aSign = extractFloatx80Sign( a ); 6211 bSign = extractFloatx80Sign( b ); 6212 if ( aSign != bSign ) { 6213 return 6214 aSign 6215 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6216 != 0 ); 6217 } 6218 return 6219 aSign ? lt128( b.high, b.low, a.high, a.low ) 6220 : lt128( a.high, a.low, b.high, b.low ); 6221 6222 } 6223 6224 /*---------------------------------------------------------------------------- 6225 | Returns 1 if the extended double-precision floating-point values `a' and `b' 6226 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 6227 | The comparison is performed according to the IEC/IEEE Standard for Binary 6228 | Floating-Point Arithmetic. 
6229 *----------------------------------------------------------------------------*/ 6230 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 6231 { 6232 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6233 float_raise(float_flag_invalid, status); 6234 return 1; 6235 } 6236 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6237 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6238 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6239 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6240 ) { 6241 if (floatx80_is_signaling_nan(a, status) 6242 || floatx80_is_signaling_nan(b, status)) { 6243 float_raise(float_flag_invalid, status); 6244 } 6245 return 1; 6246 } 6247 return 0; 6248 } 6249 6250 /*---------------------------------------------------------------------------- 6251 | Returns the result of converting the quadruple-precision floating-point 6252 | value `a' to the 32-bit two's complement integer format. The conversion 6253 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6254 | Arithmetic---which means in particular that the conversion is rounded 6255 | according to the current rounding mode. If `a' is a NaN, the largest 6256 | positive integer is returned. Otherwise, if the conversion overflows, the 6257 | largest integer with the same sign as `a' is returned. 
6258 *----------------------------------------------------------------------------*/ 6259 6260 int32_t float128_to_int32(float128 a, float_status *status) 6261 { 6262 flag aSign; 6263 int32_t aExp, shiftCount; 6264 uint64_t aSig0, aSig1; 6265 6266 aSig1 = extractFloat128Frac1( a ); 6267 aSig0 = extractFloat128Frac0( a ); 6268 aExp = extractFloat128Exp( a ); 6269 aSign = extractFloat128Sign( a ); 6270 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 6271 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6272 aSig0 |= ( aSig1 != 0 ); 6273 shiftCount = 0x4028 - aExp; 6274 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 6275 return roundAndPackInt32(aSign, aSig0, status); 6276 6277 } 6278 6279 /*---------------------------------------------------------------------------- 6280 | Returns the result of converting the quadruple-precision floating-point 6281 | value `a' to the 32-bit two's complement integer format. The conversion 6282 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6283 | Arithmetic, except that the conversion is always rounded toward zero. If 6284 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 6285 | conversion overflows, the largest integer with the same sign as `a' is 6286 | returned. 
6287 *----------------------------------------------------------------------------*/ 6288 6289 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 6290 { 6291 flag aSign; 6292 int32_t aExp, shiftCount; 6293 uint64_t aSig0, aSig1, savedASig; 6294 int32_t z; 6295 6296 aSig1 = extractFloat128Frac1( a ); 6297 aSig0 = extractFloat128Frac0( a ); 6298 aExp = extractFloat128Exp( a ); 6299 aSign = extractFloat128Sign( a ); 6300 aSig0 |= ( aSig1 != 0 ); 6301 if ( 0x401E < aExp ) { 6302 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 6303 goto invalid; 6304 } 6305 else if ( aExp < 0x3FFF ) { 6306 if (aExp || aSig0) { 6307 status->float_exception_flags |= float_flag_inexact; 6308 } 6309 return 0; 6310 } 6311 aSig0 |= LIT64( 0x0001000000000000 ); 6312 shiftCount = 0x402F - aExp; 6313 savedASig = aSig0; 6314 aSig0 >>= shiftCount; 6315 z = aSig0; 6316 if ( aSign ) z = - z; 6317 if ( ( z < 0 ) ^ aSign ) { 6318 invalid: 6319 float_raise(float_flag_invalid, status); 6320 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 6321 } 6322 if ( ( aSig0<<shiftCount ) != savedASig ) { 6323 status->float_exception_flags |= float_flag_inexact; 6324 } 6325 return z; 6326 6327 } 6328 6329 /*---------------------------------------------------------------------------- 6330 | Returns the result of converting the quadruple-precision floating-point 6331 | value `a' to the 64-bit two's complement integer format. The conversion 6332 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6333 | Arithmetic---which means in particular that the conversion is rounded 6334 | according to the current rounding mode. If `a' is a NaN, the largest 6335 | positive integer is returned. Otherwise, if the conversion overflows, the 6336 | largest integer with the same sign as `a' is returned. 
6337 *----------------------------------------------------------------------------*/ 6338 6339 int64_t float128_to_int64(float128 a, float_status *status) 6340 { 6341 flag aSign; 6342 int32_t aExp, shiftCount; 6343 uint64_t aSig0, aSig1; 6344 6345 aSig1 = extractFloat128Frac1( a ); 6346 aSig0 = extractFloat128Frac0( a ); 6347 aExp = extractFloat128Exp( a ); 6348 aSign = extractFloat128Sign( a ); 6349 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6350 shiftCount = 0x402F - aExp; 6351 if ( shiftCount <= 0 ) { 6352 if ( 0x403E < aExp ) { 6353 float_raise(float_flag_invalid, status); 6354 if ( ! aSign 6355 || ( ( aExp == 0x7FFF ) 6356 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 6357 ) 6358 ) { 6359 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6360 } 6361 return (int64_t) LIT64( 0x8000000000000000 ); 6362 } 6363 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 6364 } 6365 else { 6366 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 6367 } 6368 return roundAndPackInt64(aSign, aSig0, aSig1, status); 6369 6370 } 6371 6372 /*---------------------------------------------------------------------------- 6373 | Returns the result of converting the quadruple-precision floating-point 6374 | value `a' to the 64-bit two's complement integer format. The conversion 6375 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6376 | Arithmetic, except that the conversion is always rounded toward zero. 6377 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 6378 | the conversion overflows, the largest integer with the same sign as `a' is 6379 | returned. 
6380 *----------------------------------------------------------------------------*/ 6381 6382 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 6383 { 6384 flag aSign; 6385 int32_t aExp, shiftCount; 6386 uint64_t aSig0, aSig1; 6387 int64_t z; 6388 6389 aSig1 = extractFloat128Frac1( a ); 6390 aSig0 = extractFloat128Frac0( a ); 6391 aExp = extractFloat128Exp( a ); 6392 aSign = extractFloat128Sign( a ); 6393 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6394 shiftCount = aExp - 0x402F; 6395 if ( 0 < shiftCount ) { 6396 if ( 0x403E <= aExp ) { 6397 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 6398 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 6399 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 6400 if (aSig1) { 6401 status->float_exception_flags |= float_flag_inexact; 6402 } 6403 } 6404 else { 6405 float_raise(float_flag_invalid, status); 6406 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 6407 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6408 } 6409 } 6410 return (int64_t) LIT64( 0x8000000000000000 ); 6411 } 6412 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 6413 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 6414 status->float_exception_flags |= float_flag_inexact; 6415 } 6416 } 6417 else { 6418 if ( aExp < 0x3FFF ) { 6419 if ( aExp | aSig0 | aSig1 ) { 6420 status->float_exception_flags |= float_flag_inexact; 6421 } 6422 return 0; 6423 } 6424 z = aSig0>>( - shiftCount ); 6425 if ( aSig1 6426 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 6427 status->float_exception_flags |= float_flag_inexact; 6428 } 6429 } 6430 if ( aSign ) z = - z; 6431 return z; 6432 6433 } 6434 6435 /*---------------------------------------------------------------------------- 6436 | Returns the result of converting the quadruple-precision floating-point value 6437 | `a' to the 64-bit unsigned integer format. 
The conversion is 6438 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6439 | Arithmetic---which means in particular that the conversion is rounded 6440 | according to the current rounding mode. If `a' is a NaN, the largest 6441 | positive integer is returned. If the conversion overflows, the 6442 | largest unsigned integer is returned. If 'a' is negative, the value is 6443 | rounded and zero is returned; negative values that do not round to zero 6444 | will raise the inexact exception. 6445 *----------------------------------------------------------------------------*/ 6446 6447 uint64_t float128_to_uint64(float128 a, float_status *status) 6448 { 6449 flag aSign; 6450 int aExp; 6451 int shiftCount; 6452 uint64_t aSig0, aSig1; 6453 6454 aSig0 = extractFloat128Frac0(a); 6455 aSig1 = extractFloat128Frac1(a); 6456 aExp = extractFloat128Exp(a); 6457 aSign = extractFloat128Sign(a); 6458 if (aSign && (aExp > 0x3FFE)) { 6459 float_raise(float_flag_invalid, status); 6460 if (float128_is_any_nan(a)) { 6461 return LIT64(0xFFFFFFFFFFFFFFFF); 6462 } else { 6463 return 0; 6464 } 6465 } 6466 if (aExp) { 6467 aSig0 |= LIT64(0x0001000000000000); 6468 } 6469 shiftCount = 0x402F - aExp; 6470 if (shiftCount <= 0) { 6471 if (0x403E < aExp) { 6472 float_raise(float_flag_invalid, status); 6473 return LIT64(0xFFFFFFFFFFFFFFFF); 6474 } 6475 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 6476 } else { 6477 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 6478 } 6479 return roundAndPackUint64(aSign, aSig0, aSig1, status); 6480 } 6481 6482 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 6483 { 6484 uint64_t v; 6485 signed char current_rounding_mode = status->float_rounding_mode; 6486 6487 set_float_rounding_mode(float_round_to_zero, status); 6488 v = float128_to_uint64(a, status); 6489 set_float_rounding_mode(current_rounding_mode, status); 6490 6491 return v; 6492 } 6493 6494 
/*---------------------------------------------------------------------------- 6495 | Returns the result of converting the quadruple-precision floating-point 6496 | value `a' to the 32-bit unsigned integer format. The conversion 6497 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6498 | Arithmetic except that the conversion is always rounded toward zero. 6499 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6500 | if the conversion overflows, the largest unsigned integer is returned. 6501 | If 'a' is negative, the value is rounded and zero is returned; negative 6502 | values that do not round to zero will raise the inexact exception. 6503 *----------------------------------------------------------------------------*/ 6504 6505 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6506 { 6507 uint64_t v; 6508 uint32_t res; 6509 int old_exc_flags = get_float_exception_flags(status); 6510 6511 v = float128_to_uint64_round_to_zero(a, status); 6512 if (v > 0xffffffff) { 6513 res = 0xffffffff; 6514 } else { 6515 return v; 6516 } 6517 set_float_exception_flags(old_exc_flags, status); 6518 float_raise(float_flag_invalid, status); 6519 return res; 6520 } 6521 6522 /*---------------------------------------------------------------------------- 6523 | Returns the result of converting the quadruple-precision floating-point 6524 | value `a' to the single-precision floating-point format. The conversion 6525 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6526 | Arithmetic. 
6527 *----------------------------------------------------------------------------*/ 6528 6529 float32 float128_to_float32(float128 a, float_status *status) 6530 { 6531 flag aSign; 6532 int32_t aExp; 6533 uint64_t aSig0, aSig1; 6534 uint32_t zSig; 6535 6536 aSig1 = extractFloat128Frac1( a ); 6537 aSig0 = extractFloat128Frac0( a ); 6538 aExp = extractFloat128Exp( a ); 6539 aSign = extractFloat128Sign( a ); 6540 if ( aExp == 0x7FFF ) { 6541 if ( aSig0 | aSig1 ) { 6542 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6543 } 6544 return packFloat32( aSign, 0xFF, 0 ); 6545 } 6546 aSig0 |= ( aSig1 != 0 ); 6547 shift64RightJamming( aSig0, 18, &aSig0 ); 6548 zSig = aSig0; 6549 if ( aExp || zSig ) { 6550 zSig |= 0x40000000; 6551 aExp -= 0x3F81; 6552 } 6553 return roundAndPackFloat32(aSign, aExp, zSig, status); 6554 6555 } 6556 6557 /*---------------------------------------------------------------------------- 6558 | Returns the result of converting the quadruple-precision floating-point 6559 | value `a' to the double-precision floating-point format. The conversion 6560 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6561 | Arithmetic. 
6562 *----------------------------------------------------------------------------*/ 6563 6564 float64 float128_to_float64(float128 a, float_status *status) 6565 { 6566 flag aSign; 6567 int32_t aExp; 6568 uint64_t aSig0, aSig1; 6569 6570 aSig1 = extractFloat128Frac1( a ); 6571 aSig0 = extractFloat128Frac0( a ); 6572 aExp = extractFloat128Exp( a ); 6573 aSign = extractFloat128Sign( a ); 6574 if ( aExp == 0x7FFF ) { 6575 if ( aSig0 | aSig1 ) { 6576 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6577 } 6578 return packFloat64( aSign, 0x7FF, 0 ); 6579 } 6580 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6581 aSig0 |= ( aSig1 != 0 ); 6582 if ( aExp || aSig0 ) { 6583 aSig0 |= LIT64( 0x4000000000000000 ); 6584 aExp -= 0x3C01; 6585 } 6586 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6587 6588 } 6589 6590 /*---------------------------------------------------------------------------- 6591 | Returns the result of converting the quadruple-precision floating-point 6592 | value `a' to the extended double-precision floating-point format. The 6593 | conversion is performed according to the IEC/IEEE Standard for Binary 6594 | Floating-Point Arithmetic. 
6595 *----------------------------------------------------------------------------*/ 6596 6597 floatx80 float128_to_floatx80(float128 a, float_status *status) 6598 { 6599 flag aSign; 6600 int32_t aExp; 6601 uint64_t aSig0, aSig1; 6602 6603 aSig1 = extractFloat128Frac1( a ); 6604 aSig0 = extractFloat128Frac0( a ); 6605 aExp = extractFloat128Exp( a ); 6606 aSign = extractFloat128Sign( a ); 6607 if ( aExp == 0x7FFF ) { 6608 if ( aSig0 | aSig1 ) { 6609 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 6610 } 6611 return packFloatx80(aSign, floatx80_infinity_high, 6612 floatx80_infinity_low); 6613 } 6614 if ( aExp == 0 ) { 6615 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6616 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6617 } 6618 else { 6619 aSig0 |= LIT64( 0x0001000000000000 ); 6620 } 6621 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6622 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6623 6624 } 6625 6626 /*---------------------------------------------------------------------------- 6627 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6628 | returns the result as a quadruple-precision floating-point value. The 6629 | operation is performed according to the IEC/IEEE Standard for Binary 6630 | Floating-Point Arithmetic. 
6631 *----------------------------------------------------------------------------*/ 6632 6633 float128 float128_round_to_int(float128 a, float_status *status) 6634 { 6635 flag aSign; 6636 int32_t aExp; 6637 uint64_t lastBitMask, roundBitsMask; 6638 float128 z; 6639 6640 aExp = extractFloat128Exp( a ); 6641 if ( 0x402F <= aExp ) { 6642 if ( 0x406F <= aExp ) { 6643 if ( ( aExp == 0x7FFF ) 6644 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 6645 ) { 6646 return propagateFloat128NaN(a, a, status); 6647 } 6648 return a; 6649 } 6650 lastBitMask = 1; 6651 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 6652 roundBitsMask = lastBitMask - 1; 6653 z = a; 6654 switch (status->float_rounding_mode) { 6655 case float_round_nearest_even: 6656 if ( lastBitMask ) { 6657 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 6658 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 6659 } 6660 else { 6661 if ( (int64_t) z.low < 0 ) { 6662 ++z.high; 6663 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 6664 } 6665 } 6666 break; 6667 case float_round_ties_away: 6668 if (lastBitMask) { 6669 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 6670 } else { 6671 if ((int64_t) z.low < 0) { 6672 ++z.high; 6673 } 6674 } 6675 break; 6676 case float_round_to_zero: 6677 break; 6678 case float_round_up: 6679 if (!extractFloat128Sign(z)) { 6680 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6681 } 6682 break; 6683 case float_round_down: 6684 if (extractFloat128Sign(z)) { 6685 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6686 } 6687 break; 6688 default: 6689 abort(); 6690 } 6691 z.low &= ~ roundBitsMask; 6692 } 6693 else { 6694 if ( aExp < 0x3FFF ) { 6695 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 6696 status->float_exception_flags |= float_flag_inexact; 6697 aSign = extractFloat128Sign( a ); 6698 switch (status->float_rounding_mode) { 6699 case float_round_nearest_even: 6700 if ( ( aExp == 0x3FFE ) 6701 && ( 
extractFloat128Frac0( a ) 6702 | extractFloat128Frac1( a ) ) 6703 ) { 6704 return packFloat128( aSign, 0x3FFF, 0, 0 ); 6705 } 6706 break; 6707 case float_round_ties_away: 6708 if (aExp == 0x3FFE) { 6709 return packFloat128(aSign, 0x3FFF, 0, 0); 6710 } 6711 break; 6712 case float_round_down: 6713 return 6714 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 6715 : packFloat128( 0, 0, 0, 0 ); 6716 case float_round_up: 6717 return 6718 aSign ? packFloat128( 1, 0, 0, 0 ) 6719 : packFloat128( 0, 0x3FFF, 0, 0 ); 6720 } 6721 return packFloat128( aSign, 0, 0, 0 ); 6722 } 6723 lastBitMask = 1; 6724 lastBitMask <<= 0x402F - aExp; 6725 roundBitsMask = lastBitMask - 1; 6726 z.low = 0; 6727 z.high = a.high; 6728 switch (status->float_rounding_mode) { 6729 case float_round_nearest_even: 6730 z.high += lastBitMask>>1; 6731 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 6732 z.high &= ~ lastBitMask; 6733 } 6734 break; 6735 case float_round_ties_away: 6736 z.high += lastBitMask>>1; 6737 break; 6738 case float_round_to_zero: 6739 break; 6740 case float_round_up: 6741 if (!extractFloat128Sign(z)) { 6742 z.high |= ( a.low != 0 ); 6743 z.high += roundBitsMask; 6744 } 6745 break; 6746 case float_round_down: 6747 if (extractFloat128Sign(z)) { 6748 z.high |= (a.low != 0); 6749 z.high += roundBitsMask; 6750 } 6751 break; 6752 default: 6753 abort(); 6754 } 6755 z.high &= ~ roundBitsMask; 6756 } 6757 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 6758 status->float_exception_flags |= float_flag_inexact; 6759 } 6760 return z; 6761 6762 } 6763 6764 /*---------------------------------------------------------------------------- 6765 | Returns the result of adding the absolute values of the quadruple-precision 6766 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 6767 | before being returned. `zSign' is ignored if the result is a NaN. 6768 | The addition is performed according to the IEC/IEEE Standard for Binary 6769 | Floating-Point Arithmetic. 
6770 *----------------------------------------------------------------------------*/ 6771 6772 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign, 6773 float_status *status) 6774 { 6775 int32_t aExp, bExp, zExp; 6776 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6777 int32_t expDiff; 6778 6779 aSig1 = extractFloat128Frac1( a ); 6780 aSig0 = extractFloat128Frac0( a ); 6781 aExp = extractFloat128Exp( a ); 6782 bSig1 = extractFloat128Frac1( b ); 6783 bSig0 = extractFloat128Frac0( b ); 6784 bExp = extractFloat128Exp( b ); 6785 expDiff = aExp - bExp; 6786 if ( 0 < expDiff ) { 6787 if ( aExp == 0x7FFF ) { 6788 if (aSig0 | aSig1) { 6789 return propagateFloat128NaN(a, b, status); 6790 } 6791 return a; 6792 } 6793 if ( bExp == 0 ) { 6794 --expDiff; 6795 } 6796 else { 6797 bSig0 |= LIT64( 0x0001000000000000 ); 6798 } 6799 shift128ExtraRightJamming( 6800 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 6801 zExp = aExp; 6802 } 6803 else if ( expDiff < 0 ) { 6804 if ( bExp == 0x7FFF ) { 6805 if (bSig0 | bSig1) { 6806 return propagateFloat128NaN(a, b, status); 6807 } 6808 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6809 } 6810 if ( aExp == 0 ) { 6811 ++expDiff; 6812 } 6813 else { 6814 aSig0 |= LIT64( 0x0001000000000000 ); 6815 } 6816 shift128ExtraRightJamming( 6817 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 6818 zExp = bExp; 6819 } 6820 else { 6821 if ( aExp == 0x7FFF ) { 6822 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6823 return propagateFloat128NaN(a, b, status); 6824 } 6825 return a; 6826 } 6827 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6828 if ( aExp == 0 ) { 6829 if (status->flush_to_zero) { 6830 if (zSig0 | zSig1) { 6831 float_raise(float_flag_output_denormal, status); 6832 } 6833 return packFloat128(zSign, 0, 0, 0); 6834 } 6835 return packFloat128( zSign, 0, zSig0, zSig1 ); 6836 } 6837 zSig2 = 0; 6838 zSig0 |= LIT64( 0x0002000000000000 ); 6839 zExp = aExp; 6840 goto shiftRight1; 6841 } 6842 aSig0 |= LIT64( 
0x0001000000000000 ); 6843 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6844 --zExp; 6845 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 6846 ++zExp; 6847 shiftRight1: 6848 shift128ExtraRightJamming( 6849 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6850 roundAndPack: 6851 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6852 6853 } 6854 6855 /*---------------------------------------------------------------------------- 6856 | Returns the result of subtracting the absolute values of the quadruple- 6857 | precision floating-point values `a' and `b'. If `zSign' is 1, the 6858 | difference is negated before being returned. `zSign' is ignored if the 6859 | result is a NaN. The subtraction is performed according to the IEC/IEEE 6860 | Standard for Binary Floating-Point Arithmetic. 6861 *----------------------------------------------------------------------------*/ 6862 6863 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign, 6864 float_status *status) 6865 { 6866 int32_t aExp, bExp, zExp; 6867 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 6868 int32_t expDiff; 6869 6870 aSig1 = extractFloat128Frac1( a ); 6871 aSig0 = extractFloat128Frac0( a ); 6872 aExp = extractFloat128Exp( a ); 6873 bSig1 = extractFloat128Frac1( b ); 6874 bSig0 = extractFloat128Frac0( b ); 6875 bExp = extractFloat128Exp( b ); 6876 expDiff = aExp - bExp; 6877 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6878 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 6879 if ( 0 < expDiff ) goto aExpBigger; 6880 if ( expDiff < 0 ) goto bExpBigger; 6881 if ( aExp == 0x7FFF ) { 6882 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6883 return propagateFloat128NaN(a, b, status); 6884 } 6885 float_raise(float_flag_invalid, status); 6886 return float128_default_nan(status); 6887 } 6888 if ( aExp == 0 ) { 6889 aExp = 1; 6890 bExp = 1; 6891 } 6892 if ( bSig0 < aSig0 ) goto aBigger; 6893 if ( aSig0 < bSig0 ) goto bBigger; 6894 if ( bSig1 < 
aSig1 ) goto aBigger; 6895 if ( aSig1 < bSig1 ) goto bBigger; 6896 return packFloat128(status->float_rounding_mode == float_round_down, 6897 0, 0, 0); 6898 bExpBigger: 6899 if ( bExp == 0x7FFF ) { 6900 if (bSig0 | bSig1) { 6901 return propagateFloat128NaN(a, b, status); 6902 } 6903 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 6904 } 6905 if ( aExp == 0 ) { 6906 ++expDiff; 6907 } 6908 else { 6909 aSig0 |= LIT64( 0x4000000000000000 ); 6910 } 6911 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6912 bSig0 |= LIT64( 0x4000000000000000 ); 6913 bBigger: 6914 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6915 zExp = bExp; 6916 zSign ^= 1; 6917 goto normalizeRoundAndPack; 6918 aExpBigger: 6919 if ( aExp == 0x7FFF ) { 6920 if (aSig0 | aSig1) { 6921 return propagateFloat128NaN(a, b, status); 6922 } 6923 return a; 6924 } 6925 if ( bExp == 0 ) { 6926 --expDiff; 6927 } 6928 else { 6929 bSig0 |= LIT64( 0x4000000000000000 ); 6930 } 6931 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 6932 aSig0 |= LIT64( 0x4000000000000000 ); 6933 aBigger: 6934 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6935 zExp = aExp; 6936 normalizeRoundAndPack: 6937 --zExp; 6938 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, 6939 status); 6940 6941 } 6942 6943 /*---------------------------------------------------------------------------- 6944 | Returns the result of adding the quadruple-precision floating-point values 6945 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 6946 | for Binary Floating-Point Arithmetic. 
6947 *----------------------------------------------------------------------------*/ 6948 6949 float128 float128_add(float128 a, float128 b, float_status *status) 6950 { 6951 flag aSign, bSign; 6952 6953 aSign = extractFloat128Sign( a ); 6954 bSign = extractFloat128Sign( b ); 6955 if ( aSign == bSign ) { 6956 return addFloat128Sigs(a, b, aSign, status); 6957 } 6958 else { 6959 return subFloat128Sigs(a, b, aSign, status); 6960 } 6961 6962 } 6963 6964 /*---------------------------------------------------------------------------- 6965 | Returns the result of subtracting the quadruple-precision floating-point 6966 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6967 | Standard for Binary Floating-Point Arithmetic. 6968 *----------------------------------------------------------------------------*/ 6969 6970 float128 float128_sub(float128 a, float128 b, float_status *status) 6971 { 6972 flag aSign, bSign; 6973 6974 aSign = extractFloat128Sign( a ); 6975 bSign = extractFloat128Sign( b ); 6976 if ( aSign == bSign ) { 6977 return subFloat128Sigs(a, b, aSign, status); 6978 } 6979 else { 6980 return addFloat128Sigs(a, b, aSign, status); 6981 } 6982 6983 } 6984 6985 /*---------------------------------------------------------------------------- 6986 | Returns the result of multiplying the quadruple-precision floating-point 6987 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6988 | Standard for Binary Floating-Point Arithmetic. 
6989 *----------------------------------------------------------------------------*/ 6990 6991 float128 float128_mul(float128 a, float128 b, float_status *status) 6992 { 6993 flag aSign, bSign, zSign; 6994 int32_t aExp, bExp, zExp; 6995 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 6996 6997 aSig1 = extractFloat128Frac1( a ); 6998 aSig0 = extractFloat128Frac0( a ); 6999 aExp = extractFloat128Exp( a ); 7000 aSign = extractFloat128Sign( a ); 7001 bSig1 = extractFloat128Frac1( b ); 7002 bSig0 = extractFloat128Frac0( b ); 7003 bExp = extractFloat128Exp( b ); 7004 bSign = extractFloat128Sign( b ); 7005 zSign = aSign ^ bSign; 7006 if ( aExp == 0x7FFF ) { 7007 if ( ( aSig0 | aSig1 ) 7008 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 7009 return propagateFloat128NaN(a, b, status); 7010 } 7011 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 7012 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7013 } 7014 if ( bExp == 0x7FFF ) { 7015 if (bSig0 | bSig1) { 7016 return propagateFloat128NaN(a, b, status); 7017 } 7018 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 7019 invalid: 7020 float_raise(float_flag_invalid, status); 7021 return float128_default_nan(status); 7022 } 7023 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7024 } 7025 if ( aExp == 0 ) { 7026 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 7027 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7028 } 7029 if ( bExp == 0 ) { 7030 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 7031 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 7032 } 7033 zExp = aExp + bExp - 0x4000; 7034 aSig0 |= LIT64( 0x0001000000000000 ); 7035 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 7036 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 7037 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 7038 zSig2 |= ( zSig3 != 0 ); 7039 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 7040 shift128ExtraRightJamming( 7041 
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 7042 ++zExp; 7043 } 7044 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 7045 7046 } 7047 7048 /*---------------------------------------------------------------------------- 7049 | Returns the result of dividing the quadruple-precision floating-point value 7050 | `a' by the corresponding value `b'. The operation is performed according to 7051 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7052 *----------------------------------------------------------------------------*/ 7053 7054 float128 float128_div(float128 a, float128 b, float_status *status) 7055 { 7056 flag aSign, bSign, zSign; 7057 int32_t aExp, bExp, zExp; 7058 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 7059 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 7060 7061 aSig1 = extractFloat128Frac1( a ); 7062 aSig0 = extractFloat128Frac0( a ); 7063 aExp = extractFloat128Exp( a ); 7064 aSign = extractFloat128Sign( a ); 7065 bSig1 = extractFloat128Frac1( b ); 7066 bSig0 = extractFloat128Frac0( b ); 7067 bExp = extractFloat128Exp( b ); 7068 bSign = extractFloat128Sign( b ); 7069 zSign = aSign ^ bSign; 7070 if ( aExp == 0x7FFF ) { 7071 if (aSig0 | aSig1) { 7072 return propagateFloat128NaN(a, b, status); 7073 } 7074 if ( bExp == 0x7FFF ) { 7075 if (bSig0 | bSig1) { 7076 return propagateFloat128NaN(a, b, status); 7077 } 7078 goto invalid; 7079 } 7080 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7081 } 7082 if ( bExp == 0x7FFF ) { 7083 if (bSig0 | bSig1) { 7084 return propagateFloat128NaN(a, b, status); 7085 } 7086 return packFloat128( zSign, 0, 0, 0 ); 7087 } 7088 if ( bExp == 0 ) { 7089 if ( ( bSig0 | bSig1 ) == 0 ) { 7090 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 7091 invalid: 7092 float_raise(float_flag_invalid, status); 7093 return float128_default_nan(status); 7094 } 7095 float_raise(float_flag_divbyzero, status); 7096 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7097 } 7098 normalizeFloat128Subnormal( 
bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 7099 } 7100 if ( aExp == 0 ) { 7101 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 7102 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7103 } 7104 zExp = aExp - bExp + 0x3FFD; 7105 shortShift128Left( 7106 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 7107 shortShift128Left( 7108 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 7109 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 7110 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 7111 ++zExp; 7112 } 7113 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7114 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 7115 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 7116 while ( (int64_t) rem0 < 0 ) { 7117 --zSig0; 7118 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 7119 } 7120 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 7121 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 7122 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 7123 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 7124 while ( (int64_t) rem1 < 0 ) { 7125 --zSig1; 7126 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 7127 } 7128 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 7129 } 7130 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 7131 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 7132 7133 } 7134 7135 /*---------------------------------------------------------------------------- 7136 | Returns the remainder of the quadruple-precision floating-point value `a' 7137 | with respect to the corresponding value `b'. The operation is performed 7138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7139 *----------------------------------------------------------------------------*/ 7140 7141 float128 float128_rem(float128 a, float128 b, float_status *status) 7142 { 7143 flag aSign, zSign; 7144 int32_t aExp, bExp, expDiff; 7145 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 7146 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 7147 int64_t sigMean0; 7148 7149 aSig1 = extractFloat128Frac1( a ); 7150 aSig0 = extractFloat128Frac0( a ); 7151 aExp = extractFloat128Exp( a ); 7152 aSign = extractFloat128Sign( a ); 7153 bSig1 = extractFloat128Frac1( b ); 7154 bSig0 = extractFloat128Frac0( b ); 7155 bExp = extractFloat128Exp( b ); 7156 if ( aExp == 0x7FFF ) { 7157 if ( ( aSig0 | aSig1 ) 7158 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 7159 return propagateFloat128NaN(a, b, status); 7160 } 7161 goto invalid; 7162 } 7163 if ( bExp == 0x7FFF ) { 7164 if (bSig0 | bSig1) { 7165 return propagateFloat128NaN(a, b, status); 7166 } 7167 return a; 7168 } 7169 if ( bExp == 0 ) { 7170 if ( ( bSig0 | bSig1 ) == 0 ) { 7171 invalid: 7172 float_raise(float_flag_invalid, status); 7173 return float128_default_nan(status); 7174 } 7175 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 7176 } 7177 if ( aExp == 0 ) { 7178 if ( ( aSig0 | aSig1 ) == 0 ) return a; 7179 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7180 } 7181 expDiff = aExp - bExp; 7182 if ( expDiff < -1 ) return a; 7183 shortShift128Left( 7184 aSig0 | LIT64( 0x0001000000000000 ), 7185 aSig1, 7186 15 - ( expDiff < 0 ), 7187 &aSig0, 7188 &aSig1 7189 ); 7190 shortShift128Left( 7191 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 7192 q = le128( bSig0, bSig1, aSig0, aSig1 ); 7193 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 7194 expDiff -= 64; 7195 while ( 0 < expDiff ) { 7196 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7197 q = ( 4 < q ) ? 
q - 4 : 0; 7198 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 7199 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 7200 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 7201 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 7202 expDiff -= 61; 7203 } 7204 if ( -64 < expDiff ) { 7205 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7206 q = ( 4 < q ) ? q - 4 : 0; 7207 q >>= - expDiff; 7208 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 7209 expDiff += 52; 7210 if ( expDiff < 0 ) { 7211 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 7212 } 7213 else { 7214 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 7215 } 7216 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 7217 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 7218 } 7219 else { 7220 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 7221 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 7222 } 7223 do { 7224 alternateASig0 = aSig0; 7225 alternateASig1 = aSig1; 7226 ++q; 7227 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 7228 } while ( 0 <= (int64_t) aSig0 ); 7229 add128( 7230 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 7231 if ( ( sigMean0 < 0 ) 7232 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 7233 aSig0 = alternateASig0; 7234 aSig1 = alternateASig1; 7235 } 7236 zSign = ( (int64_t) aSig0 < 0 ); 7237 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 7238 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 7239 status); 7240 } 7241 7242 /*---------------------------------------------------------------------------- 7243 | Returns the square root of the quadruple-precision floating-point value `a'. 7244 | The operation is performed according to the IEC/IEEE Standard for Binary 7245 | Floating-Point Arithmetic. 
7246 *----------------------------------------------------------------------------*/ 7247 7248 float128 float128_sqrt(float128 a, float_status *status) 7249 { 7250 flag aSign; 7251 int32_t aExp, zExp; 7252 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 7253 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 7254 7255 aSig1 = extractFloat128Frac1( a ); 7256 aSig0 = extractFloat128Frac0( a ); 7257 aExp = extractFloat128Exp( a ); 7258 aSign = extractFloat128Sign( a ); 7259 if ( aExp == 0x7FFF ) { 7260 if (aSig0 | aSig1) { 7261 return propagateFloat128NaN(a, a, status); 7262 } 7263 if ( ! aSign ) return a; 7264 goto invalid; 7265 } 7266 if ( aSign ) { 7267 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 7268 invalid: 7269 float_raise(float_flag_invalid, status); 7270 return float128_default_nan(status); 7271 } 7272 if ( aExp == 0 ) { 7273 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 7274 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7275 } 7276 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 7277 aSig0 |= LIT64( 0x0001000000000000 ); 7278 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 7279 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 7280 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 7281 doubleZSig0 = zSig0<<1; 7282 mul64To128( zSig0, zSig0, &term0, &term1 ); 7283 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 7284 while ( (int64_t) rem0 < 0 ) { 7285 --zSig0; 7286 doubleZSig0 -= 2; 7287 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 7288 } 7289 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 7290 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 7291 if ( zSig1 == 0 ) zSig1 = 1; 7292 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 7293 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 7294 mul64To128( zSig1, zSig1, &term2, &term3 ); 7295 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 7296 while ( (int64_t) rem1 < 0 ) { 7297 --zSig1; 7298 
shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 7299 term3 |= 1; 7300 term2 |= doubleZSig0; 7301 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 7302 } 7303 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 7304 } 7305 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 7306 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 7307 7308 } 7309 7310 /*---------------------------------------------------------------------------- 7311 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 7312 | the corresponding value `b', and 0 otherwise. The invalid exception is 7313 | raised if either operand is a NaN. Otherwise, the comparison is performed 7314 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7315 *----------------------------------------------------------------------------*/ 7316 7317 int float128_eq(float128 a, float128 b, float_status *status) 7318 { 7319 7320 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7321 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7322 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7323 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7324 ) { 7325 float_raise(float_flag_invalid, status); 7326 return 0; 7327 } 7328 return 7329 ( a.low == b.low ) 7330 && ( ( a.high == b.high ) 7331 || ( ( a.low == 0 ) 7332 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 7333 ); 7334 7335 } 7336 7337 /*---------------------------------------------------------------------------- 7338 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7339 | or equal to the corresponding value `b', and 0 otherwise. The invalid 7340 | exception is raised if either operand is a NaN. The comparison is performed 7341 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7342 *----------------------------------------------------------------------------*/ 7343 7344 int float128_le(float128 a, float128 b, float_status *status) 7345 { 7346 flag aSign, bSign; 7347 7348 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7349 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7350 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7351 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7352 ) { 7353 float_raise(float_flag_invalid, status); 7354 return 0; 7355 } 7356 aSign = extractFloat128Sign( a ); 7357 bSign = extractFloat128Sign( b ); 7358 if ( aSign != bSign ) { 7359 return 7360 aSign 7361 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7362 == 0 ); 7363 } 7364 return 7365 aSign ? le128( b.high, b.low, a.high, a.low ) 7366 : le128( a.high, a.low, b.high, b.low ); 7367 7368 } 7369 7370 /*---------------------------------------------------------------------------- 7371 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7372 | the corresponding value `b', and 0 otherwise. The invalid exception is 7373 | raised if either operand is a NaN. The comparison is performed according 7374 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7375 *----------------------------------------------------------------------------*/ 7376 7377 int float128_lt(float128 a, float128 b, float_status *status) 7378 { 7379 flag aSign, bSign; 7380 7381 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7382 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7383 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7384 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7385 ) { 7386 float_raise(float_flag_invalid, status); 7387 return 0; 7388 } 7389 aSign = extractFloat128Sign( a ); 7390 bSign = extractFloat128Sign( b ); 7391 if ( aSign != bSign ) { 7392 return 7393 aSign 7394 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7395 != 0 ); 7396 } 7397 return 7398 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 7399 : lt128( a.high, a.low, b.high, b.low ); 7400 7401 } 7402 7403 /*---------------------------------------------------------------------------- 7404 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7405 | be compared, and 0 otherwise. The invalid exception is raised if either 7406 | operand is a NaN. The comparison is performed according to the IEC/IEEE 7407 | Standard for Binary Floating-Point Arithmetic. 7408 *----------------------------------------------------------------------------*/ 7409 7410 int float128_unordered(float128 a, float128 b, float_status *status) 7411 { 7412 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7413 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7414 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7415 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7416 ) { 7417 float_raise(float_flag_invalid, status); 7418 return 1; 7419 } 7420 return 0; 7421 } 7422 7423 /*---------------------------------------------------------------------------- 7424 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 7425 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7426 | exception. The comparison is performed according to the IEC/IEEE Standard 7427 | for Binary Floating-Point Arithmetic. 
7428 *----------------------------------------------------------------------------*/ 7429 7430 int float128_eq_quiet(float128 a, float128 b, float_status *status) 7431 { 7432 7433 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7434 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7435 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7436 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7437 ) { 7438 if (float128_is_signaling_nan(a, status) 7439 || float128_is_signaling_nan(b, status)) { 7440 float_raise(float_flag_invalid, status); 7441 } 7442 return 0; 7443 } 7444 return 7445 ( a.low == b.low ) 7446 && ( ( a.high == b.high ) 7447 || ( ( a.low == 0 ) 7448 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 7449 ); 7450 7451 } 7452 7453 /*---------------------------------------------------------------------------- 7454 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7455 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 7456 | cause an exception. Otherwise, the comparison is performed according to the 7457 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7458 *----------------------------------------------------------------------------*/ 7459 7460 int float128_le_quiet(float128 a, float128 b, float_status *status) 7461 { 7462 flag aSign, bSign; 7463 7464 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7465 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7466 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7467 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7468 ) { 7469 if (float128_is_signaling_nan(a, status) 7470 || float128_is_signaling_nan(b, status)) { 7471 float_raise(float_flag_invalid, status); 7472 } 7473 return 0; 7474 } 7475 aSign = extractFloat128Sign( a ); 7476 bSign = extractFloat128Sign( b ); 7477 if ( aSign != bSign ) { 7478 return 7479 aSign 7480 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7481 == 0 ); 7482 } 7483 return 7484 aSign ? le128( b.high, b.low, a.high, a.low ) 7485 : le128( a.high, a.low, b.high, b.low ); 7486 7487 } 7488 7489 /*---------------------------------------------------------------------------- 7490 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7491 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7492 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 7493 | Standard for Binary Floating-Point Arithmetic. 
7494 *----------------------------------------------------------------------------*/ 7495 7496 int float128_lt_quiet(float128 a, float128 b, float_status *status) 7497 { 7498 flag aSign, bSign; 7499 7500 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7501 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7502 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7503 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7504 ) { 7505 if (float128_is_signaling_nan(a, status) 7506 || float128_is_signaling_nan(b, status)) { 7507 float_raise(float_flag_invalid, status); 7508 } 7509 return 0; 7510 } 7511 aSign = extractFloat128Sign( a ); 7512 bSign = extractFloat128Sign( b ); 7513 if ( aSign != bSign ) { 7514 return 7515 aSign 7516 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7517 != 0 ); 7518 } 7519 return 7520 aSign ? lt128( b.high, b.low, a.high, a.low ) 7521 : lt128( a.high, a.low, b.high, b.low ); 7522 7523 } 7524 7525 /*---------------------------------------------------------------------------- 7526 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7527 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 7528 | comparison is performed according to the IEC/IEEE Standard for Binary 7529 | Floating-Point Arithmetic. 
7530 *----------------------------------------------------------------------------*/ 7531 7532 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 7533 { 7534 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7535 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7536 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7537 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7538 ) { 7539 if (float128_is_signaling_nan(a, status) 7540 || float128_is_signaling_nan(b, status)) { 7541 float_raise(float_flag_invalid, status); 7542 } 7543 return 1; 7544 } 7545 return 0; 7546 } 7547 7548 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 7549 int is_quiet, float_status *status) 7550 { 7551 flag aSign, bSign; 7552 7553 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 7554 float_raise(float_flag_invalid, status); 7555 return float_relation_unordered; 7556 } 7557 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 7558 ( extractFloatx80Frac( a )<<1 ) ) || 7559 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 7560 ( extractFloatx80Frac( b )<<1 ) )) { 7561 if (!is_quiet || 7562 floatx80_is_signaling_nan(a, status) || 7563 floatx80_is_signaling_nan(b, status)) { 7564 float_raise(float_flag_invalid, status); 7565 } 7566 return float_relation_unordered; 7567 } 7568 aSign = extractFloatx80Sign( a ); 7569 bSign = extractFloatx80Sign( b ); 7570 if ( aSign != bSign ) { 7571 7572 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 7573 ( ( a.low | b.low ) == 0 ) ) { 7574 /* zero case */ 7575 return float_relation_equal; 7576 } else { 7577 return 1 - (2 * aSign); 7578 } 7579 } else { 7580 if (a.low == b.low && a.high == b.high) { 7581 return float_relation_equal; 7582 } else { 7583 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7584 } 7585 } 7586 } 7587 7588 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7589 { 7590 return floatx80_compare_internal(a, b, 0, status); 7591 } 7592 
7593 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 7594 { 7595 return floatx80_compare_internal(a, b, 1, status); 7596 } 7597 7598 static inline int float128_compare_internal(float128 a, float128 b, 7599 int is_quiet, float_status *status) 7600 { 7601 flag aSign, bSign; 7602 7603 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7604 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7605 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7606 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7607 if (!is_quiet || 7608 float128_is_signaling_nan(a, status) || 7609 float128_is_signaling_nan(b, status)) { 7610 float_raise(float_flag_invalid, status); 7611 } 7612 return float_relation_unordered; 7613 } 7614 aSign = extractFloat128Sign( a ); 7615 bSign = extractFloat128Sign( b ); 7616 if ( aSign != bSign ) { 7617 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7618 /* zero case */ 7619 return float_relation_equal; 7620 } else { 7621 return 1 - (2 * aSign); 7622 } 7623 } else { 7624 if (a.low == b.low && a.high == b.high) { 7625 return float_relation_equal; 7626 } else { 7627 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7628 } 7629 } 7630 } 7631 7632 int float128_compare(float128 a, float128 b, float_status *status) 7633 { 7634 return float128_compare_internal(a, b, 0, status); 7635 } 7636 7637 int float128_compare_quiet(float128 a, float128 b, float_status *status) 7638 { 7639 return float128_compare_internal(a, b, 1, status); 7640 } 7641 7642 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7643 { 7644 flag aSign; 7645 int32_t aExp; 7646 uint64_t aSig; 7647 7648 if (floatx80_invalid_encoding(a)) { 7649 float_raise(float_flag_invalid, status); 7650 return floatx80_default_nan(status); 7651 } 7652 aSig = extractFloatx80Frac( a ); 7653 aExp = extractFloatx80Exp( a ); 7654 aSign = extractFloatx80Sign( a ); 7655 7656 if ( aExp == 0x7FFF ) { 7657 if ( aSig<<1 ) { 7658 return 
propagateFloatx80NaN(a, a, status); 7659 } 7660 return a; 7661 } 7662 7663 if (aExp == 0) { 7664 if (aSig == 0) { 7665 return a; 7666 } 7667 aExp++; 7668 } 7669 7670 if (n > 0x10000) { 7671 n = 0x10000; 7672 } else if (n < -0x10000) { 7673 n = -0x10000; 7674 } 7675 7676 aExp += n; 7677 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7678 aSign, aExp, aSig, 0, status); 7679 } 7680 7681 float128 float128_scalbn(float128 a, int n, float_status *status) 7682 { 7683 flag aSign; 7684 int32_t aExp; 7685 uint64_t aSig0, aSig1; 7686 7687 aSig1 = extractFloat128Frac1( a ); 7688 aSig0 = extractFloat128Frac0( a ); 7689 aExp = extractFloat128Exp( a ); 7690 aSign = extractFloat128Sign( a ); 7691 if ( aExp == 0x7FFF ) { 7692 if ( aSig0 | aSig1 ) { 7693 return propagateFloat128NaN(a, a, status); 7694 } 7695 return a; 7696 } 7697 if (aExp != 0) { 7698 aSig0 |= LIT64( 0x0001000000000000 ); 7699 } else if (aSig0 == 0 && aSig1 == 0) { 7700 return a; 7701 } else { 7702 aExp++; 7703 } 7704 7705 if (n > 0x10000) { 7706 n = 0x10000; 7707 } else if (n < -0x10000) { 7708 n = -0x10000; 7709 } 7710 7711 aExp += n - 1; 7712 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7713 , status); 7714 7715 } 7716