1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include <math.h> 87 #include "qemu/bitops.h" 88 #include "fpu/softfloat.h" 89 90 /* We only need stdlib for abort() */ 91 92 /*---------------------------------------------------------------------------- 93 | Primitive arithmetic functions, including multi-word arithmetic, and 94 | division and square root approximations. (Can be specialized to target if 95 | desired.) 96 *----------------------------------------------------------------------------*/ 97 #include "fpu/softfloat-macros.h" 98 99 /* 100 * Hardfloat 101 * 102 * Fast emulation of guest FP instructions is challenging for two reasons. 103 * First, FP instruction semantics are similar but not identical, particularly 104 * when handling NaNs. Second, emulating at reasonable speed the guest FP 105 * exception flags is not trivial: reading the host's flags register with a 106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp], 107 * and trapping on every FP exception is not fast nor pleasant to work with. 108 * 109 * We address these challenges by leveraging the host FPU for a subset of the 110 * operations. 
To do this we expand on the idea presented in this paper: 111 * 112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a 113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615. 114 * 115 * The idea is thus to leverage the host FPU to (1) compute FP operations 116 * and (2) identify whether FP exceptions occurred while avoiding 117 * expensive exception flag register accesses. 118 * 119 * An important optimization shown in the paper is that given that exception 120 * flags are rarely cleared by the guest, we can avoid recomputing some flags. 121 * This is particularly useful for the inexact flag, which is very frequently 122 * raised in floating-point workloads. 123 * 124 * We optimize the code further by deferring to soft-fp whenever FP exception 125 * detection might get hairy. Two examples: (1) when at least one operand is 126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result 127 * and the result is < the minimum normal. 
128 */ 129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \ 130 static inline void name(soft_t *a, float_status *s) \ 131 { \ 132 if (unlikely(soft_t ## _is_denormal(*a))) { \ 133 *a = soft_t ## _set_sign(soft_t ## _zero, \ 134 soft_t ## _is_neg(*a)); \ 135 s->float_exception_flags |= float_flag_input_denormal; \ 136 } \ 137 } 138 139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32) 140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64) 141 #undef GEN_INPUT_FLUSH__NOCHECK 142 143 #define GEN_INPUT_FLUSH1(name, soft_t) \ 144 static inline void name(soft_t *a, float_status *s) \ 145 { \ 146 if (likely(!s->flush_inputs_to_zero)) { \ 147 return; \ 148 } \ 149 soft_t ## _input_flush__nocheck(a, s); \ 150 } 151 152 GEN_INPUT_FLUSH1(float32_input_flush1, float32) 153 GEN_INPUT_FLUSH1(float64_input_flush1, float64) 154 #undef GEN_INPUT_FLUSH1 155 156 #define GEN_INPUT_FLUSH2(name, soft_t) \ 157 static inline void name(soft_t *a, soft_t *b, float_status *s) \ 158 { \ 159 if (likely(!s->flush_inputs_to_zero)) { \ 160 return; \ 161 } \ 162 soft_t ## _input_flush__nocheck(a, s); \ 163 soft_t ## _input_flush__nocheck(b, s); \ 164 } 165 166 GEN_INPUT_FLUSH2(float32_input_flush2, float32) 167 GEN_INPUT_FLUSH2(float64_input_flush2, float64) 168 #undef GEN_INPUT_FLUSH2 169 170 #define GEN_INPUT_FLUSH3(name, soft_t) \ 171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \ 172 { \ 173 if (likely(!s->flush_inputs_to_zero)) { \ 174 return; \ 175 } \ 176 soft_t ## _input_flush__nocheck(a, s); \ 177 soft_t ## _input_flush__nocheck(b, s); \ 178 soft_t ## _input_flush__nocheck(c, s); \ 179 } 180 181 GEN_INPUT_FLUSH3(float32_input_flush3, float32) 182 GEN_INPUT_FLUSH3(float64_input_flush3, float64) 183 #undef GEN_INPUT_FLUSH3 184 185 /* 186 * Choose whether to use fpclassify or float32/64_* primitives in the generated 187 * hardfloat functions. Each combination of number of inputs and float size 188 * gets its own value. 
189 */ 190 #if defined(__x86_64__) 191 # define QEMU_HARDFLOAT_1F32_USE_FP 0 192 # define QEMU_HARDFLOAT_1F64_USE_FP 1 193 # define QEMU_HARDFLOAT_2F32_USE_FP 0 194 # define QEMU_HARDFLOAT_2F64_USE_FP 1 195 # define QEMU_HARDFLOAT_3F32_USE_FP 0 196 # define QEMU_HARDFLOAT_3F64_USE_FP 1 197 #else 198 # define QEMU_HARDFLOAT_1F32_USE_FP 0 199 # define QEMU_HARDFLOAT_1F64_USE_FP 0 200 # define QEMU_HARDFLOAT_2F32_USE_FP 0 201 # define QEMU_HARDFLOAT_2F64_USE_FP 0 202 # define QEMU_HARDFLOAT_3F32_USE_FP 0 203 # define QEMU_HARDFLOAT_3F64_USE_FP 0 204 #endif 205 206 /* 207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over 208 * float{32,64}_is_infinity when !USE_FP. 209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup. 210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%. 211 */ 212 #if defined(__x86_64__) || defined(__aarch64__) 213 # define QEMU_HARDFLOAT_USE_ISINF 1 214 #else 215 # define QEMU_HARDFLOAT_USE_ISINF 0 216 #endif 217 218 /* 219 * Some targets clear the FP flags before most FP operations. This prevents 220 * the use of hardfloat, since hardfloat relies on the inexact flag being 221 * already set. 222 */ 223 #if defined(TARGET_PPC) || defined(__FAST_MATH__) 224 # if defined(__FAST_MATH__) 225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \ 226 IEEE implementation 227 # endif 228 # define QEMU_NO_HARDFLOAT 1 229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN 230 #else 231 # define QEMU_NO_HARDFLOAT 0 232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline)) 233 #endif 234 235 static inline bool can_use_fpu(const float_status *s) 236 { 237 if (QEMU_NO_HARDFLOAT) { 238 return false; 239 } 240 return likely(s->float_exception_flags & float_flag_inexact && 241 s->float_rounding_mode == float_round_nearest_even); 242 } 243 244 /* 245 * Hardfloat generation functions. Each operation can have two flavors: 246 * either using softfloat primitives (e.g. 
 * float32_is_zero_or_normal) for
 * most condition checks, or native ones (e.g. fpclassify).
 *
 * The flavor is chosen by the callers. Instead of using macros, we rely on the
 * compiler to propagate constants and inline everything into the callers.
 *
 * We only generate functions for operations with two inputs, since only
 * these are common enough to justify consolidating them into common code.
 */

/*
 * Overlay of a softfloat value (.s) and the host floating-point type (.h),
 * so the same bits can be handed to either the soft or the hard
 * implementation.
 */
typedef union {
    float32 s;
    float h;
} union_float32;

typedef union {
    float64 s;
    double h;
} union_float64;

/* Predicate over the two (possibly input-flushed) operands. */
typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

/* Softfloat and host-FPU implementations of a two-operand operation. */
typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
typedef float   (*hard_f32_op2_fn)(float a, float b);
typedef double  (*hard_f64_op2_fn)(double a, double b);

/* 2-input is-zero-or-normal */
static inline bool f32_is_zon2(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        /*
         * Not using a temp variable for consecutive fpclassify calls ends up
         * generating faster code.
         */
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s);
}

/* float64 counterpart of f32_is_zon2(). */
static inline bool f64_is_zon2(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s);
}

/* 3-input is-zero-or-normal */
static inline
bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
{
    if (QEMU_HARDFLOAT_3F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s) &&
           float32_is_zero_or_normal(c.s);
}

/* float64 counterpart of f32_is_zon3(). */
static inline
bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
{
    if (QEMU_HARDFLOAT_3F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s) &&
           float64_is_zero_or_normal(c.s);
}

/*
 * Infinity test; uses the host isinf() or the softfloat primitive depending
 * on QEMU_HARDFLOAT_USE_ISINF.
 */
static inline bool f32_is_inf(union_float32 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float32_is_infinity(a.s);
}

static inline bool f64_is_inf(union_float64 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float64_is_infinity(a.s);
}

/*
 * Generic two-operand float32 operation that uses the host FPU when possible.
 *
 * @xa, @xb:   operands
 * @s:         float status; its exception flags may be updated
 * @hard:      host-FPU implementation
 * @soft:      softfloat implementation used as the fallback
 * @pre:       must return true for the hard path to be taken; it is called
 *             unconditionally, so it must not be NULL
 * @post:      consulted only when the hard result has magnitude <= FLT_MIN;
 *             a NULL @post or a true return sends the operation to @soft
 * @fast_test: if non-NULL and true, @fast_op computes the result instead
 * @fast_op:   only called when @fast_test is non-NULL and returned true
 *
 * Note: @fast_test and @post can be NULL
 */
static inline float32
float32_gen2(float32 xa, float32 xb, float_status *s,
             hard_f32_op2_fn hard, soft_f32_op2_fn soft,
             f32_check_fn pre, f32_check_fn post,
             f32_check_fn fast_test, soft_f32_op2_fn fast_op)
{
    union_float32 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        /* Inputs have not been flushed yet; soft sees the raw operands. */
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        /* From here on, soft receives the (possibly flushed) operands. */
        goto soft;
    }
    if (fast_test && fast_test(ua, ub)) {
        return fast_op(ua.s, ub.s, s);
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f32_is_inf(ur))) {
        /* An infinite result on this path is reported as overflow. */
        s->float_exception_flags |= float_flag_overflow;
    } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
        /* Result is zero or at/below the minimum normal: let @post decide. */
        if (post == NULL || post(ua, ub)) {
            goto soft;
        }
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

/*
 * float64 counterpart of float32_gen2(); same control flow, with DBL_MIN as
 * the threshold below which @post is consulted.
 */
static inline float64
float64_gen2(float64 xa, float64 xb, float_status *s,
             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
             f64_check_fn pre, f64_check_fn post,
             f64_check_fn fast_test, soft_f64_op2_fn fast_op)
{
    union_float64 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }
    if (fast_test && fast_test(ua, ub)) {
        return fast_op(ua.s, ub.s, s);
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f64_is_inf(ur))) {
        s->float_exception_flags |= float_flag_overflow;
    } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
        if (post == NULL || post(ua, ub)) {
            goto soft;
        }
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

/*----------------------------------------------------------------------------
| Returns the fraction bits of the half-precision floating-point value `a'.
419 *----------------------------------------------------------------------------*/ 420 421 static inline uint32_t extractFloat16Frac(float16 a) 422 { 423 return float16_val(a) & 0x3ff; 424 } 425 426 /*---------------------------------------------------------------------------- 427 | Returns the exponent bits of the half-precision floating-point value `a'. 428 *----------------------------------------------------------------------------*/ 429 430 static inline int extractFloat16Exp(float16 a) 431 { 432 return (float16_val(a) >> 10) & 0x1f; 433 } 434 435 /*---------------------------------------------------------------------------- 436 | Returns the fraction bits of the single-precision floating-point value `a'. 437 *----------------------------------------------------------------------------*/ 438 439 static inline uint32_t extractFloat32Frac(float32 a) 440 { 441 return float32_val(a) & 0x007FFFFF; 442 } 443 444 /*---------------------------------------------------------------------------- 445 | Returns the exponent bits of the single-precision floating-point value `a'. 446 *----------------------------------------------------------------------------*/ 447 448 static inline int extractFloat32Exp(float32 a) 449 { 450 return (float32_val(a) >> 23) & 0xFF; 451 } 452 453 /*---------------------------------------------------------------------------- 454 | Returns the sign bit of the single-precision floating-point value `a'. 455 *----------------------------------------------------------------------------*/ 456 457 static inline flag extractFloat32Sign(float32 a) 458 { 459 return float32_val(a) >> 31; 460 } 461 462 /*---------------------------------------------------------------------------- 463 | Returns the fraction bits of the double-precision floating-point value `a'. 
464 *----------------------------------------------------------------------------*/ 465 466 static inline uint64_t extractFloat64Frac(float64 a) 467 { 468 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF); 469 } 470 471 /*---------------------------------------------------------------------------- 472 | Returns the exponent bits of the double-precision floating-point value `a'. 473 *----------------------------------------------------------------------------*/ 474 475 static inline int extractFloat64Exp(float64 a) 476 { 477 return (float64_val(a) >> 52) & 0x7FF; 478 } 479 480 /*---------------------------------------------------------------------------- 481 | Returns the sign bit of the double-precision floating-point value `a'. 482 *----------------------------------------------------------------------------*/ 483 484 static inline flag extractFloat64Sign(float64 a) 485 { 486 return float64_val(a) >> 63; 487 } 488 489 /* 490 * Classify a floating point number. Everything above float_class_qnan 491 * is a NaN so cls >= float_class_qnan is any NaN. 492 */ 493 494 typedef enum __attribute__ ((__packed__)) { 495 float_class_unclassified, 496 float_class_zero, 497 float_class_normal, 498 float_class_inf, 499 float_class_qnan, /* all NaNs from here */ 500 float_class_snan, 501 } FloatClass; 502 503 /* Simple helpers for checking if, or what kind of, NaN we have */ 504 static inline __attribute__((unused)) bool is_nan(FloatClass c) 505 { 506 return unlikely(c >= float_class_qnan); 507 } 508 509 static inline __attribute__((unused)) bool is_snan(FloatClass c) 510 { 511 return c == float_class_snan; 512 } 513 514 static inline __attribute__((unused)) bool is_qnan(FloatClass c) 515 { 516 return c == float_class_qnan; 517 } 518 519 /* 520 * Structure holding all of the decomposed parts of a float. The 521 * exponent is unbiased and the fraction is normalized. All 522 * calculations are done with a 64 bit fraction and then rounded as 523 * appropriate for the final format. 
524 * 525 * Thanks to the packed FloatClass a decent compiler should be able to 526 * fit the whole structure into registers and avoid using the stack 527 * for parameter passing. 528 */ 529 530 typedef struct { 531 uint64_t frac; 532 int32_t exp; 533 FloatClass cls; 534 bool sign; 535 } FloatParts; 536 537 #define DECOMPOSED_BINARY_POINT (64 - 2) 538 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) 539 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1) 540 541 /* Structure holding all of the relevant parameters for a format. 542 * exp_size: the size of the exponent field 543 * exp_bias: the offset applied to the exponent field 544 * exp_max: the maximum normalised exponent 545 * frac_size: the size of the fraction field 546 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 547 * The following are computed based the size of fraction 548 * frac_lsb: least significant bit of fraction 549 * frac_lsbm1: the bit below the least significant bit (for rounding) 550 * round_mask/roundeven_mask: masks used for rounding 551 * The following optional modifiers are available: 552 * arm_althp: handle ARM Alternative Half Precision 553 */ 554 typedef struct { 555 int exp_size; 556 int exp_bias; 557 int exp_max; 558 int frac_size; 559 int frac_shift; 560 uint64_t frac_lsb; 561 uint64_t frac_lsbm1; 562 uint64_t round_mask; 563 uint64_t roundeven_mask; 564 bool arm_althp; 565 } FloatFmt; 566 567 /* Expand fields based on the size of exponent and fraction */ 568 #define FLOAT_PARAMS(E, F) \ 569 .exp_size = E, \ 570 .exp_bias = ((1 << E) - 1) >> 1, \ 571 .exp_max = (1 << E) - 1, \ 572 .frac_size = F, \ 573 .frac_shift = DECOMPOSED_BINARY_POINT - F, \ 574 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \ 575 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \ 576 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \ 577 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1 578 579 static const FloatFmt 
float16_params = { 580 FLOAT_PARAMS(5, 10) 581 }; 582 583 static const FloatFmt float16_params_ahp = { 584 FLOAT_PARAMS(5, 10), 585 .arm_althp = true 586 }; 587 588 static const FloatFmt float32_params = { 589 FLOAT_PARAMS(8, 23) 590 }; 591 592 static const FloatFmt float64_params = { 593 FLOAT_PARAMS(11, 52) 594 }; 595 596 /* Unpack a float to parts, but do not canonicalize. */ 597 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) 598 { 599 const int sign_pos = fmt.frac_size + fmt.exp_size; 600 601 return (FloatParts) { 602 .cls = float_class_unclassified, 603 .sign = extract64(raw, sign_pos, 1), 604 .exp = extract64(raw, fmt.frac_size, fmt.exp_size), 605 .frac = extract64(raw, 0, fmt.frac_size), 606 }; 607 } 608 609 static inline FloatParts float16_unpack_raw(float16 f) 610 { 611 return unpack_raw(float16_params, f); 612 } 613 614 static inline FloatParts float32_unpack_raw(float32 f) 615 { 616 return unpack_raw(float32_params, f); 617 } 618 619 static inline FloatParts float64_unpack_raw(float64 f) 620 { 621 return unpack_raw(float64_params, f); 622 } 623 624 /* Pack a float from parts, but do not canonicalize. 
 */
static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
{
    const int sign_pos = fmt.frac_size + fmt.exp_size;
    /* Insert the exponent field above the fraction, then the sign on top. */
    uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
    return deposit64(ret, sign_pos, 1, p.sign);
}

/* Per-format conveniences around pack_raw(). */
static inline float16 float16_pack_raw(FloatParts p)
{
    return make_float16(pack_raw(float16_params, p));
}

static inline float32 float32_pack_raw(FloatParts p)
{
    return make_float32(pack_raw(float32_params, p));
}

static inline float64 float64_pack_raw(FloatParts p)
{
    return make_float64(pack_raw(float64_params, p));
}

/*----------------------------------------------------------------------------
| Functions and definitions to determine:  (1) whether tininess for underflow
| is detected before or after rounding by default, (2) what (if anything)
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
| are propagated from function inputs to output.  These details are target-
| specific.
*----------------------------------------------------------------------------*/
#include "softfloat-specialize.h"

/*
 * Canonicalize EXP and FRAC, setting CLS.
 *
 * Takes raw parts (biased exponent, unshifted fraction) and returns them
 * classified.  For normal numbers the exponent is unbiased and the fraction
 * is aligned so that the leading one is at DECOMPOSED_BINARY_POINT.
 * May raise float_flag_input_denormal when flush_inputs_to_zero is set.
 */
static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
                                  float_status *status)
{
    if (part.exp == parm->exp_max && !parm->arm_althp) {
        /* Maximum exponent field: infinity or NaN (not for ARM Alt HP). */
        if (part.frac == 0) {
            part.cls = float_class_inf;
        } else {
            part.frac <<= parm->frac_shift;
            part.cls = (parts_is_snan_frac(part.frac, status)
                        ? float_class_snan : float_class_qnan);
        }
    } else if (part.exp == 0) {
        /* Zero exponent field: zero or denormal. */
        if (likely(part.frac == 0)) {
            part.cls = float_class_zero;
        } else if (status->flush_inputs_to_zero) {
            float_raise(float_flag_input_denormal, status);
            part.cls = float_class_zero;
            part.frac = 0;
        } else {
            /* Normalize: move the top set bit to DECOMPOSED_BINARY_POINT. */
            int shift = clz64(part.frac) - 1;
            part.cls = float_class_normal;
            part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
            part.frac <<= shift;
        }
    } else {
        /* Normal number: unbias the exponent, make the leading one explicit. */
        part.cls = float_class_normal;
        part.exp -= parm->exp_bias;
        part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
    }
    return part;
}

/* Round and uncanonicalize a floating-point number by parts.  There
 * are FRAC_SHIFT bits that may require rounding at the bottom of the
 * fraction; these bits will be removed.  The exponent will be biased
 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 *
 * The exception flags accumulated while rounding are raised on @s in a
 * single float_raise() call just before returning.
 */

static FloatParts round_canonical(FloatParts p, float_status *s,
                                  const FloatFmt *parm)
{
    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
    const uint64_t round_mask = parm->round_mask;
    const uint64_t roundeven_mask = parm->roundeven_mask;
    const int exp_max = parm->exp_max;
    const int frac_shift = parm->frac_shift;
    uint64_t frac, inc;
    int exp, flags = 0;
    bool overflow_norm;

    frac = p.frac;
    exp = p.exp;

    switch (p.cls) {
    case float_class_normal:
        /*
         * Pick the rounding increment (inc) and whether an overflow
         * saturates to the largest normal (overflow_norm) instead of
         * producing infinity.
         */
        switch (s->float_rounding_mode) {
        case float_round_nearest_even:
            overflow_norm = false;
            /* No increment on an exact tie whose LSB is already even. */
            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
            break;
        case float_round_ties_away:
            overflow_norm = false;
            inc = frac_lsbm1;
            break;
        case float_round_to_zero:
            overflow_norm = true;
            inc = 0;
            break;
        case float_round_up:
            inc = p.sign ? 0 : round_mask;
            overflow_norm = p.sign;
            break;
        case float_round_down:
            inc = p.sign ? round_mask : 0;
            overflow_norm = !p.sign;
            break;
        default:
            g_assert_not_reached();
        }

        exp += parm->exp_bias;
        if (likely(exp > 0)) {
            /* Result is a normal number (or overflows). */
            if (frac & round_mask) {
                flags |= float_flag_inexact;
                frac += inc;
                if (frac & DECOMPOSED_OVERFLOW_BIT) {
                    /* Rounding carried out of the fraction: renormalize. */
                    frac >>= 1;
                    exp++;
                }
            }
            frac >>= frac_shift;

            if (parm->arm_althp) {
                /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
                if (unlikely(exp > exp_max)) {
                    /* Overflow.  Return the maximum normal.  */
                    flags = float_flag_invalid;
                    exp = exp_max;
                    frac = -1;
                }
            } else if (unlikely(exp >= exp_max)) {
                flags |= float_flag_overflow | float_flag_inexact;
                if (overflow_norm) {
                    /* Saturate: largest exponent, all fraction bits set. */
                    exp = exp_max - 1;
                    frac = -1;
                } else {
                    p.cls = float_class_inf;
                    goto do_inf;
                }
            }
        } else if (s->flush_to_zero) {
            flags |= float_flag_output_denormal;
            p.cls = float_class_zero;
            goto do_zero;
        } else {
            /*
             * Denormal result.  Tininess must be evaluated before frac is
             * shifted, using the increment chosen for the unshifted value.
             */
            bool is_tiny = (s->float_detect_tininess
                            == float_tininess_before_rounding)
                        || (exp < 0)
                        || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);

            shift64RightJamming(frac, 1 - exp, &frac);
            if (frac & round_mask) {
                /* Need to recompute round-to-even.  */
                if (s->float_rounding_mode == float_round_nearest_even) {
                    inc = ((frac & roundeven_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                }
                flags |= float_flag_inexact;
                frac += inc;
            }

            /* Rounding may have produced the smallest normal number. */
            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
            frac >>= frac_shift;

            if (is_tiny && (flags & float_flag_inexact)) {
                flags |= float_flag_underflow;
            }
            if (exp == 0 && frac == 0) {
                p.cls = float_class_zero;
            }
        }
        break;

    case float_class_zero:
    do_zero:
        exp = 0;
        frac = 0;
        break;

    case float_class_inf:
    do_inf:
        assert(!parm->arm_althp);
        exp = exp_max;
        frac = 0;
        break;

    case float_class_qnan:
    case float_class_snan:
        assert(!parm->arm_althp);
        exp = exp_max;
        frac >>= parm->frac_shift;
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    p.exp = exp;
    p.frac = frac;
    return p;
}

/* Explicit FloatFmt version */
static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
                                            const FloatFmt *params)
{
    return sf_canonicalize(float16_unpack_raw(f), params, s);
}

/* Unpack and canonicalize a float16 using the default float16_params. */
static FloatParts float16_unpack_canonical(float16 f, float_status *s)
{
    return float16a_unpack_canonical(f, s, &float16_params);
}

/* Round and pack to float16 using an explicit FloatFmt. */
static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
                                             const FloatFmt *params)
{
    return float16_pack_raw(round_canonical(p, s, params));
}

/* Round and pack to float16 using the default float16_params. */
static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
{
    return float16a_round_pack_canonical(p, s, &float16_params);
}

/* Unpack and canonicalize a float32. */
static FloatParts float32_unpack_canonical(float32 f, float_status *s)
{
    return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
}

/* Round and pack canonical parts into a float32. */
static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
{
    return float32_pack_raw(round_canonical(p, s, &float32_params));
}

/* Unpack and canonicalize a float64. */
static FloatParts float64_unpack_canonical(float64 f, float_status *s)
{
    return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
}

/* Round and pack canonical parts into a float64. */
static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
{
    return float64_pack_raw(round_canonical(p, s, &float64_params));
}

/*
 * Produce the result for a single NaN operand @a.  A signaling NaN raises
 * float_flag_invalid and is silenced; in default_nan_mode the default NaN
 * is returned instead of @a.  @a must be a NaN.
 */
static FloatParts return_nan(FloatParts a, float_status *s)
{
    switch (a.cls) {
    case float_class_snan:
        s->float_exception_flags |= float_flag_invalid;
        a = parts_silence_nan(a, s);
        /* fall through */
    case float_class_qnan:
        if (s->default_nan_mode) {
            return parts_default_nan(s);
        }
        break;

    default:
        g_assert_not_reached();
    }
    return a;
}

/*
 * Choose the NaN result for a two-operand operation.  Raises
 * float_flag_invalid if either operand is a signaling NaN.  In
 * default_nan_mode the default NaN is returned; otherwise pickNaN() selects
 * between @a and @b (a nonzero return selects @b) and the chosen value is
 * silenced if it is signaling.
 */
static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
{
    if (is_snan(a.cls) || is_snan(b.cls)) {
        s->float_exception_flags |= float_flag_invalid;
    }

    if (s->default_nan_mode) {
        return parts_default_nan(s);
    } else {
        /*
         * The third argument tells pickNaN() whether a compares larger than
         * b by fraction, with the sign breaking ties.
         */
        if (pickNaN(a.cls, b.cls,
                    a.frac > b.frac ||
                    (a.frac == b.frac && a.sign < b.sign))) {
            a = b;
        }
        if (is_snan(a.cls)) {
            return parts_silence_nan(a, s);
        }
    }
    return a;
}

/*
 * Choose the NaN result for a fused multiply-add.  pickNaNMulAdd() returns
 * 0, 1 or 2 to select @a, @b or @c, or 3 for the default NaN;
 * default_nan_mode forces the default NaN.  A selected signaling NaN is
 * silenced before being returned.
 */
static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
                                  bool inf_zero, float_status *s)
{
    int which;

    if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
        s->float_exception_flags |= float_flag_invalid;
    }

    which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);

    if (s->default_nan_mode) {
        /* Note that this check is after pickNaNMulAdd so that function
         * has an opportunity to set the Invalid flag.
         */
        which = 3;
    }

    switch (which) {
    case 0:
        break;
    case 1:
        a = b;
        break;
    case 2:
        a = c;
        break;
    case 3:
        return parts_default_nan(s);
    default:
        g_assert_not_reached();
    }

    if (is_snan(a.cls)) {
        return parts_silence_nan(a, s);
    }
    return a;
}

/*
 * Returns the result of adding or subtracting the values of the
 * floating-point values `a' and `b'.
 * The operation is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic.
 *
 * @subtract flips the sign of `b' before the operation.  The result is
 * returned unrounded; callers pass it to a *_round_pack_canonical helper.
 */

static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
                                float_status *s)
{
    bool a_sign = a.sign;
    bool b_sign = b.sign ^ subtract;

    if (a_sign != b_sign) {
        /* Subtraction */

        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Subtract the smaller magnitude from the larger one. */
            if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
                a.frac = a.frac - b.frac;
            } else {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.frac = b.frac - a.frac;
                a.exp = b.exp;
                a_sign ^= 1;
            }

            if (a.frac == 0) {
                /* Exact cancellation: zero is negative only when rounding down. */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
            } else {
                /* Renormalize so the leading one is at DECOMPOSED_BINARY_POINT. */
                int shift = clz64(a.frac) - 1;
                a.frac = a.frac << shift;
                a.exp = a.exp - shift;
                a.sign = a_sign;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf) {
            if (b.cls == float_class_inf) {
                /* Inf - Inf is invalid. */
                float_raise(float_flag_invalid, s);
                return parts_default_nan(s);
            }
            return a;
        }
        if (a.cls == float_class_zero && b.cls == float_class_zero) {
            a.sign = s->float_rounding_mode == float_round_down;
            return a;
        }
        if (a.cls == float_class_zero || b.cls == float_class_inf) {
            /* Result is b with its effective sign (opposite of a's). */
            b.sign = a_sign ^ 1;
            return b;
        }
        if (b.cls == float_class_zero) {
            return a;
        }
    } else {
        /* Addition */
        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Align the operand with the smaller exponent. */
            if (a.exp > b.exp) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
            } else if (a.exp < b.exp) {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.exp = b.exp;
            }
            a.frac += b.frac;
            if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
                /* Carry out of the fraction: shift back down by one. */
                shift64RightJamming(a.frac, 1, &a.frac);
                a.exp += 1;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf || b.cls == float_class_zero) {
            return a;
        }
        if (b.cls == float_class_inf || a.cls == float_class_zero) {
            b.sign = b_sign;
            return b;
        }
    }
    /* Every class combination returns above. */
    g_assert_not_reached();
}

/*
 * Returns the result of adding or subtracting the floating-point
 * values `a' and `b'. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
{
    FloatParts pa = float16_unpack_canonical(a, status);
    FloatParts pb = float16_unpack_canonical(b, status);
    FloatParts pr = addsub_floats(pa, pb, false, status);

    return float16_round_pack_canonical(pr, status);
}

float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
{
    FloatParts pa = float16_unpack_canonical(a, status);
    FloatParts pb = float16_unpack_canonical(b, status);
    FloatParts pr = addsub_floats(pa, pb, true, status);

    return float16_round_pack_canonical(pr, status);
}

/* Pure-softfloat float32 add (subtract == false) or subtract. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
{
    FloatParts pa = float32_unpack_canonical(a, status);
    FloatParts pb = float32_unpack_canonical(b, status);
    FloatParts pr = addsub_floats(pa, pb, subtract, status);

    return float32_round_pack_canonical(pr, status);
}

/* Adapters matching the soft_f32_op2_fn signature. */
static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, false, status);
}

static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, true, status);
}

/* Pure-softfloat float64 add (subtract == false) or subtract. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
{
    FloatParts pa = float64_unpack_canonical(a, status);
    FloatParts pb = float64_unpack_canonical(b, status);
    FloatParts pr = addsub_floats(pa, pb, subtract, status);

    return float64_round_pack_canonical(pr, status);
}

/* Adapters matching the soft_f64_op2_fn signature. */
static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, false, status);
}

static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, true, status);
}

/* Host-FPU implementations, matching hard_f32_op2_fn / hard_f64_op2_fn. */
static float hard_f32_add(float a, float b)
{
    return a + b;
}

static float hard_f32_sub(float a, float b)
{
    return a - b;
}

static double hard_f64_add(double a, double b)
{
    return a + b;
}

static double hard_f64_sub(double a, double b)
{
    return a - b;
}

/*
 * Post-check for add/sub, consulted by the gen2 helpers when the hard result
 * is very small: returns true (use softfloat) unless both inputs are zero.
 */
static bool f32_addsub_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
    }
    return !(float32_is_zero(a.s) && float32_is_zero(b.s));
}

/* float64 counterpart of f32_addsub_post(). */
static bool f64_addsub_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
    } else {
        return !(float64_is_zero(a.s) && float64_is_zero(b.s));
    }
}

/*
 * float32 add/sub dispatcher: zero-or-normal inputs are required for the
 * hard path, and no fast-path test is installed.
 */
static float32 float32_addsub(float32 a, float32 b, float_status *s,
                              hard_f32_op2_fn hard, soft_f32_op2_fn soft)
{
    return float32_gen2(a, b, s, hard, soft,
                        f32_is_zon2, f32_addsub_post, NULL, NULL);
}

/* float64 counterpart of float32_addsub(). */
static float64 float64_addsub(float64 a, float64 b, float_status *s,
                              hard_f64_op2_fn hard, soft_f64_op2_fn soft)
{
    return float64_gen2(a, b, s, hard, soft,
                        f64_is_zon2, f64_addsub_post, NULL, NULL);
}

float32
QEMU_FLATTEN 1158 float32_add(float32 a, float32 b, float_status *s) 1159 { 1160 return float32_addsub(a, b, s, hard_f32_add, soft_f32_add); 1161 } 1162 1163 float32 QEMU_FLATTEN 1164 float32_sub(float32 a, float32 b, float_status *s) 1165 { 1166 return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub); 1167 } 1168 1169 float64 QEMU_FLATTEN 1170 float64_add(float64 a, float64 b, float_status *s) 1171 { 1172 return float64_addsub(a, b, s, hard_f64_add, soft_f64_add); 1173 } 1174 1175 float64 QEMU_FLATTEN 1176 float64_sub(float64 a, float64 b, float_status *s) 1177 { 1178 return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub); 1179 } 1180 1181 /* 1182 * Returns the result of multiplying the floating-point values `a' and 1183 * `b'. The operation is performed according to the IEC/IEEE Standard 1184 * for Binary Floating-Point Arithmetic. 1185 */ 1186 1187 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s) 1188 { 1189 bool sign = a.sign ^ b.sign; 1190 1191 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1192 uint64_t hi, lo; 1193 int exp = a.exp + b.exp; 1194 1195 mul64To128(a.frac, b.frac, &hi, &lo); 1196 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1197 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1198 shift64RightJamming(lo, 1, &lo); 1199 exp += 1; 1200 } 1201 1202 /* Re-use a */ 1203 a.exp = exp; 1204 a.sign = sign; 1205 a.frac = lo; 1206 return a; 1207 } 1208 /* handle all the NaN cases */ 1209 if (is_nan(a.cls) || is_nan(b.cls)) { 1210 return pick_nan(a, b, s); 1211 } 1212 /* Inf * Zero == NaN */ 1213 if ((a.cls == float_class_inf && b.cls == float_class_zero) || 1214 (a.cls == float_class_zero && b.cls == float_class_inf)) { 1215 s->float_exception_flags |= float_flag_invalid; 1216 return parts_default_nan(s); 1217 } 1218 /* Multiply by 0 or Inf */ 1219 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1220 a.sign = sign; 1221 return a; 1222 } 1223 if (b.cls == float_class_inf || b.cls == 
float_class_zero) { 1224 b.sign = sign; 1225 return b; 1226 } 1227 g_assert_not_reached(); 1228 } 1229 1230 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status) 1231 { 1232 FloatParts pa = float16_unpack_canonical(a, status); 1233 FloatParts pb = float16_unpack_canonical(b, status); 1234 FloatParts pr = mul_floats(pa, pb, status); 1235 1236 return float16_round_pack_canonical(pr, status); 1237 } 1238 1239 static float32 QEMU_SOFTFLOAT_ATTR 1240 soft_f32_mul(float32 a, float32 b, float_status *status) 1241 { 1242 FloatParts pa = float32_unpack_canonical(a, status); 1243 FloatParts pb = float32_unpack_canonical(b, status); 1244 FloatParts pr = mul_floats(pa, pb, status); 1245 1246 return float32_round_pack_canonical(pr, status); 1247 } 1248 1249 static float64 QEMU_SOFTFLOAT_ATTR 1250 soft_f64_mul(float64 a, float64 b, float_status *status) 1251 { 1252 FloatParts pa = float64_unpack_canonical(a, status); 1253 FloatParts pb = float64_unpack_canonical(b, status); 1254 FloatParts pr = mul_floats(pa, pb, status); 1255 1256 return float64_round_pack_canonical(pr, status); 1257 } 1258 1259 static float hard_f32_mul(float a, float b) 1260 { 1261 return a * b; 1262 } 1263 1264 static double hard_f64_mul(double a, double b) 1265 { 1266 return a * b; 1267 } 1268 1269 static bool f32_mul_fast_test(union_float32 a, union_float32 b) 1270 { 1271 return float32_is_zero(a.s) || float32_is_zero(b.s); 1272 } 1273 1274 static bool f64_mul_fast_test(union_float64 a, union_float64 b) 1275 { 1276 return float64_is_zero(a.s) || float64_is_zero(b.s); 1277 } 1278 1279 static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s) 1280 { 1281 bool signbit = float32_is_neg(a) ^ float32_is_neg(b); 1282 1283 return float32_set_sign(float32_zero, signbit); 1284 } 1285 1286 static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s) 1287 { 1288 bool signbit = float64_is_neg(a) ^ float64_is_neg(b); 1289 1290 return float64_set_sign(float64_zero, signbit); 
1291 } 1292 1293 float32 QEMU_FLATTEN 1294 float32_mul(float32 a, float32 b, float_status *s) 1295 { 1296 return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul, 1297 f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op); 1298 } 1299 1300 float64 QEMU_FLATTEN 1301 float64_mul(float64 a, float64 b, float_status *s) 1302 { 1303 return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul, 1304 f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op); 1305 } 1306 1307 /* 1308 * Returns the result of multiplying the floating-point values `a' and 1309 * `b' then adding 'c', with no intermediate rounding step after the 1310 * multiplication. The operation is performed according to the 1311 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008. 1312 * The flags argument allows the caller to select negation of the 1313 * addend, the intermediate product, or the final result. (The 1314 * difference between this and having the caller do a separate 1315 * negation is that negating externally will flip the sign bit on 1316 * NaNs.) 1317 */ 1318 1319 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c, 1320 int flags, float_status *s) 1321 { 1322 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) == 1323 ((1 << float_class_inf) | (1 << float_class_zero)); 1324 bool p_sign; 1325 bool sign_flip = flags & float_muladd_negate_result; 1326 FloatClass p_class; 1327 uint64_t hi, lo; 1328 int p_exp; 1329 1330 /* It is implementation-defined whether the cases of (0,inf,qnan) 1331 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 1332 * they return if they do), so we have to hand this information 1333 * off to the target-specific pick-a-NaN routine. 
1334 */ 1335 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) { 1336 return pick_nan_muladd(a, b, c, inf_zero, s); 1337 } 1338 1339 if (inf_zero) { 1340 s->float_exception_flags |= float_flag_invalid; 1341 return parts_default_nan(s); 1342 } 1343 1344 if (flags & float_muladd_negate_c) { 1345 c.sign ^= 1; 1346 } 1347 1348 p_sign = a.sign ^ b.sign; 1349 1350 if (flags & float_muladd_negate_product) { 1351 p_sign ^= 1; 1352 } 1353 1354 if (a.cls == float_class_inf || b.cls == float_class_inf) { 1355 p_class = float_class_inf; 1356 } else if (a.cls == float_class_zero || b.cls == float_class_zero) { 1357 p_class = float_class_zero; 1358 } else { 1359 p_class = float_class_normal; 1360 } 1361 1362 if (c.cls == float_class_inf) { 1363 if (p_class == float_class_inf && p_sign != c.sign) { 1364 s->float_exception_flags |= float_flag_invalid; 1365 return parts_default_nan(s); 1366 } else { 1367 a.cls = float_class_inf; 1368 a.sign = c.sign ^ sign_flip; 1369 return a; 1370 } 1371 } 1372 1373 if (p_class == float_class_inf) { 1374 a.cls = float_class_inf; 1375 a.sign = p_sign ^ sign_flip; 1376 return a; 1377 } 1378 1379 if (p_class == float_class_zero) { 1380 if (c.cls == float_class_zero) { 1381 if (p_sign != c.sign) { 1382 p_sign = s->float_rounding_mode == float_round_down; 1383 } 1384 c.sign = p_sign; 1385 } else if (flags & float_muladd_halve_result) { 1386 c.exp -= 1; 1387 } 1388 c.sign ^= sign_flip; 1389 return c; 1390 } 1391 1392 /* a & b should be normals now... */ 1393 assert(a.cls == float_class_normal && 1394 b.cls == float_class_normal); 1395 1396 p_exp = a.exp + b.exp; 1397 1398 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit 1399 * result. 
1400 */ 1401 mul64To128(a.frac, b.frac, &hi, &lo); 1402 /* binary point now at bit 124 */ 1403 1404 /* check for overflow */ 1405 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) { 1406 shift128RightJamming(hi, lo, 1, &hi, &lo); 1407 p_exp += 1; 1408 } 1409 1410 /* + add/sub */ 1411 if (c.cls == float_class_zero) { 1412 /* move binary point back to 62 */ 1413 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1414 } else { 1415 int exp_diff = p_exp - c.exp; 1416 if (p_sign == c.sign) { 1417 /* Addition */ 1418 if (exp_diff <= 0) { 1419 shift128RightJamming(hi, lo, 1420 DECOMPOSED_BINARY_POINT - exp_diff, 1421 &hi, &lo); 1422 lo += c.frac; 1423 p_exp = c.exp; 1424 } else { 1425 uint64_t c_hi, c_lo; 1426 /* shift c to the same binary point as the product (124) */ 1427 c_hi = c.frac >> 2; 1428 c_lo = 0; 1429 shift128RightJamming(c_hi, c_lo, 1430 exp_diff, 1431 &c_hi, &c_lo); 1432 add128(hi, lo, c_hi, c_lo, &hi, &lo); 1433 /* move binary point back to 62 */ 1434 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1435 } 1436 1437 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1438 shift64RightJamming(lo, 1, &lo); 1439 p_exp += 1; 1440 } 1441 1442 } else { 1443 /* Subtraction */ 1444 uint64_t c_hi, c_lo; 1445 /* make C binary point match product at bit 124 */ 1446 c_hi = c.frac >> 2; 1447 c_lo = 0; 1448 1449 if (exp_diff <= 0) { 1450 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo); 1451 if (exp_diff == 0 1452 && 1453 (hi > c_hi || (hi == c_hi && lo >= c_lo))) { 1454 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1455 } else { 1456 sub128(c_hi, c_lo, hi, lo, &hi, &lo); 1457 p_sign ^= 1; 1458 p_exp = c.exp; 1459 } 1460 } else { 1461 shift128RightJamming(c_hi, c_lo, 1462 exp_diff, 1463 &c_hi, &c_lo); 1464 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1465 } 1466 1467 if (hi == 0 && lo == 0) { 1468 a.cls = float_class_zero; 1469 a.sign = s->float_rounding_mode == float_round_down; 1470 a.sign ^= sign_flip; 1471 return a; 1472 } else { 1473 int shift; 1474 if (hi 
!= 0) { 1475 shift = clz64(hi); 1476 } else { 1477 shift = clz64(lo) + 64; 1478 } 1479 /* Normalizing to a binary point of 124 is the 1480 correct adjust for the exponent. However since we're 1481 shifting, we might as well put the binary point back 1482 at 62 where we really want it. Therefore shift as 1483 if we're leaving 1 bit at the top of the word, but 1484 adjust the exponent as if we're leaving 3 bits. */ 1485 shift -= 1; 1486 if (shift >= 64) { 1487 lo = lo << (shift - 64); 1488 } else { 1489 hi = (hi << shift) | (lo >> (64 - shift)); 1490 lo = hi | ((lo << shift) != 0); 1491 } 1492 p_exp -= shift - 2; 1493 } 1494 } 1495 } 1496 1497 if (flags & float_muladd_halve_result) { 1498 p_exp -= 1; 1499 } 1500 1501 /* finally prepare our result */ 1502 a.cls = float_class_normal; 1503 a.sign = p_sign ^ sign_flip; 1504 a.exp = p_exp; 1505 a.frac = lo; 1506 1507 return a; 1508 } 1509 1510 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c, 1511 int flags, float_status *status) 1512 { 1513 FloatParts pa = float16_unpack_canonical(a, status); 1514 FloatParts pb = float16_unpack_canonical(b, status); 1515 FloatParts pc = float16_unpack_canonical(c, status); 1516 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1517 1518 return float16_round_pack_canonical(pr, status); 1519 } 1520 1521 float32 QEMU_FLATTEN float32_muladd(float32 a, float32 b, float32 c, 1522 int flags, float_status *status) 1523 { 1524 FloatParts pa = float32_unpack_canonical(a, status); 1525 FloatParts pb = float32_unpack_canonical(b, status); 1526 FloatParts pc = float32_unpack_canonical(c, status); 1527 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1528 1529 return float32_round_pack_canonical(pr, status); 1530 } 1531 1532 float64 QEMU_FLATTEN float64_muladd(float64 a, float64 b, float64 c, 1533 int flags, float_status *status) 1534 { 1535 FloatParts pa = float64_unpack_canonical(a, status); 1536 FloatParts pb = float64_unpack_canonical(b, status); 1537 FloatParts 
pc = float64_unpack_canonical(c, status); 1538 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1539 1540 return float64_round_pack_canonical(pr, status); 1541 } 1542 1543 /* 1544 * Returns the result of dividing the floating-point value `a' by the 1545 * corresponding value `b'. The operation is performed according to 1546 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1547 */ 1548 1549 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s) 1550 { 1551 bool sign = a.sign ^ b.sign; 1552 1553 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1554 uint64_t n0, n1, q, r; 1555 int exp = a.exp - b.exp; 1556 1557 /* 1558 * We want a 2*N / N-bit division to produce exactly an N-bit 1559 * result, so that we do not lose any precision and so that we 1560 * do not have to renormalize afterward. If A.frac < B.frac, 1561 * then division would produce an (N-1)-bit result; shift A left 1562 * by one to produce the an N-bit result, and decrement the 1563 * exponent to match. 1564 * 1565 * The udiv_qrnnd algorithm that we're using requires normalization, 1566 * i.e. the msb of the denominator must be set. Since we know that 1567 * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left 1568 * by one (more), and the remainder must be shifted right by one. 1569 */ 1570 if (a.frac < b.frac) { 1571 exp -= 1; 1572 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0); 1573 } else { 1574 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0); 1575 } 1576 q = udiv_qrnnd(&r, n1, n0, b.frac << 1); 1577 1578 /* 1579 * Set lsb if there is a remainder, to set inexact. 1580 * As mentioned above, to find the actual value of the remainder we 1581 * would need to shift right, but (1) we are only concerned about 1582 * non-zero-ness, and (2) the remainder will always be even because 1583 * both inputs to the division primitive are even. 
1584 */ 1585 a.frac = q | (r != 0); 1586 a.sign = sign; 1587 a.exp = exp; 1588 return a; 1589 } 1590 /* handle all the NaN cases */ 1591 if (is_nan(a.cls) || is_nan(b.cls)) { 1592 return pick_nan(a, b, s); 1593 } 1594 /* 0/0 or Inf/Inf */ 1595 if (a.cls == b.cls 1596 && 1597 (a.cls == float_class_inf || a.cls == float_class_zero)) { 1598 s->float_exception_flags |= float_flag_invalid; 1599 return parts_default_nan(s); 1600 } 1601 /* Inf / x or 0 / x */ 1602 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1603 a.sign = sign; 1604 return a; 1605 } 1606 /* Div 0 => Inf */ 1607 if (b.cls == float_class_zero) { 1608 s->float_exception_flags |= float_flag_divbyzero; 1609 a.cls = float_class_inf; 1610 a.sign = sign; 1611 return a; 1612 } 1613 /* Div by Inf */ 1614 if (b.cls == float_class_inf) { 1615 a.cls = float_class_zero; 1616 a.sign = sign; 1617 return a; 1618 } 1619 g_assert_not_reached(); 1620 } 1621 1622 float16 float16_div(float16 a, float16 b, float_status *status) 1623 { 1624 FloatParts pa = float16_unpack_canonical(a, status); 1625 FloatParts pb = float16_unpack_canonical(b, status); 1626 FloatParts pr = div_floats(pa, pb, status); 1627 1628 return float16_round_pack_canonical(pr, status); 1629 } 1630 1631 float32 float32_div(float32 a, float32 b, float_status *status) 1632 { 1633 FloatParts pa = float32_unpack_canonical(a, status); 1634 FloatParts pb = float32_unpack_canonical(b, status); 1635 FloatParts pr = div_floats(pa, pb, status); 1636 1637 return float32_round_pack_canonical(pr, status); 1638 } 1639 1640 float64 float64_div(float64 a, float64 b, float_status *status) 1641 { 1642 FloatParts pa = float64_unpack_canonical(a, status); 1643 FloatParts pb = float64_unpack_canonical(b, status); 1644 FloatParts pr = div_floats(pa, pb, status); 1645 1646 return float64_round_pack_canonical(pr, status); 1647 } 1648 1649 /* 1650 * Float to Float conversions 1651 * 1652 * Returns the result of converting one float format to another. 
The 1653 * conversion is performed according to the IEC/IEEE Standard for 1654 * Binary Floating-Point Arithmetic. 1655 * 1656 * The float_to_float helper only needs to take care of raising 1657 * invalid exceptions and handling the conversion on NaNs. 1658 */ 1659 1660 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf, 1661 float_status *s) 1662 { 1663 if (dstf->arm_althp) { 1664 switch (a.cls) { 1665 case float_class_qnan: 1666 case float_class_snan: 1667 /* There is no NaN in the destination format. Raise Invalid 1668 * and return a zero with the sign of the input NaN. 1669 */ 1670 s->float_exception_flags |= float_flag_invalid; 1671 a.cls = float_class_zero; 1672 a.frac = 0; 1673 a.exp = 0; 1674 break; 1675 1676 case float_class_inf: 1677 /* There is no Inf in the destination format. Raise Invalid 1678 * and return the maximum normal with the correct sign. 1679 */ 1680 s->float_exception_flags |= float_flag_invalid; 1681 a.cls = float_class_normal; 1682 a.exp = dstf->exp_max; 1683 a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift; 1684 break; 1685 1686 default: 1687 break; 1688 } 1689 } else if (is_nan(a.cls)) { 1690 if (is_snan(a.cls)) { 1691 s->float_exception_flags |= float_flag_invalid; 1692 a = parts_silence_nan(a, s); 1693 } 1694 if (s->default_nan_mode) { 1695 return parts_default_nan(s); 1696 } 1697 } 1698 return a; 1699 } 1700 1701 float32 float16_to_float32(float16 a, bool ieee, float_status *s) 1702 { 1703 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1704 FloatParts p = float16a_unpack_canonical(a, s, fmt16); 1705 FloatParts pr = float_to_float(p, &float32_params, s); 1706 return float32_round_pack_canonical(pr, s); 1707 } 1708 1709 float64 float16_to_float64(float16 a, bool ieee, float_status *s) 1710 { 1711 const FloatFmt *fmt16 = ieee ? 
&float16_params : &float16_params_ahp; 1712 FloatParts p = float16a_unpack_canonical(a, s, fmt16); 1713 FloatParts pr = float_to_float(p, &float64_params, s); 1714 return float64_round_pack_canonical(pr, s); 1715 } 1716 1717 float16 float32_to_float16(float32 a, bool ieee, float_status *s) 1718 { 1719 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1720 FloatParts p = float32_unpack_canonical(a, s); 1721 FloatParts pr = float_to_float(p, fmt16, s); 1722 return float16a_round_pack_canonical(pr, s, fmt16); 1723 } 1724 1725 float64 float32_to_float64(float32 a, float_status *s) 1726 { 1727 FloatParts p = float32_unpack_canonical(a, s); 1728 FloatParts pr = float_to_float(p, &float64_params, s); 1729 return float64_round_pack_canonical(pr, s); 1730 } 1731 1732 float16 float64_to_float16(float64 a, bool ieee, float_status *s) 1733 { 1734 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1735 FloatParts p = float64_unpack_canonical(a, s); 1736 FloatParts pr = float_to_float(p, fmt16, s); 1737 return float16a_round_pack_canonical(pr, s, fmt16); 1738 } 1739 1740 float32 float64_to_float32(float64 a, float_status *s) 1741 { 1742 FloatParts p = float64_unpack_canonical(a, s); 1743 FloatParts pr = float_to_float(p, &float32_params, s); 1744 return float32_round_pack_canonical(pr, s); 1745 } 1746 1747 /* 1748 * Rounds the floating-point value `a' to an integer, and returns the 1749 * result as a floating-point value. The operation is performed 1750 * according to the IEC/IEEE Standard for Binary Floating-Point 1751 * Arithmetic. 
1752 */ 1753 1754 static FloatParts round_to_int(FloatParts a, int rmode, 1755 int scale, float_status *s) 1756 { 1757 switch (a.cls) { 1758 case float_class_qnan: 1759 case float_class_snan: 1760 return return_nan(a, s); 1761 1762 case float_class_zero: 1763 case float_class_inf: 1764 /* already "integral" */ 1765 break; 1766 1767 case float_class_normal: 1768 scale = MIN(MAX(scale, -0x10000), 0x10000); 1769 a.exp += scale; 1770 1771 if (a.exp >= DECOMPOSED_BINARY_POINT) { 1772 /* already integral */ 1773 break; 1774 } 1775 if (a.exp < 0) { 1776 bool one; 1777 /* all fractional */ 1778 s->float_exception_flags |= float_flag_inexact; 1779 switch (rmode) { 1780 case float_round_nearest_even: 1781 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT; 1782 break; 1783 case float_round_ties_away: 1784 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT; 1785 break; 1786 case float_round_to_zero: 1787 one = false; 1788 break; 1789 case float_round_up: 1790 one = !a.sign; 1791 break; 1792 case float_round_down: 1793 one = a.sign; 1794 break; 1795 default: 1796 g_assert_not_reached(); 1797 } 1798 1799 if (one) { 1800 a.frac = DECOMPOSED_IMPLICIT_BIT; 1801 a.exp = 0; 1802 } else { 1803 a.cls = float_class_zero; 1804 } 1805 } else { 1806 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp; 1807 uint64_t frac_lsbm1 = frac_lsb >> 1; 1808 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb; 1809 uint64_t rnd_mask = rnd_even_mask >> 1; 1810 uint64_t inc; 1811 1812 switch (rmode) { 1813 case float_round_nearest_even: 1814 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 1815 break; 1816 case float_round_ties_away: 1817 inc = frac_lsbm1; 1818 break; 1819 case float_round_to_zero: 1820 inc = 0; 1821 break; 1822 case float_round_up: 1823 inc = a.sign ? 0 : rnd_mask; 1824 break; 1825 case float_round_down: 1826 inc = a.sign ? 
rnd_mask : 0; 1827 break; 1828 default: 1829 g_assert_not_reached(); 1830 } 1831 1832 if (a.frac & rnd_mask) { 1833 s->float_exception_flags |= float_flag_inexact; 1834 a.frac += inc; 1835 a.frac &= ~rnd_mask; 1836 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 1837 a.frac >>= 1; 1838 a.exp++; 1839 } 1840 } 1841 } 1842 break; 1843 default: 1844 g_assert_not_reached(); 1845 } 1846 return a; 1847 } 1848 1849 float16 float16_round_to_int(float16 a, float_status *s) 1850 { 1851 FloatParts pa = float16_unpack_canonical(a, s); 1852 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 1853 return float16_round_pack_canonical(pr, s); 1854 } 1855 1856 float32 float32_round_to_int(float32 a, float_status *s) 1857 { 1858 FloatParts pa = float32_unpack_canonical(a, s); 1859 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 1860 return float32_round_pack_canonical(pr, s); 1861 } 1862 1863 float64 float64_round_to_int(float64 a, float_status *s) 1864 { 1865 FloatParts pa = float64_unpack_canonical(a, s); 1866 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 1867 return float64_round_pack_canonical(pr, s); 1868 } 1869 1870 /* 1871 * Returns the result of converting the floating-point value `a' to 1872 * the two's complement integer format. The conversion is performed 1873 * according to the IEC/IEEE Standard for Binary Floating-Point 1874 * Arithmetic---which means in particular that the conversion is 1875 * rounded according to the current rounding mode. If `a' is a NaN, 1876 * the largest positive integer is returned. Otherwise, if the 1877 * conversion overflows, the largest integer with the same sign as `a' 1878 * is returned. 
1879 */ 1880 1881 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale, 1882 int64_t min, int64_t max, 1883 float_status *s) 1884 { 1885 uint64_t r; 1886 int orig_flags = get_float_exception_flags(s); 1887 FloatParts p = round_to_int(in, rmode, scale, s); 1888 1889 switch (p.cls) { 1890 case float_class_snan: 1891 case float_class_qnan: 1892 s->float_exception_flags = orig_flags | float_flag_invalid; 1893 return max; 1894 case float_class_inf: 1895 s->float_exception_flags = orig_flags | float_flag_invalid; 1896 return p.sign ? min : max; 1897 case float_class_zero: 1898 return 0; 1899 case float_class_normal: 1900 if (p.exp < DECOMPOSED_BINARY_POINT) { 1901 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 1902 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 1903 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 1904 } else { 1905 r = UINT64_MAX; 1906 } 1907 if (p.sign) { 1908 if (r <= -(uint64_t) min) { 1909 return -r; 1910 } else { 1911 s->float_exception_flags = orig_flags | float_flag_invalid; 1912 return min; 1913 } 1914 } else { 1915 if (r <= max) { 1916 return r; 1917 } else { 1918 s->float_exception_flags = orig_flags | float_flag_invalid; 1919 return max; 1920 } 1921 } 1922 default: 1923 g_assert_not_reached(); 1924 } 1925 } 1926 1927 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale, 1928 float_status *s) 1929 { 1930 return round_to_int_and_pack(float16_unpack_canonical(a, s), 1931 rmode, scale, INT16_MIN, INT16_MAX, s); 1932 } 1933 1934 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale, 1935 float_status *s) 1936 { 1937 return round_to_int_and_pack(float16_unpack_canonical(a, s), 1938 rmode, scale, INT32_MIN, INT32_MAX, s); 1939 } 1940 1941 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale, 1942 float_status *s) 1943 { 1944 return round_to_int_and_pack(float16_unpack_canonical(a, s), 1945 rmode, scale, INT64_MIN, INT64_MAX, s); 1946 } 1947 1948 int16_t float32_to_int16_scalbn(float32 a, int 
rmode, int scale, 1949 float_status *s) 1950 { 1951 return round_to_int_and_pack(float32_unpack_canonical(a, s), 1952 rmode, scale, INT16_MIN, INT16_MAX, s); 1953 } 1954 1955 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale, 1956 float_status *s) 1957 { 1958 return round_to_int_and_pack(float32_unpack_canonical(a, s), 1959 rmode, scale, INT32_MIN, INT32_MAX, s); 1960 } 1961 1962 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale, 1963 float_status *s) 1964 { 1965 return round_to_int_and_pack(float32_unpack_canonical(a, s), 1966 rmode, scale, INT64_MIN, INT64_MAX, s); 1967 } 1968 1969 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale, 1970 float_status *s) 1971 { 1972 return round_to_int_and_pack(float64_unpack_canonical(a, s), 1973 rmode, scale, INT16_MIN, INT16_MAX, s); 1974 } 1975 1976 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale, 1977 float_status *s) 1978 { 1979 return round_to_int_and_pack(float64_unpack_canonical(a, s), 1980 rmode, scale, INT32_MIN, INT32_MAX, s); 1981 } 1982 1983 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale, 1984 float_status *s) 1985 { 1986 return round_to_int_and_pack(float64_unpack_canonical(a, s), 1987 rmode, scale, INT64_MIN, INT64_MAX, s); 1988 } 1989 1990 int16_t float16_to_int16(float16 a, float_status *s) 1991 { 1992 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 1993 } 1994 1995 int32_t float16_to_int32(float16 a, float_status *s) 1996 { 1997 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 1998 } 1999 2000 int64_t float16_to_int64(float16 a, float_status *s) 2001 { 2002 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2003 } 2004 2005 int16_t float32_to_int16(float32 a, float_status *s) 2006 { 2007 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2008 } 2009 2010 int32_t float32_to_int32(float32 a, float_status *s) 2011 { 2012 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, 
s); 2013 } 2014 2015 int64_t float32_to_int64(float32 a, float_status *s) 2016 { 2017 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2018 } 2019 2020 int16_t float64_to_int16(float64 a, float_status *s) 2021 { 2022 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2023 } 2024 2025 int32_t float64_to_int32(float64 a, float_status *s) 2026 { 2027 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2028 } 2029 2030 int64_t float64_to_int64(float64 a, float_status *s) 2031 { 2032 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2033 } 2034 2035 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s) 2036 { 2037 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s); 2038 } 2039 2040 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s) 2041 { 2042 return float16_to_int32_scalbn(a, float_round_to_zero, 0, s); 2043 } 2044 2045 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s) 2046 { 2047 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s); 2048 } 2049 2050 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s) 2051 { 2052 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s); 2053 } 2054 2055 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s) 2056 { 2057 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s); 2058 } 2059 2060 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s) 2061 { 2062 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s); 2063 } 2064 2065 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s) 2066 { 2067 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s); 2068 } 2069 2070 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s) 2071 { 2072 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s); 2073 } 2074 2075 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s) 2076 { 2077 return float64_to_int64_scalbn(a, 
float_round_to_zero, 0, s); 2078 } 2079 2080 /* 2081 * Returns the result of converting the floating-point value `a' to 2082 * the unsigned integer format. The conversion is performed according 2083 * to the IEC/IEEE Standard for Binary Floating-Point 2084 * Arithmetic---which means in particular that the conversion is 2085 * rounded according to the current rounding mode. If `a' is a NaN, 2086 * the largest unsigned integer is returned. Otherwise, if the 2087 * conversion overflows, the largest unsigned integer is returned. If 2088 * the 'a' is negative, the result is rounded and zero is returned; 2089 * values that do not round to zero will raise the inexact exception 2090 * flag. 2091 */ 2092 2093 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale, 2094 uint64_t max, float_status *s) 2095 { 2096 int orig_flags = get_float_exception_flags(s); 2097 FloatParts p = round_to_int(in, rmode, scale, s); 2098 uint64_t r; 2099 2100 switch (p.cls) { 2101 case float_class_snan: 2102 case float_class_qnan: 2103 s->float_exception_flags = orig_flags | float_flag_invalid; 2104 return max; 2105 case float_class_inf: 2106 s->float_exception_flags = orig_flags | float_flag_invalid; 2107 return p.sign ? 0 : max; 2108 case float_class_zero: 2109 return 0; 2110 case float_class_normal: 2111 if (p.sign) { 2112 s->float_exception_flags = orig_flags | float_flag_invalid; 2113 return 0; 2114 } 2115 2116 if (p.exp < DECOMPOSED_BINARY_POINT) { 2117 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2118 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 2119 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 2120 } else { 2121 s->float_exception_flags = orig_flags | float_flag_invalid; 2122 return max; 2123 } 2124 2125 /* For uint64 this will never trip, but if p.exp is too large 2126 * to shift a decomposed fraction we shall have exited via the 2127 * 3rd leg above. 
2128 */ 2129 if (r > max) { 2130 s->float_exception_flags = orig_flags | float_flag_invalid; 2131 return max; 2132 } 2133 return r; 2134 default: 2135 g_assert_not_reached(); 2136 } 2137 } 2138 2139 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale, 2140 float_status *s) 2141 { 2142 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2143 rmode, scale, UINT16_MAX, s); 2144 } 2145 2146 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale, 2147 float_status *s) 2148 { 2149 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2150 rmode, scale, UINT32_MAX, s); 2151 } 2152 2153 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale, 2154 float_status *s) 2155 { 2156 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2157 rmode, scale, UINT64_MAX, s); 2158 } 2159 2160 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale, 2161 float_status *s) 2162 { 2163 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2164 rmode, scale, UINT16_MAX, s); 2165 } 2166 2167 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale, 2168 float_status *s) 2169 { 2170 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2171 rmode, scale, UINT32_MAX, s); 2172 } 2173 2174 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale, 2175 float_status *s) 2176 { 2177 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2178 rmode, scale, UINT64_MAX, s); 2179 } 2180 2181 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale, 2182 float_status *s) 2183 { 2184 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2185 rmode, scale, UINT16_MAX, s); 2186 } 2187 2188 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale, 2189 float_status *s) 2190 { 2191 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2192 rmode, scale, UINT32_MAX, s); 2193 } 2194 2195 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale, 2196 
float_status *s) 2197 { 2198 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2199 rmode, scale, UINT64_MAX, s); 2200 } 2201 2202 uint16_t float16_to_uint16(float16 a, float_status *s) 2203 { 2204 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2205 } 2206 2207 uint32_t float16_to_uint32(float16 a, float_status *s) 2208 { 2209 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2210 } 2211 2212 uint64_t float16_to_uint64(float16 a, float_status *s) 2213 { 2214 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2215 } 2216 2217 uint16_t float32_to_uint16(float32 a, float_status *s) 2218 { 2219 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2220 } 2221 2222 uint32_t float32_to_uint32(float32 a, float_status *s) 2223 { 2224 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2225 } 2226 2227 uint64_t float32_to_uint64(float32 a, float_status *s) 2228 { 2229 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2230 } 2231 2232 uint16_t float64_to_uint16(float64 a, float_status *s) 2233 { 2234 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2235 } 2236 2237 uint32_t float64_to_uint32(float64 a, float_status *s) 2238 { 2239 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2240 } 2241 2242 uint64_t float64_to_uint64(float64 a, float_status *s) 2243 { 2244 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2245 } 2246 2247 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s) 2248 { 2249 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2250 } 2251 2252 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s) 2253 { 2254 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2255 } 2256 2257 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s) 2258 { 2259 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2260 } 2261 2262 uint16_t 
float32_to_uint16_round_to_zero(float32 a, float_status *s) 2263 { 2264 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2265 } 2266 2267 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s) 2268 { 2269 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2270 } 2271 2272 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s) 2273 { 2274 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2275 } 2276 2277 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s) 2278 { 2279 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2280 } 2281 2282 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s) 2283 { 2284 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2285 } 2286 2287 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s) 2288 { 2289 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2290 } 2291 2292 /* 2293 * Integer to float conversions 2294 * 2295 * Returns the result of converting the two's complement integer `a' 2296 * to the floating-point format. The conversion is performed according 2297 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2298 */ 2299 2300 static FloatParts int_to_float(int64_t a, int scale, float_status *status) 2301 { 2302 FloatParts r = { .sign = false }; 2303 2304 if (a == 0) { 2305 r.cls = float_class_zero; 2306 } else { 2307 uint64_t f = a; 2308 int shift; 2309 2310 r.cls = float_class_normal; 2311 if (a < 0) { 2312 f = -f; 2313 r.sign = true; 2314 } 2315 shift = clz64(f) - 1; 2316 scale = MIN(MAX(scale, -0x10000), 0x10000); 2317 2318 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2319 r.frac = (shift < 0 ? 
DECOMPOSED_IMPLICIT_BIT : f << shift); 2320 } 2321 2322 return r; 2323 } 2324 2325 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status) 2326 { 2327 FloatParts pa = int_to_float(a, scale, status); 2328 return float16_round_pack_canonical(pa, status); 2329 } 2330 2331 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status) 2332 { 2333 return int64_to_float16_scalbn(a, scale, status); 2334 } 2335 2336 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status) 2337 { 2338 return int64_to_float16_scalbn(a, scale, status); 2339 } 2340 2341 float16 int64_to_float16(int64_t a, float_status *status) 2342 { 2343 return int64_to_float16_scalbn(a, 0, status); 2344 } 2345 2346 float16 int32_to_float16(int32_t a, float_status *status) 2347 { 2348 return int64_to_float16_scalbn(a, 0, status); 2349 } 2350 2351 float16 int16_to_float16(int16_t a, float_status *status) 2352 { 2353 return int64_to_float16_scalbn(a, 0, status); 2354 } 2355 2356 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status) 2357 { 2358 FloatParts pa = int_to_float(a, scale, status); 2359 return float32_round_pack_canonical(pa, status); 2360 } 2361 2362 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status) 2363 { 2364 return int64_to_float32_scalbn(a, scale, status); 2365 } 2366 2367 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status) 2368 { 2369 return int64_to_float32_scalbn(a, scale, status); 2370 } 2371 2372 float32 int64_to_float32(int64_t a, float_status *status) 2373 { 2374 return int64_to_float32_scalbn(a, 0, status); 2375 } 2376 2377 float32 int32_to_float32(int32_t a, float_status *status) 2378 { 2379 return int64_to_float32_scalbn(a, 0, status); 2380 } 2381 2382 float32 int16_to_float32(int16_t a, float_status *status) 2383 { 2384 return int64_to_float32_scalbn(a, 0, status); 2385 } 2386 2387 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status) 2388 { 
2389 FloatParts pa = int_to_float(a, scale, status); 2390 return float64_round_pack_canonical(pa, status); 2391 } 2392 2393 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status) 2394 { 2395 return int64_to_float64_scalbn(a, scale, status); 2396 } 2397 2398 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status) 2399 { 2400 return int64_to_float64_scalbn(a, scale, status); 2401 } 2402 2403 float64 int64_to_float64(int64_t a, float_status *status) 2404 { 2405 return int64_to_float64_scalbn(a, 0, status); 2406 } 2407 2408 float64 int32_to_float64(int32_t a, float_status *status) 2409 { 2410 return int64_to_float64_scalbn(a, 0, status); 2411 } 2412 2413 float64 int16_to_float64(int16_t a, float_status *status) 2414 { 2415 return int64_to_float64_scalbn(a, 0, status); 2416 } 2417 2418 2419 /* 2420 * Unsigned Integer to float conversions 2421 * 2422 * Returns the result of converting the unsigned integer `a' to the 2423 * floating-point format. The conversion is performed according to the 2424 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2425 */ 2426 2427 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status) 2428 { 2429 FloatParts r = { .sign = false }; 2430 2431 if (a == 0) { 2432 r.cls = float_class_zero; 2433 } else { 2434 scale = MIN(MAX(scale, -0x10000), 0x10000); 2435 r.cls = float_class_normal; 2436 if ((int64_t)a < 0) { 2437 r.exp = DECOMPOSED_BINARY_POINT + 1 + scale; 2438 shift64RightJamming(a, 1, &a); 2439 r.frac = a; 2440 } else { 2441 int shift = clz64(a) - 1; 2442 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2443 r.frac = a << shift; 2444 } 2445 } 2446 2447 return r; 2448 } 2449 2450 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status) 2451 { 2452 FloatParts pa = uint_to_float(a, scale, status); 2453 return float16_round_pack_canonical(pa, status); 2454 } 2455 2456 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status) 2457 { 2458 return uint64_to_float16_scalbn(a, scale, status); 2459 } 2460 2461 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status) 2462 { 2463 return uint64_to_float16_scalbn(a, scale, status); 2464 } 2465 2466 float16 uint64_to_float16(uint64_t a, float_status *status) 2467 { 2468 return uint64_to_float16_scalbn(a, 0, status); 2469 } 2470 2471 float16 uint32_to_float16(uint32_t a, float_status *status) 2472 { 2473 return uint64_to_float16_scalbn(a, 0, status); 2474 } 2475 2476 float16 uint16_to_float16(uint16_t a, float_status *status) 2477 { 2478 return uint64_to_float16_scalbn(a, 0, status); 2479 } 2480 2481 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status) 2482 { 2483 FloatParts pa = uint_to_float(a, scale, status); 2484 return float32_round_pack_canonical(pa, status); 2485 } 2486 2487 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status) 2488 { 2489 return uint64_to_float32_scalbn(a, scale, status); 2490 } 2491 2492 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status) 2493 { 2494 return 
uint64_to_float32_scalbn(a, scale, status); 2495 } 2496 2497 float32 uint64_to_float32(uint64_t a, float_status *status) 2498 { 2499 return uint64_to_float32_scalbn(a, 0, status); 2500 } 2501 2502 float32 uint32_to_float32(uint32_t a, float_status *status) 2503 { 2504 return uint64_to_float32_scalbn(a, 0, status); 2505 } 2506 2507 float32 uint16_to_float32(uint16_t a, float_status *status) 2508 { 2509 return uint64_to_float32_scalbn(a, 0, status); 2510 } 2511 2512 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status) 2513 { 2514 FloatParts pa = uint_to_float(a, scale, status); 2515 return float64_round_pack_canonical(pa, status); 2516 } 2517 2518 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status) 2519 { 2520 return uint64_to_float64_scalbn(a, scale, status); 2521 } 2522 2523 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status) 2524 { 2525 return uint64_to_float64_scalbn(a, scale, status); 2526 } 2527 2528 float64 uint64_to_float64(uint64_t a, float_status *status) 2529 { 2530 return uint64_to_float64_scalbn(a, 0, status); 2531 } 2532 2533 float64 uint32_to_float64(uint32_t a, float_status *status) 2534 { 2535 return uint64_to_float64_scalbn(a, 0, status); 2536 } 2537 2538 float64 uint16_to_float64(uint16_t a, float_status *status) 2539 { 2540 return uint64_to_float64_scalbn(a, 0, status); 2541 } 2542 2543 /* Float Min/Max */ 2544 /* min() and max() functions. These can't be implemented as 2545 * 'compare and pick one input' because that would mishandle 2546 * NaNs and +0 vs -0. 2547 * 2548 * minnum() and maxnum() functions. These are similar to the min() 2549 * and max() functions but if one of the arguments is a QNaN and 2550 * the other is numerical then the numerical argument is returned. 2551 * SNaNs will get quietened before being returned. 2552 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 2553 * and maxNum() operations. 
min() and max() are the typical min/max 2554 * semantics provided by many CPUs which predate that specification. 2555 * 2556 * minnummag() and maxnummag() functions correspond to minNumMag() 2557 * and minNumMag() from the IEEE-754 2008. 2558 */ 2559 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin, 2560 bool ieee, bool ismag, float_status *s) 2561 { 2562 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 2563 if (ieee) { 2564 /* Takes two floating-point values `a' and `b', one of 2565 * which is a NaN, and returns the appropriate NaN 2566 * result. If either `a' or `b' is a signaling NaN, 2567 * the invalid exception is raised. 2568 */ 2569 if (is_snan(a.cls) || is_snan(b.cls)) { 2570 return pick_nan(a, b, s); 2571 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 2572 return b; 2573 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 2574 return a; 2575 } 2576 } 2577 return pick_nan(a, b, s); 2578 } else { 2579 int a_exp, b_exp; 2580 2581 switch (a.cls) { 2582 case float_class_normal: 2583 a_exp = a.exp; 2584 break; 2585 case float_class_inf: 2586 a_exp = INT_MAX; 2587 break; 2588 case float_class_zero: 2589 a_exp = INT_MIN; 2590 break; 2591 default: 2592 g_assert_not_reached(); 2593 break; 2594 } 2595 switch (b.cls) { 2596 case float_class_normal: 2597 b_exp = b.exp; 2598 break; 2599 case float_class_inf: 2600 b_exp = INT_MAX; 2601 break; 2602 case float_class_zero: 2603 b_exp = INT_MIN; 2604 break; 2605 default: 2606 g_assert_not_reached(); 2607 break; 2608 } 2609 2610 if (ismag && (a_exp != b_exp || a.frac != b.frac)) { 2611 bool a_less = a_exp < b_exp; 2612 if (a_exp == b_exp) { 2613 a_less = a.frac < b.frac; 2614 } 2615 return a_less ^ ismin ? b : a; 2616 } 2617 2618 if (a.sign == b.sign) { 2619 bool a_less = a_exp < b_exp; 2620 if (a_exp == b_exp) { 2621 a_less = a.frac < b.frac; 2622 } 2623 return a.sign ^ a_less ^ ismin ? b : a; 2624 } else { 2625 return a.sign ^ ismin ? 
b : a; 2626 } 2627 } 2628 } 2629 2630 #define MINMAX(sz, name, ismin, isiee, ismag) \ 2631 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 2632 float_status *s) \ 2633 { \ 2634 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2635 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2636 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 2637 \ 2638 return float ## sz ## _round_pack_canonical(pr, s); \ 2639 } 2640 2641 MINMAX(16, min, true, false, false) 2642 MINMAX(16, minnum, true, true, false) 2643 MINMAX(16, minnummag, true, true, true) 2644 MINMAX(16, max, false, false, false) 2645 MINMAX(16, maxnum, false, true, false) 2646 MINMAX(16, maxnummag, false, true, true) 2647 2648 MINMAX(32, min, true, false, false) 2649 MINMAX(32, minnum, true, true, false) 2650 MINMAX(32, minnummag, true, true, true) 2651 MINMAX(32, max, false, false, false) 2652 MINMAX(32, maxnum, false, true, false) 2653 MINMAX(32, maxnummag, false, true, true) 2654 2655 MINMAX(64, min, true, false, false) 2656 MINMAX(64, minnum, true, true, false) 2657 MINMAX(64, minnummag, true, true, true) 2658 MINMAX(64, max, false, false, false) 2659 MINMAX(64, maxnum, false, true, false) 2660 MINMAX(64, maxnummag, false, true, true) 2661 2662 #undef MINMAX 2663 2664 /* Floating point compare */ 2665 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet, 2666 float_status *s) 2667 { 2668 if (is_nan(a.cls) || is_nan(b.cls)) { 2669 if (!is_quiet || 2670 a.cls == float_class_snan || 2671 b.cls == float_class_snan) { 2672 s->float_exception_flags |= float_flag_invalid; 2673 } 2674 return float_relation_unordered; 2675 } 2676 2677 if (a.cls == float_class_zero) { 2678 if (b.cls == float_class_zero) { 2679 return float_relation_equal; 2680 } 2681 return b.sign ? float_relation_greater : float_relation_less; 2682 } else if (b.cls == float_class_zero) { 2683 return a.sign ? 
float_relation_less : float_relation_greater; 2684 } 2685 2686 /* The only really important thing about infinity is its sign. If 2687 * both are infinities the sign marks the smallest of the two. 2688 */ 2689 if (a.cls == float_class_inf) { 2690 if ((b.cls == float_class_inf) && (a.sign == b.sign)) { 2691 return float_relation_equal; 2692 } 2693 return a.sign ? float_relation_less : float_relation_greater; 2694 } else if (b.cls == float_class_inf) { 2695 return b.sign ? float_relation_greater : float_relation_less; 2696 } 2697 2698 if (a.sign != b.sign) { 2699 return a.sign ? float_relation_less : float_relation_greater; 2700 } 2701 2702 if (a.exp == b.exp) { 2703 if (a.frac == b.frac) { 2704 return float_relation_equal; 2705 } 2706 if (a.sign) { 2707 return a.frac > b.frac ? 2708 float_relation_less : float_relation_greater; 2709 } else { 2710 return a.frac > b.frac ? 2711 float_relation_greater : float_relation_less; 2712 } 2713 } else { 2714 if (a.sign) { 2715 return a.exp > b.exp ? float_relation_less : float_relation_greater; 2716 } else { 2717 return a.exp > b.exp ? float_relation_greater : float_relation_less; 2718 } 2719 } 2720 } 2721 2722 #define COMPARE(sz) \ 2723 int float ## sz ## _compare(float ## sz a, float ## sz b, \ 2724 float_status *s) \ 2725 { \ 2726 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2727 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2728 return compare_floats(pa, pb, false, s); \ 2729 } \ 2730 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \ 2731 float_status *s) \ 2732 { \ 2733 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2734 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2735 return compare_floats(pa, pb, true, s); \ 2736 } 2737 2738 COMPARE(16) 2739 COMPARE(32) 2740 COMPARE(64) 2741 2742 #undef COMPARE 2743 2744 /* Multiply A by 2 raised to the power N. 
*/ 2745 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s) 2746 { 2747 if (unlikely(is_nan(a.cls))) { 2748 return return_nan(a, s); 2749 } 2750 if (a.cls == float_class_normal) { 2751 /* The largest float type (even though not supported by FloatParts) 2752 * is float128, which has a 15 bit exponent. Bounding N to 16 bits 2753 * still allows rounding to infinity, without allowing overflow 2754 * within the int32_t that backs FloatParts.exp. 2755 */ 2756 n = MIN(MAX(n, -0x10000), 0x10000); 2757 a.exp += n; 2758 } 2759 return a; 2760 } 2761 2762 float16 float16_scalbn(float16 a, int n, float_status *status) 2763 { 2764 FloatParts pa = float16_unpack_canonical(a, status); 2765 FloatParts pr = scalbn_decomposed(pa, n, status); 2766 return float16_round_pack_canonical(pr, status); 2767 } 2768 2769 float32 float32_scalbn(float32 a, int n, float_status *status) 2770 { 2771 FloatParts pa = float32_unpack_canonical(a, status); 2772 FloatParts pr = scalbn_decomposed(pa, n, status); 2773 return float32_round_pack_canonical(pr, status); 2774 } 2775 2776 float64 float64_scalbn(float64 a, int n, float_status *status) 2777 { 2778 FloatParts pa = float64_unpack_canonical(a, status); 2779 FloatParts pr = scalbn_decomposed(pa, n, status); 2780 return float64_round_pack_canonical(pr, status); 2781 } 2782 2783 /* 2784 * Square Root 2785 * 2786 * The old softfloat code did an approximation step before zeroing in 2787 * on the final result. However for simpleness we just compute the 2788 * square root by iterating down from the implicit bit to enough extra 2789 * bits to ensure we get a correctly rounded result. 2790 * 2791 * This does mean however the calculation is slower than before, 2792 * especially for 64 bit floats. 
2793 */ 2794 2795 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p) 2796 { 2797 uint64_t a_frac, r_frac, s_frac; 2798 int bit, last_bit; 2799 2800 if (is_nan(a.cls)) { 2801 return return_nan(a, s); 2802 } 2803 if (a.cls == float_class_zero) { 2804 return a; /* sqrt(+-0) = +-0 */ 2805 } 2806 if (a.sign) { 2807 s->float_exception_flags |= float_flag_invalid; 2808 return parts_default_nan(s); 2809 } 2810 if (a.cls == float_class_inf) { 2811 return a; /* sqrt(+inf) = +inf */ 2812 } 2813 2814 assert(a.cls == float_class_normal); 2815 2816 /* We need two overflow bits at the top. Adding room for that is a 2817 * right shift. If the exponent is odd, we can discard the low bit 2818 * by multiplying the fraction by 2; that's a left shift. Combine 2819 * those and we shift right if the exponent is even. 2820 */ 2821 a_frac = a.frac; 2822 if (!(a.exp & 1)) { 2823 a_frac >>= 1; 2824 } 2825 a.exp >>= 1; 2826 2827 /* Bit-by-bit computation of sqrt. */ 2828 r_frac = 0; 2829 s_frac = 0; 2830 2831 /* Iterate from implicit bit down to the 3 extra bits to compute a 2832 * properly rounded result. Remember we've inserted one more bit 2833 * at the top, so these positions are one less. 2834 */ 2835 bit = DECOMPOSED_BINARY_POINT - 1; 2836 last_bit = MAX(p->frac_shift - 4, 0); 2837 do { 2838 uint64_t q = 1ULL << bit; 2839 uint64_t t_frac = s_frac + q; 2840 if (t_frac <= a_frac) { 2841 s_frac = t_frac + q; 2842 a_frac -= t_frac; 2843 r_frac += q; 2844 } 2845 a_frac <<= 1; 2846 } while (--bit >= last_bit); 2847 2848 /* Undo the right shift done above. If there is any remaining 2849 * fraction, the result is inexact. Set the sticky bit. 
2850 */ 2851 a.frac = (r_frac << 1) + (a_frac != 0); 2852 2853 return a; 2854 } 2855 2856 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status) 2857 { 2858 FloatParts pa = float16_unpack_canonical(a, status); 2859 FloatParts pr = sqrt_float(pa, status, &float16_params); 2860 return float16_round_pack_canonical(pr, status); 2861 } 2862 2863 float32 QEMU_FLATTEN float32_sqrt(float32 a, float_status *status) 2864 { 2865 FloatParts pa = float32_unpack_canonical(a, status); 2866 FloatParts pr = sqrt_float(pa, status, &float32_params); 2867 return float32_round_pack_canonical(pr, status); 2868 } 2869 2870 float64 QEMU_FLATTEN float64_sqrt(float64 a, float_status *status) 2871 { 2872 FloatParts pa = float64_unpack_canonical(a, status); 2873 FloatParts pr = sqrt_float(pa, status, &float64_params); 2874 return float64_round_pack_canonical(pr, status); 2875 } 2876 2877 /*---------------------------------------------------------------------------- 2878 | The pattern for a default generated NaN. 2879 *----------------------------------------------------------------------------*/ 2880 2881 float16 float16_default_nan(float_status *status) 2882 { 2883 FloatParts p = parts_default_nan(status); 2884 p.frac >>= float16_params.frac_shift; 2885 return float16_pack_raw(p); 2886 } 2887 2888 float32 float32_default_nan(float_status *status) 2889 { 2890 FloatParts p = parts_default_nan(status); 2891 p.frac >>= float32_params.frac_shift; 2892 return float32_pack_raw(p); 2893 } 2894 2895 float64 float64_default_nan(float_status *status) 2896 { 2897 FloatParts p = parts_default_nan(status); 2898 p.frac >>= float64_params.frac_shift; 2899 return float64_pack_raw(p); 2900 } 2901 2902 float128 float128_default_nan(float_status *status) 2903 { 2904 FloatParts p = parts_default_nan(status); 2905 float128 r; 2906 2907 /* Extrapolate from the choices made by parts_default_nan to fill 2908 * in the quad-floating format. 
If the low bit is set, assume we 2909 * want to set all non-snan bits. 2910 */ 2911 r.low = -(p.frac & 1); 2912 r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48); 2913 r.high |= LIT64(0x7FFF000000000000); 2914 r.high |= (uint64_t)p.sign << 63; 2915 2916 return r; 2917 } 2918 2919 /*---------------------------------------------------------------------------- 2920 | Returns a quiet NaN from a signalling NaN for the floating point value `a'. 2921 *----------------------------------------------------------------------------*/ 2922 2923 float16 float16_silence_nan(float16 a, float_status *status) 2924 { 2925 FloatParts p = float16_unpack_raw(a); 2926 p.frac <<= float16_params.frac_shift; 2927 p = parts_silence_nan(p, status); 2928 p.frac >>= float16_params.frac_shift; 2929 return float16_pack_raw(p); 2930 } 2931 2932 float32 float32_silence_nan(float32 a, float_status *status) 2933 { 2934 FloatParts p = float32_unpack_raw(a); 2935 p.frac <<= float32_params.frac_shift; 2936 p = parts_silence_nan(p, status); 2937 p.frac >>= float32_params.frac_shift; 2938 return float32_pack_raw(p); 2939 } 2940 2941 float64 float64_silence_nan(float64 a, float_status *status) 2942 { 2943 FloatParts p = float64_unpack_raw(a); 2944 p.frac <<= float64_params.frac_shift; 2945 p = parts_silence_nan(p, status); 2946 p.frac >>= float64_params.frac_shift; 2947 return float64_pack_raw(p); 2948 } 2949 2950 /*---------------------------------------------------------------------------- 2951 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 2952 | and 7, and returns the properly rounded 32-bit integer corresponding to the 2953 | input. If `zSign' is 1, the input is negated before being converted to an 2954 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 2955 | is simply rounded to an integer, with the inexact exception raised if the 2956 | input cannot be represented exactly as an integer. 
However, if the fixed- 2957 | point input is too large, the invalid exception is raised and the largest 2958 | positive or negative integer is returned. 2959 *----------------------------------------------------------------------------*/ 2960 2961 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status) 2962 { 2963 int8_t roundingMode; 2964 flag roundNearestEven; 2965 int8_t roundIncrement, roundBits; 2966 int32_t z; 2967 2968 roundingMode = status->float_rounding_mode; 2969 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2970 switch (roundingMode) { 2971 case float_round_nearest_even: 2972 case float_round_ties_away: 2973 roundIncrement = 0x40; 2974 break; 2975 case float_round_to_zero: 2976 roundIncrement = 0; 2977 break; 2978 case float_round_up: 2979 roundIncrement = zSign ? 0 : 0x7f; 2980 break; 2981 case float_round_down: 2982 roundIncrement = zSign ? 0x7f : 0; 2983 break; 2984 default: 2985 abort(); 2986 } 2987 roundBits = absZ & 0x7F; 2988 absZ = ( absZ + roundIncrement )>>7; 2989 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 2990 z = absZ; 2991 if ( zSign ) z = - z; 2992 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 2993 float_raise(float_flag_invalid, status); 2994 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 2995 } 2996 if (roundBits) { 2997 status->float_exception_flags |= float_flag_inexact; 2998 } 2999 return z; 3000 3001 } 3002 3003 /*---------------------------------------------------------------------------- 3004 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 3005 | `absZ1', with binary point between bits 63 and 64 (between the input words), 3006 | and returns the properly rounded 64-bit integer corresponding to the input. 3007 | If `zSign' is 1, the input is negated before being converted to an integer. 
3008 | Ordinarily, the fixed-point input is simply rounded to an integer, with 3009 | the inexact exception raised if the input cannot be represented exactly as 3010 | an integer. However, if the fixed-point input is too large, the invalid 3011 | exception is raised and the largest positive or negative integer is 3012 | returned. 3013 *----------------------------------------------------------------------------*/ 3014 3015 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 3016 float_status *status) 3017 { 3018 int8_t roundingMode; 3019 flag roundNearestEven, increment; 3020 int64_t z; 3021 3022 roundingMode = status->float_rounding_mode; 3023 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3024 switch (roundingMode) { 3025 case float_round_nearest_even: 3026 case float_round_ties_away: 3027 increment = ((int64_t) absZ1 < 0); 3028 break; 3029 case float_round_to_zero: 3030 increment = 0; 3031 break; 3032 case float_round_up: 3033 increment = !zSign && absZ1; 3034 break; 3035 case float_round_down: 3036 increment = zSign && absZ1; 3037 break; 3038 default: 3039 abort(); 3040 } 3041 if ( increment ) { 3042 ++absZ0; 3043 if ( absZ0 == 0 ) goto overflow; 3044 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 3045 } 3046 z = absZ0; 3047 if ( zSign ) z = - z; 3048 if ( z && ( ( z < 0 ) ^ zSign ) ) { 3049 overflow: 3050 float_raise(float_flag_invalid, status); 3051 return 3052 zSign ? 
(int64_t) LIT64( 0x8000000000000000 ) 3053 : LIT64( 0x7FFFFFFFFFFFFFFF ); 3054 } 3055 if (absZ1) { 3056 status->float_exception_flags |= float_flag_inexact; 3057 } 3058 return z; 3059 3060 } 3061 3062 /*---------------------------------------------------------------------------- 3063 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 3064 | `absZ1', with binary point between bits 63 and 64 (between the input words), 3065 | and returns the properly rounded 64-bit unsigned integer corresponding to the 3066 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 3067 | with the inexact exception raised if the input cannot be represented exactly 3068 | as an integer. However, if the fixed-point input is too large, the invalid 3069 | exception is raised and the largest unsigned integer is returned. 3070 *----------------------------------------------------------------------------*/ 3071 3072 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0, 3073 uint64_t absZ1, float_status *status) 3074 { 3075 int8_t roundingMode; 3076 flag roundNearestEven, increment; 3077 3078 roundingMode = status->float_rounding_mode; 3079 roundNearestEven = (roundingMode == float_round_nearest_even); 3080 switch (roundingMode) { 3081 case float_round_nearest_even: 3082 case float_round_ties_away: 3083 increment = ((int64_t)absZ1 < 0); 3084 break; 3085 case float_round_to_zero: 3086 increment = 0; 3087 break; 3088 case float_round_up: 3089 increment = !zSign && absZ1; 3090 break; 3091 case float_round_down: 3092 increment = zSign && absZ1; 3093 break; 3094 default: 3095 abort(); 3096 } 3097 if (increment) { 3098 ++absZ0; 3099 if (absZ0 == 0) { 3100 float_raise(float_flag_invalid, status); 3101 return LIT64(0xFFFFFFFFFFFFFFFF); 3102 } 3103 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 3104 } 3105 3106 if (zSign && absZ0) { 3107 float_raise(float_flag_invalid, status); 3108 return 0; 3109 } 3110 3111 if (absZ1) { 3112 
status->float_exception_flags |= float_flag_inexact; 3113 } 3114 return absZ0; 3115 } 3116 3117 /*---------------------------------------------------------------------------- 3118 | If `a' is denormal and we are in flush-to-zero mode then set the 3119 | input-denormal exception and return zero. Otherwise just return the value. 3120 *----------------------------------------------------------------------------*/ 3121 float32 float32_squash_input_denormal(float32 a, float_status *status) 3122 { 3123 if (status->flush_inputs_to_zero) { 3124 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 3125 float_raise(float_flag_input_denormal, status); 3126 return make_float32(float32_val(a) & 0x80000000); 3127 } 3128 } 3129 return a; 3130 } 3131 3132 /*---------------------------------------------------------------------------- 3133 | Normalizes the subnormal single-precision floating-point value represented 3134 | by the denormalized significand `aSig'. The normalized exponent and 3135 | significand are stored at the locations pointed to by `zExpPtr' and 3136 | `zSigPtr', respectively. 3137 *----------------------------------------------------------------------------*/ 3138 3139 static void 3140 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 3141 { 3142 int8_t shiftCount; 3143 3144 shiftCount = clz32(aSig) - 8; 3145 *zSigPtr = aSig<<shiftCount; 3146 *zExpPtr = 1 - shiftCount; 3147 3148 } 3149 3150 /*---------------------------------------------------------------------------- 3151 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3152 | and significand `zSig', and returns the proper single-precision floating- 3153 | point value corresponding to the abstract input. Ordinarily, the abstract 3154 | value is simply rounded and packed into the single-precision format, with 3155 | the inexact exception raised if the abstract input cannot be represented 3156 | exactly. 
However, if the abstract value is too large, the overflow and 3157 | inexact exceptions are raised and an infinity or maximal finite value is 3158 | returned. If the abstract value is too small, the input value is rounded to 3159 | a subnormal number, and the underflow and inexact exceptions are raised if 3160 | the abstract input cannot be represented exactly as a subnormal single- 3161 | precision floating-point number. 3162 | The input significand `zSig' has its binary point between bits 30 3163 | and 29, which is 7 bits to the left of the usual location. This shifted 3164 | significand must be normalized or smaller. If `zSig' is not normalized, 3165 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3166 | and it must not require rounding. In the usual case that `zSig' is 3167 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3168 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3169 | Binary Floating-Point Arithmetic. 3170 *----------------------------------------------------------------------------*/ 3171 3172 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 3173 float_status *status) 3174 { 3175 int8_t roundingMode; 3176 flag roundNearestEven; 3177 int8_t roundIncrement, roundBits; 3178 flag isTiny; 3179 3180 roundingMode = status->float_rounding_mode; 3181 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3182 switch (roundingMode) { 3183 case float_round_nearest_even: 3184 case float_round_ties_away: 3185 roundIncrement = 0x40; 3186 break; 3187 case float_round_to_zero: 3188 roundIncrement = 0; 3189 break; 3190 case float_round_up: 3191 roundIncrement = zSign ? 0 : 0x7f; 3192 break; 3193 case float_round_down: 3194 roundIncrement = zSign ? 
0x7f : 0; 3195 break; 3196 default: 3197 abort(); 3198 break; 3199 } 3200 roundBits = zSig & 0x7F; 3201 if ( 0xFD <= (uint16_t) zExp ) { 3202 if ( ( 0xFD < zExp ) 3203 || ( ( zExp == 0xFD ) 3204 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 3205 ) { 3206 float_raise(float_flag_overflow | float_flag_inexact, status); 3207 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 3208 } 3209 if ( zExp < 0 ) { 3210 if (status->flush_to_zero) { 3211 float_raise(float_flag_output_denormal, status); 3212 return packFloat32(zSign, 0, 0); 3213 } 3214 isTiny = 3215 (status->float_detect_tininess 3216 == float_tininess_before_rounding) 3217 || ( zExp < -1 ) 3218 || ( zSig + roundIncrement < 0x80000000 ); 3219 shift32RightJamming( zSig, - zExp, &zSig ); 3220 zExp = 0; 3221 roundBits = zSig & 0x7F; 3222 if (isTiny && roundBits) { 3223 float_raise(float_flag_underflow, status); 3224 } 3225 } 3226 } 3227 if (roundBits) { 3228 status->float_exception_flags |= float_flag_inexact; 3229 } 3230 zSig = ( zSig + roundIncrement )>>7; 3231 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 3232 if ( zSig == 0 ) zExp = 0; 3233 return packFloat32( zSign, zExp, zSig ); 3234 3235 } 3236 3237 /*---------------------------------------------------------------------------- 3238 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3239 | and significand `zSig', and returns the proper single-precision floating- 3240 | point value corresponding to the abstract input. This routine is just like 3241 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 3242 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 3243 | floating-point exponent. 
3244 *----------------------------------------------------------------------------*/ 3245 3246 static float32 3247 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 3248 float_status *status) 3249 { 3250 int8_t shiftCount; 3251 3252 shiftCount = clz32(zSig) - 1; 3253 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 3254 status); 3255 3256 } 3257 3258 /*---------------------------------------------------------------------------- 3259 | If `a' is denormal and we are in flush-to-zero mode then set the 3260 | input-denormal exception and return zero. Otherwise just return the value. 3261 *----------------------------------------------------------------------------*/ 3262 float64 float64_squash_input_denormal(float64 a, float_status *status) 3263 { 3264 if (status->flush_inputs_to_zero) { 3265 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 3266 float_raise(float_flag_input_denormal, status); 3267 return make_float64(float64_val(a) & (1ULL << 63)); 3268 } 3269 } 3270 return a; 3271 } 3272 3273 /*---------------------------------------------------------------------------- 3274 | Normalizes the subnormal double-precision floating-point value represented 3275 | by the denormalized significand `aSig'. The normalized exponent and 3276 | significand are stored at the locations pointed to by `zExpPtr' and 3277 | `zSigPtr', respectively. 3278 *----------------------------------------------------------------------------*/ 3279 3280 static void 3281 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 3282 { 3283 int8_t shiftCount; 3284 3285 shiftCount = clz64(aSig) - 11; 3286 *zSigPtr = aSig<<shiftCount; 3287 *zExpPtr = 1 - shiftCount; 3288 3289 } 3290 3291 /*---------------------------------------------------------------------------- 3292 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3293 | double-precision floating-point value, returning the result. 
After being 3294 | shifted into the proper positions, the three fields are simply added 3295 | together to form the result. This means that any integer portion of `zSig' 3296 | will be added into the exponent. Since a properly normalized significand 3297 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3298 | than the desired result exponent whenever `zSig' is a complete, normalized 3299 | significand. 3300 *----------------------------------------------------------------------------*/ 3301 3302 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 3303 { 3304 3305 return make_float64( 3306 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 3307 3308 } 3309 3310 /*---------------------------------------------------------------------------- 3311 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3312 | and significand `zSig', and returns the proper double-precision floating- 3313 | point value corresponding to the abstract input. Ordinarily, the abstract 3314 | value is simply rounded and packed into the double-precision format, with 3315 | the inexact exception raised if the abstract input cannot be represented 3316 | exactly. However, if the abstract value is too large, the overflow and 3317 | inexact exceptions are raised and an infinity or maximal finite value is 3318 | returned. If the abstract value is too small, the input value is rounded to 3319 | a subnormal number, and the underflow and inexact exceptions are raised if 3320 | the abstract input cannot be represented exactly as a subnormal double- 3321 | precision floating-point number. 3322 | The input significand `zSig' has its binary point between bits 62 3323 | and 61, which is 10 bits to the left of the usual location. This shifted 3324 | significand must be normalized or smaller. 
If `zSig' is not normalized, 3325 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3326 | and it must not require rounding. In the usual case that `zSig' is 3327 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3328 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3329 | Binary Floating-Point Arithmetic. 3330 *----------------------------------------------------------------------------*/ 3331 3332 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 3333 float_status *status) 3334 { 3335 int8_t roundingMode; 3336 flag roundNearestEven; 3337 int roundIncrement, roundBits; 3338 flag isTiny; 3339 3340 roundingMode = status->float_rounding_mode; 3341 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3342 switch (roundingMode) { 3343 case float_round_nearest_even: 3344 case float_round_ties_away: 3345 roundIncrement = 0x200; 3346 break; 3347 case float_round_to_zero: 3348 roundIncrement = 0; 3349 break; 3350 case float_round_up: 3351 roundIncrement = zSign ? 0 : 0x3ff; 3352 break; 3353 case float_round_down: 3354 roundIncrement = zSign ? 0x3ff : 0; 3355 break; 3356 case float_round_to_odd: 3357 roundIncrement = (zSig & 0x400) ? 
0 : 0x3ff; 3358 break; 3359 default: 3360 abort(); 3361 } 3362 roundBits = zSig & 0x3FF; 3363 if ( 0x7FD <= (uint16_t) zExp ) { 3364 if ( ( 0x7FD < zExp ) 3365 || ( ( zExp == 0x7FD ) 3366 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 3367 ) { 3368 bool overflow_to_inf = roundingMode != float_round_to_odd && 3369 roundIncrement != 0; 3370 float_raise(float_flag_overflow | float_flag_inexact, status); 3371 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 3372 } 3373 if ( zExp < 0 ) { 3374 if (status->flush_to_zero) { 3375 float_raise(float_flag_output_denormal, status); 3376 return packFloat64(zSign, 0, 0); 3377 } 3378 isTiny = 3379 (status->float_detect_tininess 3380 == float_tininess_before_rounding) 3381 || ( zExp < -1 ) 3382 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 3383 shift64RightJamming( zSig, - zExp, &zSig ); 3384 zExp = 0; 3385 roundBits = zSig & 0x3FF; 3386 if (isTiny && roundBits) { 3387 float_raise(float_flag_underflow, status); 3388 } 3389 if (roundingMode == float_round_to_odd) { 3390 /* 3391 * For round-to-odd case, the roundIncrement depends on 3392 * zSig which just changed. 3393 */ 3394 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 3395 } 3396 } 3397 } 3398 if (roundBits) { 3399 status->float_exception_flags |= float_flag_inexact; 3400 } 3401 zSig = ( zSig + roundIncrement )>>10; 3402 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 3403 if ( zSig == 0 ) zExp = 0; 3404 return packFloat64( zSign, zExp, zSig ); 3405 3406 } 3407 3408 /*---------------------------------------------------------------------------- 3409 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3410 | and significand `zSig', and returns the proper double-precision floating- 3411 | point value corresponding to the abstract input. This routine is just like 3412 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 
3413 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 3414 | floating-point exponent. 3415 *----------------------------------------------------------------------------*/ 3416 3417 static float64 3418 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 3419 float_status *status) 3420 { 3421 int8_t shiftCount; 3422 3423 shiftCount = clz64(zSig) - 1; 3424 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 3425 status); 3426 3427 } 3428 3429 /*---------------------------------------------------------------------------- 3430 | Normalizes the subnormal extended double-precision floating-point value 3431 | represented by the denormalized significand `aSig'. The normalized exponent 3432 | and significand are stored at the locations pointed to by `zExpPtr' and 3433 | `zSigPtr', respectively. 3434 *----------------------------------------------------------------------------*/ 3435 3436 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 3437 uint64_t *zSigPtr) 3438 { 3439 int8_t shiftCount; 3440 3441 shiftCount = clz64(aSig); 3442 *zSigPtr = aSig<<shiftCount; 3443 *zExpPtr = 1 - shiftCount; 3444 } 3445 3446 /*---------------------------------------------------------------------------- 3447 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3448 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 3449 | and returns the proper extended double-precision floating-point value 3450 | corresponding to the abstract input. Ordinarily, the abstract value is 3451 | rounded and packed into the extended double-precision format, with the 3452 | inexact exception raised if the abstract input cannot be represented 3453 | exactly. However, if the abstract value is too large, the overflow and 3454 | inexact exceptions are raised and an infinity or maximal finite value is 3455 | returned. 
If the abstract value is too small, the input value is rounded to 3456 | a subnormal number, and the underflow and inexact exceptions are raised if 3457 | the abstract input cannot be represented exactly as a subnormal extended 3458 | double-precision floating-point number. 3459 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 3460 | number of bits as single or double precision, respectively. Otherwise, the 3461 | result is rounded to the full precision of the extended double-precision 3462 | format. 3463 | The input significand must be normalized or smaller. If the input 3464 | significand is not normalized, `zExp' must be 0; in that case, the result 3465 | returned is a subnormal number, and it must not require rounding. The 3466 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 3467 | Floating-Point Arithmetic. 3468 *----------------------------------------------------------------------------*/ 3469 3470 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign, 3471 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 3472 float_status *status) 3473 { 3474 int8_t roundingMode; 3475 flag roundNearestEven, increment, isTiny; 3476 int64_t roundIncrement, roundMask, roundBits; 3477 3478 roundingMode = status->float_rounding_mode; 3479 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3480 if ( roundingPrecision == 80 ) goto precision80; 3481 if ( roundingPrecision == 64 ) { 3482 roundIncrement = LIT64( 0x0000000000000400 ); 3483 roundMask = LIT64( 0x00000000000007FF ); 3484 } 3485 else if ( roundingPrecision == 32 ) { 3486 roundIncrement = LIT64( 0x0000008000000000 ); 3487 roundMask = LIT64( 0x000000FFFFFFFFFF ); 3488 } 3489 else { 3490 goto precision80; 3491 } 3492 zSig0 |= ( zSig1 != 0 ); 3493 switch (roundingMode) { 3494 case float_round_nearest_even: 3495 case float_round_ties_away: 3496 break; 3497 case float_round_to_zero: 3498 roundIncrement = 0; 3499 break; 3500 case float_round_up: 3501 
roundIncrement = zSign ? 0 : roundMask; 3502 break; 3503 case float_round_down: 3504 roundIncrement = zSign ? roundMask : 0; 3505 break; 3506 default: 3507 abort(); 3508 } 3509 roundBits = zSig0 & roundMask; 3510 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 3511 if ( ( 0x7FFE < zExp ) 3512 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 3513 ) { 3514 goto overflow; 3515 } 3516 if ( zExp <= 0 ) { 3517 if (status->flush_to_zero) { 3518 float_raise(float_flag_output_denormal, status); 3519 return packFloatx80(zSign, 0, 0); 3520 } 3521 isTiny = 3522 (status->float_detect_tininess 3523 == float_tininess_before_rounding) 3524 || ( zExp < 0 ) 3525 || ( zSig0 <= zSig0 + roundIncrement ); 3526 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 3527 zExp = 0; 3528 roundBits = zSig0 & roundMask; 3529 if (isTiny && roundBits) { 3530 float_raise(float_flag_underflow, status); 3531 } 3532 if (roundBits) { 3533 status->float_exception_flags |= float_flag_inexact; 3534 } 3535 zSig0 += roundIncrement; 3536 if ( (int64_t) zSig0 < 0 ) zExp = 1; 3537 roundIncrement = roundMask + 1; 3538 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 3539 roundMask |= roundIncrement; 3540 } 3541 zSig0 &= ~ roundMask; 3542 return packFloatx80( zSign, zExp, zSig0 ); 3543 } 3544 } 3545 if (roundBits) { 3546 status->float_exception_flags |= float_flag_inexact; 3547 } 3548 zSig0 += roundIncrement; 3549 if ( zSig0 < roundIncrement ) { 3550 ++zExp; 3551 zSig0 = LIT64( 0x8000000000000000 ); 3552 } 3553 roundIncrement = roundMask + 1; 3554 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 3555 roundMask |= roundIncrement; 3556 } 3557 zSig0 &= ~ roundMask; 3558 if ( zSig0 == 0 ) zExp = 0; 3559 return packFloatx80( zSign, zExp, zSig0 ); 3560 precision80: 3561 switch (roundingMode) { 3562 case float_round_nearest_even: 3563 case float_round_ties_away: 3564 increment = ((int64_t)zSig1 < 0); 3565 break; 3566 case float_round_to_zero: 3567 increment = 0; 3568 break; 3569 case 
float_round_up: 3570 increment = !zSign && zSig1; 3571 break; 3572 case float_round_down: 3573 increment = zSign && zSig1; 3574 break; 3575 default: 3576 abort(); 3577 } 3578 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 3579 if ( ( 0x7FFE < zExp ) 3580 || ( ( zExp == 0x7FFE ) 3581 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 3582 && increment 3583 ) 3584 ) { 3585 roundMask = 0; 3586 overflow: 3587 float_raise(float_flag_overflow | float_flag_inexact, status); 3588 if ( ( roundingMode == float_round_to_zero ) 3589 || ( zSign && ( roundingMode == float_round_up ) ) 3590 || ( ! zSign && ( roundingMode == float_round_down ) ) 3591 ) { 3592 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 3593 } 3594 return packFloatx80(zSign, 3595 floatx80_infinity_high, 3596 floatx80_infinity_low); 3597 } 3598 if ( zExp <= 0 ) { 3599 isTiny = 3600 (status->float_detect_tininess 3601 == float_tininess_before_rounding) 3602 || ( zExp < 0 ) 3603 || ! increment 3604 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 3605 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 3606 zExp = 0; 3607 if (isTiny && zSig1) { 3608 float_raise(float_flag_underflow, status); 3609 } 3610 if (zSig1) { 3611 status->float_exception_flags |= float_flag_inexact; 3612 } 3613 switch (roundingMode) { 3614 case float_round_nearest_even: 3615 case float_round_ties_away: 3616 increment = ((int64_t)zSig1 < 0); 3617 break; 3618 case float_round_to_zero: 3619 increment = 0; 3620 break; 3621 case float_round_up: 3622 increment = !zSign && zSig1; 3623 break; 3624 case float_round_down: 3625 increment = zSign && zSig1; 3626 break; 3627 default: 3628 abort(); 3629 } 3630 if ( increment ) { 3631 ++zSig0; 3632 zSig0 &= 3633 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 3634 if ( (int64_t) zSig0 < 0 ) zExp = 1; 3635 } 3636 return packFloatx80( zSign, zExp, zSig0 ); 3637 } 3638 } 3639 if (zSig1) { 3640 status->float_exception_flags |= float_flag_inexact; 3641 } 3642 if ( increment ) { 3643 ++zSig0; 3644 
if ( zSig0 == 0 ) { 3645 ++zExp; 3646 zSig0 = LIT64( 0x8000000000000000 ); 3647 } 3648 else { 3649 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 3650 } 3651 } 3652 else { 3653 if ( zSig0 == 0 ) zExp = 0; 3654 } 3655 return packFloatx80( zSign, zExp, zSig0 ); 3656 3657 } 3658 3659 /*---------------------------------------------------------------------------- 3660 | Takes an abstract floating-point value having sign `zSign', exponent 3661 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 3662 | and returns the proper extended double-precision floating-point value 3663 | corresponding to the abstract input. This routine is just like 3664 | `roundAndPackFloatx80' except that the input significand does not have to be 3665 | normalized. 3666 *----------------------------------------------------------------------------*/ 3667 3668 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 3669 flag zSign, int32_t zExp, 3670 uint64_t zSig0, uint64_t zSig1, 3671 float_status *status) 3672 { 3673 int8_t shiftCount; 3674 3675 if ( zSig0 == 0 ) { 3676 zSig0 = zSig1; 3677 zSig1 = 0; 3678 zExp -= 64; 3679 } 3680 shiftCount = clz64(zSig0); 3681 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 3682 zExp -= shiftCount; 3683 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 3684 zSig0, zSig1, status); 3685 3686 } 3687 3688 /*---------------------------------------------------------------------------- 3689 | Returns the least-significant 64 fraction bits of the quadruple-precision 3690 | floating-point value `a'. 3691 *----------------------------------------------------------------------------*/ 3692 3693 static inline uint64_t extractFloat128Frac1( float128 a ) 3694 { 3695 3696 return a.low; 3697 3698 } 3699 3700 /*---------------------------------------------------------------------------- 3701 | Returns the most-significant 48 fraction bits of the quadruple-precision 3702 | floating-point value `a'. 
3703 *----------------------------------------------------------------------------*/ 3704 3705 static inline uint64_t extractFloat128Frac0( float128 a ) 3706 { 3707 3708 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 3709 3710 } 3711 3712 /*---------------------------------------------------------------------------- 3713 | Returns the exponent bits of the quadruple-precision floating-point value 3714 | `a'. 3715 *----------------------------------------------------------------------------*/ 3716 3717 static inline int32_t extractFloat128Exp( float128 a ) 3718 { 3719 3720 return ( a.high>>48 ) & 0x7FFF; 3721 3722 } 3723 3724 /*---------------------------------------------------------------------------- 3725 | Returns the sign bit of the quadruple-precision floating-point value `a'. 3726 *----------------------------------------------------------------------------*/ 3727 3728 static inline flag extractFloat128Sign( float128 a ) 3729 { 3730 3731 return a.high>>63; 3732 3733 } 3734 3735 /*---------------------------------------------------------------------------- 3736 | Normalizes the subnormal quadruple-precision floating-point value 3737 | represented by the denormalized significand formed by the concatenation of 3738 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 3739 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 3740 | significand are stored at the location pointed to by `zSig0Ptr', and the 3741 | least significant 64 bits of the normalized significand are stored at the 3742 | location pointed to by `zSig1Ptr'. 
3743 *----------------------------------------------------------------------------*/ 3744 3745 static void 3746 normalizeFloat128Subnormal( 3747 uint64_t aSig0, 3748 uint64_t aSig1, 3749 int32_t *zExpPtr, 3750 uint64_t *zSig0Ptr, 3751 uint64_t *zSig1Ptr 3752 ) 3753 { 3754 int8_t shiftCount; 3755 3756 if ( aSig0 == 0 ) { 3757 shiftCount = clz64(aSig1) - 15; 3758 if ( shiftCount < 0 ) { 3759 *zSig0Ptr = aSig1>>( - shiftCount ); 3760 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 3761 } 3762 else { 3763 *zSig0Ptr = aSig1<<shiftCount; 3764 *zSig1Ptr = 0; 3765 } 3766 *zExpPtr = - shiftCount - 63; 3767 } 3768 else { 3769 shiftCount = clz64(aSig0) - 15; 3770 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 3771 *zExpPtr = 1 - shiftCount; 3772 } 3773 3774 } 3775 3776 /*---------------------------------------------------------------------------- 3777 | Packs the sign `zSign', the exponent `zExp', and the significand formed 3778 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 3779 | floating-point value, returning the result. After being shifted into the 3780 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 3781 | added together to form the most significant 32 bits of the result. This 3782 | means that any integer portion of `zSig0' will be added into the exponent. 3783 | Since a properly normalized significand will have an integer portion equal 3784 | to 1, the `zExp' input should be 1 less than the desired result exponent 3785 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 3786 | significand. 
3787 *----------------------------------------------------------------------------*/ 3788 3789 static inline float128 3790 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 3791 { 3792 float128 z; 3793 3794 z.low = zSig1; 3795 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 3796 return z; 3797 3798 } 3799 3800 /*---------------------------------------------------------------------------- 3801 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3802 | and extended significand formed by the concatenation of `zSig0', `zSig1', 3803 | and `zSig2', and returns the proper quadruple-precision floating-point value 3804 | corresponding to the abstract input. Ordinarily, the abstract value is 3805 | simply rounded and packed into the quadruple-precision format, with the 3806 | inexact exception raised if the abstract input cannot be represented 3807 | exactly. However, if the abstract value is too large, the overflow and 3808 | inexact exceptions are raised and an infinity or maximal finite value is 3809 | returned. If the abstract value is too small, the input value is rounded to 3810 | a subnormal number, and the underflow and inexact exceptions are raised if 3811 | the abstract input cannot be represented exactly as a subnormal quadruple- 3812 | precision floating-point number. 3813 | The input significand must be normalized or smaller. If the input 3814 | significand is not normalized, `zExp' must be 0; in that case, the result 3815 | returned is a subnormal number, and it must not require rounding. In the 3816 | usual case that the input significand is normalized, `zExp' must be 1 less 3817 | than the ``true'' floating-point exponent. The handling of underflow and 3818 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3819 *----------------------------------------------------------------------------*/ 3820 3821 static float128 roundAndPackFloat128(flag zSign, int32_t zExp, 3822 uint64_t zSig0, uint64_t zSig1, 3823 uint64_t zSig2, float_status *status) 3824 { 3825 int8_t roundingMode; 3826 flag roundNearestEven, increment, isTiny; 3827 3828 roundingMode = status->float_rounding_mode; 3829 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3830 switch (roundingMode) { 3831 case float_round_nearest_even: 3832 case float_round_ties_away: 3833 increment = ((int64_t)zSig2 < 0); 3834 break; 3835 case float_round_to_zero: 3836 increment = 0; 3837 break; 3838 case float_round_up: 3839 increment = !zSign && zSig2; 3840 break; 3841 case float_round_down: 3842 increment = zSign && zSig2; 3843 break; 3844 case float_round_to_odd: 3845 increment = !(zSig1 & 0x1) && zSig2; 3846 break; 3847 default: 3848 abort(); 3849 } 3850 if ( 0x7FFD <= (uint32_t) zExp ) { 3851 if ( ( 0x7FFD < zExp ) 3852 || ( ( zExp == 0x7FFD ) 3853 && eq128( 3854 LIT64( 0x0001FFFFFFFFFFFF ), 3855 LIT64( 0xFFFFFFFFFFFFFFFF ), 3856 zSig0, 3857 zSig1 3858 ) 3859 && increment 3860 ) 3861 ) { 3862 float_raise(float_flag_overflow | float_flag_inexact, status); 3863 if ( ( roundingMode == float_round_to_zero ) 3864 || ( zSign && ( roundingMode == float_round_up ) ) 3865 || ( ! zSign && ( roundingMode == float_round_down ) ) 3866 || (roundingMode == float_round_to_odd) 3867 ) { 3868 return 3869 packFloat128( 3870 zSign, 3871 0x7FFE, 3872 LIT64( 0x0000FFFFFFFFFFFF ), 3873 LIT64( 0xFFFFFFFFFFFFFFFF ) 3874 ); 3875 } 3876 return packFloat128( zSign, 0x7FFF, 0, 0 ); 3877 } 3878 if ( zExp < 0 ) { 3879 if (status->flush_to_zero) { 3880 float_raise(float_flag_output_denormal, status); 3881 return packFloat128(zSign, 0, 0, 0); 3882 } 3883 isTiny = 3884 (status->float_detect_tininess 3885 == float_tininess_before_rounding) 3886 || ( zExp < -1 ) 3887 || ! 
increment 3888 || lt128( 3889 zSig0, 3890 zSig1, 3891 LIT64( 0x0001FFFFFFFFFFFF ), 3892 LIT64( 0xFFFFFFFFFFFFFFFF ) 3893 ); 3894 shift128ExtraRightJamming( 3895 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 3896 zExp = 0; 3897 if (isTiny && zSig2) { 3898 float_raise(float_flag_underflow, status); 3899 } 3900 switch (roundingMode) { 3901 case float_round_nearest_even: 3902 case float_round_ties_away: 3903 increment = ((int64_t)zSig2 < 0); 3904 break; 3905 case float_round_to_zero: 3906 increment = 0; 3907 break; 3908 case float_round_up: 3909 increment = !zSign && zSig2; 3910 break; 3911 case float_round_down: 3912 increment = zSign && zSig2; 3913 break; 3914 case float_round_to_odd: 3915 increment = !(zSig1 & 0x1) && zSig2; 3916 break; 3917 default: 3918 abort(); 3919 } 3920 } 3921 } 3922 if (zSig2) { 3923 status->float_exception_flags |= float_flag_inexact; 3924 } 3925 if ( increment ) { 3926 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 3927 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 3928 } 3929 else { 3930 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 3931 } 3932 return packFloat128( zSign, zExp, zSig0, zSig1 ); 3933 3934 } 3935 3936 /*---------------------------------------------------------------------------- 3937 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3938 | and significand formed by the concatenation of `zSig0' and `zSig1', and 3939 | returns the proper quadruple-precision floating-point value corresponding 3940 | to the abstract input. This routine is just like `roundAndPackFloat128' 3941 | except that the input significand has fewer bits and does not have to be 3942 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 3943 | point exponent. 
3944 *----------------------------------------------------------------------------*/ 3945 3946 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 3947 uint64_t zSig0, uint64_t zSig1, 3948 float_status *status) 3949 { 3950 int8_t shiftCount; 3951 uint64_t zSig2; 3952 3953 if ( zSig0 == 0 ) { 3954 zSig0 = zSig1; 3955 zSig1 = 0; 3956 zExp -= 64; 3957 } 3958 shiftCount = clz64(zSig0) - 15; 3959 if ( 0 <= shiftCount ) { 3960 zSig2 = 0; 3961 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 3962 } 3963 else { 3964 shift128ExtraRightJamming( 3965 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 3966 } 3967 zExp -= shiftCount; 3968 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 3969 3970 } 3971 3972 3973 /*---------------------------------------------------------------------------- 3974 | Returns the result of converting the 32-bit two's complement integer `a' 3975 | to the extended double-precision floating-point format. The conversion 3976 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3977 | Arithmetic. 3978 *----------------------------------------------------------------------------*/ 3979 3980 floatx80 int32_to_floatx80(int32_t a, float_status *status) 3981 { 3982 flag zSign; 3983 uint32_t absA; 3984 int8_t shiftCount; 3985 uint64_t zSig; 3986 3987 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 3988 zSign = ( a < 0 ); 3989 absA = zSign ? - a : a; 3990 shiftCount = clz32(absA) + 32; 3991 zSig = absA; 3992 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 3993 3994 } 3995 3996 /*---------------------------------------------------------------------------- 3997 | Returns the result of converting the 32-bit two's complement integer `a' to 3998 | the quadruple-precision floating-point format. The conversion is performed 3999 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4000 *----------------------------------------------------------------------------*/ 4001 4002 float128 int32_to_float128(int32_t a, float_status *status) 4003 { 4004 flag zSign; 4005 uint32_t absA; 4006 int8_t shiftCount; 4007 uint64_t zSig0; 4008 4009 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4010 zSign = ( a < 0 ); 4011 absA = zSign ? - a : a; 4012 shiftCount = clz32(absA) + 17; 4013 zSig0 = absA; 4014 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 4015 4016 } 4017 4018 /*---------------------------------------------------------------------------- 4019 | Returns the result of converting the 64-bit two's complement integer `a' 4020 | to the extended double-precision floating-point format. The conversion 4021 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4022 | Arithmetic. 4023 *----------------------------------------------------------------------------*/ 4024 4025 floatx80 int64_to_floatx80(int64_t a, float_status *status) 4026 { 4027 flag zSign; 4028 uint64_t absA; 4029 int8_t shiftCount; 4030 4031 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4032 zSign = ( a < 0 ); 4033 absA = zSign ? - a : a; 4034 shiftCount = clz64(absA); 4035 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 4036 4037 } 4038 4039 /*---------------------------------------------------------------------------- 4040 | Returns the result of converting the 64-bit two's complement integer `a' to 4041 | the quadruple-precision floating-point format. The conversion is performed 4042 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4043 *----------------------------------------------------------------------------*/ 4044 4045 float128 int64_to_float128(int64_t a, float_status *status) 4046 { 4047 flag zSign; 4048 uint64_t absA; 4049 int8_t shiftCount; 4050 int32_t zExp; 4051 uint64_t zSig0, zSig1; 4052 4053 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4054 zSign = ( a < 0 ); 4055 absA = zSign ? - a : a; 4056 shiftCount = clz64(absA) + 49; 4057 zExp = 0x406E - shiftCount; 4058 if ( 64 <= shiftCount ) { 4059 zSig1 = 0; 4060 zSig0 = absA; 4061 shiftCount -= 64; 4062 } 4063 else { 4064 zSig1 = absA; 4065 zSig0 = 0; 4066 } 4067 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4068 return packFloat128( zSign, zExp, zSig0, zSig1 ); 4069 4070 } 4071 4072 /*---------------------------------------------------------------------------- 4073 | Returns the result of converting the 64-bit unsigned integer `a' 4074 | to the quadruple-precision floating-point format. The conversion is performed 4075 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4076 *----------------------------------------------------------------------------*/ 4077 4078 float128 uint64_to_float128(uint64_t a, float_status *status) 4079 { 4080 if (a == 0) { 4081 return float128_zero; 4082 } 4083 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status); 4084 } 4085 4086 /*---------------------------------------------------------------------------- 4087 | Returns the result of converting the single-precision floating-point value 4088 | `a' to the extended double-precision floating-point format. The conversion 4089 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4090 | Arithmetic. 
4091 *----------------------------------------------------------------------------*/ 4092 4093 floatx80 float32_to_floatx80(float32 a, float_status *status) 4094 { 4095 flag aSign; 4096 int aExp; 4097 uint32_t aSig; 4098 4099 a = float32_squash_input_denormal(a, status); 4100 aSig = extractFloat32Frac( a ); 4101 aExp = extractFloat32Exp( a ); 4102 aSign = extractFloat32Sign( a ); 4103 if ( aExp == 0xFF ) { 4104 if (aSig) { 4105 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 4106 } 4107 return packFloatx80(aSign, 4108 floatx80_infinity_high, 4109 floatx80_infinity_low); 4110 } 4111 if ( aExp == 0 ) { 4112 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4113 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4114 } 4115 aSig |= 0x00800000; 4116 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 4117 4118 } 4119 4120 /*---------------------------------------------------------------------------- 4121 | Returns the result of converting the single-precision floating-point value 4122 | `a' to the double-precision floating-point format. The conversion is 4123 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4124 | Arithmetic. 
4125 *----------------------------------------------------------------------------*/ 4126 4127 float128 float32_to_float128(float32 a, float_status *status) 4128 { 4129 flag aSign; 4130 int aExp; 4131 uint32_t aSig; 4132 4133 a = float32_squash_input_denormal(a, status); 4134 aSig = extractFloat32Frac( a ); 4135 aExp = extractFloat32Exp( a ); 4136 aSign = extractFloat32Sign( a ); 4137 if ( aExp == 0xFF ) { 4138 if (aSig) { 4139 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 4140 } 4141 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4142 } 4143 if ( aExp == 0 ) { 4144 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4145 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4146 --aExp; 4147 } 4148 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 4149 4150 } 4151 4152 /*---------------------------------------------------------------------------- 4153 | Returns the remainder of the single-precision floating-point value `a' 4154 | with respect to the corresponding value `b'. The operation is performed 4155 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4156 *----------------------------------------------------------------------------*/ 4157 4158 float32 float32_rem(float32 a, float32 b, float_status *status) 4159 { 4160 flag aSign, zSign; 4161 int aExp, bExp, expDiff; 4162 uint32_t aSig, bSig; 4163 uint32_t q; 4164 uint64_t aSig64, bSig64, q64; 4165 uint32_t alternateASig; 4166 int32_t sigMean; 4167 a = float32_squash_input_denormal(a, status); 4168 b = float32_squash_input_denormal(b, status); 4169 4170 aSig = extractFloat32Frac( a ); 4171 aExp = extractFloat32Exp( a ); 4172 aSign = extractFloat32Sign( a ); 4173 bSig = extractFloat32Frac( b ); 4174 bExp = extractFloat32Exp( b ); 4175 if ( aExp == 0xFF ) { 4176 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 4177 return propagateFloat32NaN(a, b, status); 4178 } 4179 float_raise(float_flag_invalid, status); 4180 return float32_default_nan(status); 4181 } 4182 if ( bExp == 0xFF ) { 4183 if (bSig) { 4184 return propagateFloat32NaN(a, b, status); 4185 } 4186 return a; 4187 } 4188 if ( bExp == 0 ) { 4189 if ( bSig == 0 ) { 4190 float_raise(float_flag_invalid, status); 4191 return float32_default_nan(status); 4192 } 4193 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 4194 } 4195 if ( aExp == 0 ) { 4196 if ( aSig == 0 ) return a; 4197 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4198 } 4199 expDiff = aExp - bExp; 4200 aSig |= 0x00800000; 4201 bSig |= 0x00800000; 4202 if ( expDiff < 32 ) { 4203 aSig <<= 8; 4204 bSig <<= 8; 4205 if ( expDiff < 0 ) { 4206 if ( expDiff < -1 ) return a; 4207 aSig >>= 1; 4208 } 4209 q = ( bSig <= aSig ); 4210 if ( q ) aSig -= bSig; 4211 if ( 0 < expDiff ) { 4212 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 4213 q >>= 32 - expDiff; 4214 bSig >>= 2; 4215 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4216 } 4217 else { 4218 aSig >>= 2; 4219 bSig >>= 2; 4220 } 4221 } 4222 else { 4223 if ( bSig <= aSig ) aSig -= bSig; 4224 aSig64 = ( (uint64_t) aSig )<<40; 4225 bSig64 = ( (uint64_t) bSig )<<40; 4226 expDiff -= 64; 4227 while ( 0 < expDiff ) { 
4228 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 4229 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 4230 aSig64 = - ( ( bSig * q64 )<<38 ); 4231 expDiff -= 62; 4232 } 4233 expDiff += 64; 4234 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 4235 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 4236 q = q64>>( 64 - expDiff ); 4237 bSig <<= 6; 4238 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 4239 } 4240 do { 4241 alternateASig = aSig; 4242 ++q; 4243 aSig -= bSig; 4244 } while ( 0 <= (int32_t) aSig ); 4245 sigMean = aSig + alternateASig; 4246 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4247 aSig = alternateASig; 4248 } 4249 zSign = ( (int32_t) aSig < 0 ); 4250 if ( zSign ) aSig = - aSig; 4251 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 4252 } 4253 4254 4255 4256 /*---------------------------------------------------------------------------- 4257 | Returns the binary exponential of the single-precision floating-point value 4258 | `a'. The operation is performed according to the IEC/IEEE Standard for 4259 | Binary Floating-Point Arithmetic. 4260 | 4261 | Uses the following identities: 4262 | 4263 | 1. ------------------------------------------------------------------------- 4264 | x x*ln(2) 4265 | 2 = e 4266 | 4267 | 2. ------------------------------------------------------------------------- 4268 | 2 3 4 5 n 4269 | x x x x x x x 4270 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 4271 | 1! 2! 3! 4! 5! n! 
4272 *----------------------------------------------------------------------------*/ 4273 4274 static const float64 float32_exp2_coefficients[15] = 4275 { 4276 const_float64( 0x3ff0000000000000ll ), /* 1 */ 4277 const_float64( 0x3fe0000000000000ll ), /* 2 */ 4278 const_float64( 0x3fc5555555555555ll ), /* 3 */ 4279 const_float64( 0x3fa5555555555555ll ), /* 4 */ 4280 const_float64( 0x3f81111111111111ll ), /* 5 */ 4281 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 4282 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 4283 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 4284 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 4285 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 4286 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 4287 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 4288 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 4289 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 4290 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 4291 }; 4292 4293 float32 float32_exp2(float32 a, float_status *status) 4294 { 4295 flag aSign; 4296 int aExp; 4297 uint32_t aSig; 4298 float64 r, x, xn; 4299 int i; 4300 a = float32_squash_input_denormal(a, status); 4301 4302 aSig = extractFloat32Frac( a ); 4303 aExp = extractFloat32Exp( a ); 4304 aSign = extractFloat32Sign( a ); 4305 4306 if ( aExp == 0xFF) { 4307 if (aSig) { 4308 return propagateFloat32NaN(a, float32_zero, status); 4309 } 4310 return (aSign) ? 
float32_zero : a; 4311 } 4312 if (aExp == 0) { 4313 if (aSig == 0) return float32_one; 4314 } 4315 4316 float_raise(float_flag_inexact, status); 4317 4318 /* ******************************* */ 4319 /* using float64 for approximation */ 4320 /* ******************************* */ 4321 x = float32_to_float64(a, status); 4322 x = float64_mul(x, float64_ln2, status); 4323 4324 xn = x; 4325 r = float64_one; 4326 for (i = 0 ; i < 15 ; i++) { 4327 float64 f; 4328 4329 f = float64_mul(xn, float32_exp2_coefficients[i], status); 4330 r = float64_add(r, f, status); 4331 4332 xn = float64_mul(xn, x, status); 4333 } 4334 4335 return float64_to_float32(r, status); 4336 } 4337 4338 /*---------------------------------------------------------------------------- 4339 | Returns the binary log of the single-precision floating-point value `a'. 4340 | The operation is performed according to the IEC/IEEE Standard for Binary 4341 | Floating-Point Arithmetic. 4342 *----------------------------------------------------------------------------*/ 4343 float32 float32_log2(float32 a, float_status *status) 4344 { 4345 flag aSign, zSign; 4346 int aExp; 4347 uint32_t aSig, zSig, i; 4348 4349 a = float32_squash_input_denormal(a, status); 4350 aSig = extractFloat32Frac( a ); 4351 aExp = extractFloat32Exp( a ); 4352 aSign = extractFloat32Sign( a ); 4353 4354 if ( aExp == 0 ) { 4355 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 4356 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4357 } 4358 if ( aSign ) { 4359 float_raise(float_flag_invalid, status); 4360 return float32_default_nan(status); 4361 } 4362 if ( aExp == 0xFF ) { 4363 if (aSig) { 4364 return propagateFloat32NaN(a, float32_zero, status); 4365 } 4366 return a; 4367 } 4368 4369 aExp -= 0x7F; 4370 aSig |= 0x00800000; 4371 zSign = aExp < 0; 4372 zSig = aExp << 23; 4373 4374 for (i = 1 << 22; i > 0; i >>= 1) { 4375 aSig = ( (uint64_t)aSig * aSig ) >> 23; 4376 if ( aSig & 0x01000000 ) { 4377 aSig >>= 1; 4378 zSig |= i; 4379 } 4380 } 4381 
4382 if ( zSign ) 4383 zSig = -zSig; 4384 4385 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 4386 } 4387 4388 /*---------------------------------------------------------------------------- 4389 | Returns 1 if the single-precision floating-point value `a' is equal to 4390 | the corresponding value `b', and 0 otherwise. The invalid exception is 4391 | raised if either operand is a NaN. Otherwise, the comparison is performed 4392 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4393 *----------------------------------------------------------------------------*/ 4394 4395 int float32_eq(float32 a, float32 b, float_status *status) 4396 { 4397 uint32_t av, bv; 4398 a = float32_squash_input_denormal(a, status); 4399 b = float32_squash_input_denormal(b, status); 4400 4401 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4402 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4403 ) { 4404 float_raise(float_flag_invalid, status); 4405 return 0; 4406 } 4407 av = float32_val(a); 4408 bv = float32_val(b); 4409 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4410 } 4411 4412 /*---------------------------------------------------------------------------- 4413 | Returns 1 if the single-precision floating-point value `a' is less than 4414 | or equal to the corresponding value `b', and 0 otherwise. The invalid 4415 | exception is raised if either operand is a NaN. The comparison is performed 4416 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4417 *----------------------------------------------------------------------------*/ 4418 4419 int float32_le(float32 a, float32 b, float_status *status) 4420 { 4421 flag aSign, bSign; 4422 uint32_t av, bv; 4423 a = float32_squash_input_denormal(a, status); 4424 b = float32_squash_input_denormal(b, status); 4425 4426 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4427 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4428 ) { 4429 float_raise(float_flag_invalid, status); 4430 return 0; 4431 } 4432 aSign = extractFloat32Sign( a ); 4433 bSign = extractFloat32Sign( b ); 4434 av = float32_val(a); 4435 bv = float32_val(b); 4436 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4437 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4438 4439 } 4440 4441 /*---------------------------------------------------------------------------- 4442 | Returns 1 if the single-precision floating-point value `a' is less than 4443 | the corresponding value `b', and 0 otherwise. The invalid exception is 4444 | raised if either operand is a NaN. The comparison is performed according 4445 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4446 *----------------------------------------------------------------------------*/ 4447 4448 int float32_lt(float32 a, float32 b, float_status *status) 4449 { 4450 flag aSign, bSign; 4451 uint32_t av, bv; 4452 a = float32_squash_input_denormal(a, status); 4453 b = float32_squash_input_denormal(b, status); 4454 4455 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4456 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4457 ) { 4458 float_raise(float_flag_invalid, status); 4459 return 0; 4460 } 4461 aSign = extractFloat32Sign( a ); 4462 bSign = extractFloat32Sign( b ); 4463 av = float32_val(a); 4464 bv = float32_val(b); 4465 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 4466 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4467 4468 } 4469 4470 /*---------------------------------------------------------------------------- 4471 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 4472 | be compared, and 0 otherwise. The invalid exception is raised if either 4473 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4474 | Standard for Binary Floating-Point Arithmetic. 4475 *----------------------------------------------------------------------------*/ 4476 4477 int float32_unordered(float32 a, float32 b, float_status *status) 4478 { 4479 a = float32_squash_input_denormal(a, status); 4480 b = float32_squash_input_denormal(b, status); 4481 4482 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4483 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4484 ) { 4485 float_raise(float_flag_invalid, status); 4486 return 1; 4487 } 4488 return 0; 4489 } 4490 4491 /*---------------------------------------------------------------------------- 4492 | Returns 1 if the single-precision floating-point value `a' is equal to 4493 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4494 | exception. 
The comparison is performed according to the IEC/IEEE Standard 4495 | for Binary Floating-Point Arithmetic. 4496 *----------------------------------------------------------------------------*/ 4497 4498 int float32_eq_quiet(float32 a, float32 b, float_status *status) 4499 { 4500 a = float32_squash_input_denormal(a, status); 4501 b = float32_squash_input_denormal(b, status); 4502 4503 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4504 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4505 ) { 4506 if (float32_is_signaling_nan(a, status) 4507 || float32_is_signaling_nan(b, status)) { 4508 float_raise(float_flag_invalid, status); 4509 } 4510 return 0; 4511 } 4512 return ( float32_val(a) == float32_val(b) ) || 4513 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 4514 } 4515 4516 /*---------------------------------------------------------------------------- 4517 | Returns 1 if the single-precision floating-point value `a' is less than or 4518 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4519 | cause an exception. Otherwise, the comparison is performed according to the 4520 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4521 *----------------------------------------------------------------------------*/ 4522 4523 int float32_le_quiet(float32 a, float32 b, float_status *status) 4524 { 4525 flag aSign, bSign; 4526 uint32_t av, bv; 4527 a = float32_squash_input_denormal(a, status); 4528 b = float32_squash_input_denormal(b, status); 4529 4530 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4531 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4532 ) { 4533 if (float32_is_signaling_nan(a, status) 4534 || float32_is_signaling_nan(b, status)) { 4535 float_raise(float_flag_invalid, status); 4536 } 4537 return 0; 4538 } 4539 aSign = extractFloat32Sign( a ); 4540 bSign = extractFloat32Sign( b ); 4541 av = float32_val(a); 4542 bv = float32_val(b); 4543 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4544 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4545 4546 } 4547 4548 /*---------------------------------------------------------------------------- 4549 | Returns 1 if the single-precision floating-point value `a' is less than 4550 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4551 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4552 | Standard for Binary Floating-Point Arithmetic. 
4553 *----------------------------------------------------------------------------*/ 4554 4555 int float32_lt_quiet(float32 a, float32 b, float_status *status) 4556 { 4557 flag aSign, bSign; 4558 uint32_t av, bv; 4559 a = float32_squash_input_denormal(a, status); 4560 b = float32_squash_input_denormal(b, status); 4561 4562 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4563 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4564 ) { 4565 if (float32_is_signaling_nan(a, status) 4566 || float32_is_signaling_nan(b, status)) { 4567 float_raise(float_flag_invalid, status); 4568 } 4569 return 0; 4570 } 4571 aSign = extractFloat32Sign( a ); 4572 bSign = extractFloat32Sign( b ); 4573 av = float32_val(a); 4574 bv = float32_val(b); 4575 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 4576 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4577 4578 } 4579 4580 /*---------------------------------------------------------------------------- 4581 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 4582 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4583 | comparison is performed according to the IEC/IEEE Standard for Binary 4584 | Floating-Point Arithmetic. 
4585 *----------------------------------------------------------------------------*/ 4586 4587 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 4588 { 4589 a = float32_squash_input_denormal(a, status); 4590 b = float32_squash_input_denormal(b, status); 4591 4592 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4593 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4594 ) { 4595 if (float32_is_signaling_nan(a, status) 4596 || float32_is_signaling_nan(b, status)) { 4597 float_raise(float_flag_invalid, status); 4598 } 4599 return 1; 4600 } 4601 return 0; 4602 } 4603 4604 /*---------------------------------------------------------------------------- 4605 | If `a' is denormal and we are in flush-to-zero mode then set the 4606 | input-denormal exception and return zero. Otherwise just return the value. 4607 *----------------------------------------------------------------------------*/ 4608 float16 float16_squash_input_denormal(float16 a, float_status *status) 4609 { 4610 if (status->flush_inputs_to_zero) { 4611 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) { 4612 float_raise(float_flag_input_denormal, status); 4613 return make_float16(float16_val(a) & 0x8000); 4614 } 4615 } 4616 return a; 4617 } 4618 4619 /*---------------------------------------------------------------------------- 4620 | Returns the result of converting the double-precision floating-point value 4621 | `a' to the extended double-precision floating-point format. The conversion 4622 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4623 | Arithmetic. 
4624 *----------------------------------------------------------------------------*/ 4625 4626 floatx80 float64_to_floatx80(float64 a, float_status *status) 4627 { 4628 flag aSign; 4629 int aExp; 4630 uint64_t aSig; 4631 4632 a = float64_squash_input_denormal(a, status); 4633 aSig = extractFloat64Frac( a ); 4634 aExp = extractFloat64Exp( a ); 4635 aSign = extractFloat64Sign( a ); 4636 if ( aExp == 0x7FF ) { 4637 if (aSig) { 4638 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 4639 } 4640 return packFloatx80(aSign, 4641 floatx80_infinity_high, 4642 floatx80_infinity_low); 4643 } 4644 if ( aExp == 0 ) { 4645 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4646 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4647 } 4648 return 4649 packFloatx80( 4650 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 4651 4652 } 4653 4654 /*---------------------------------------------------------------------------- 4655 | Returns the result of converting the double-precision floating-point value 4656 | `a' to the quadruple-precision floating-point format. The conversion is 4657 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4658 | Arithmetic. 
4659 *----------------------------------------------------------------------------*/ 4660 4661 float128 float64_to_float128(float64 a, float_status *status) 4662 { 4663 flag aSign; 4664 int aExp; 4665 uint64_t aSig, zSig0, zSig1; 4666 4667 a = float64_squash_input_denormal(a, status); 4668 aSig = extractFloat64Frac( a ); 4669 aExp = extractFloat64Exp( a ); 4670 aSign = extractFloat64Sign( a ); 4671 if ( aExp == 0x7FF ) { 4672 if (aSig) { 4673 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 4674 } 4675 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4676 } 4677 if ( aExp == 0 ) { 4678 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4679 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4680 --aExp; 4681 } 4682 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 4683 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 4684 4685 } 4686 4687 4688 /*---------------------------------------------------------------------------- 4689 | Returns the remainder of the double-precision floating-point value `a' 4690 | with respect to the corresponding value `b'. The operation is performed 4691 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4692 *----------------------------------------------------------------------------*/ 4693 4694 float64 float64_rem(float64 a, float64 b, float_status *status) 4695 { 4696 flag aSign, zSign; 4697 int aExp, bExp, expDiff; 4698 uint64_t aSig, bSig; 4699 uint64_t q, alternateASig; 4700 int64_t sigMean; 4701 4702 a = float64_squash_input_denormal(a, status); 4703 b = float64_squash_input_denormal(b, status); 4704 aSig = extractFloat64Frac( a ); 4705 aExp = extractFloat64Exp( a ); 4706 aSign = extractFloat64Sign( a ); 4707 bSig = extractFloat64Frac( b ); 4708 bExp = extractFloat64Exp( b ); 4709 if ( aExp == 0x7FF ) { 4710 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4711 return propagateFloat64NaN(a, b, status); 4712 } 4713 float_raise(float_flag_invalid, status); 4714 return float64_default_nan(status); 4715 } 4716 if ( bExp == 0x7FF ) { 4717 if (bSig) { 4718 return propagateFloat64NaN(a, b, status); 4719 } 4720 return a; 4721 } 4722 if ( bExp == 0 ) { 4723 if ( bSig == 0 ) { 4724 float_raise(float_flag_invalid, status); 4725 return float64_default_nan(status); 4726 } 4727 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4728 } 4729 if ( aExp == 0 ) { 4730 if ( aSig == 0 ) return a; 4731 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4732 } 4733 expDiff = aExp - bExp; 4734 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 4735 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4736 if ( expDiff < 0 ) { 4737 if ( expDiff < -1 ) return a; 4738 aSig >>= 1; 4739 } 4740 q = ( bSig <= aSig ); 4741 if ( q ) aSig -= bSig; 4742 expDiff -= 64; 4743 while ( 0 < expDiff ) { 4744 q = estimateDiv128To64( aSig, 0, bSig ); 4745 q = ( 2 < q ) ? q - 2 : 0; 4746 aSig = - ( ( bSig>>2 ) * q ); 4747 expDiff -= 62; 4748 } 4749 expDiff += 64; 4750 if ( 0 < expDiff ) { 4751 q = estimateDiv128To64( aSig, 0, bSig ); 4752 q = ( 2 < q ) ? 
q - 2 : 0; 4753 q >>= 64 - expDiff; 4754 bSig >>= 2; 4755 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4756 } 4757 else { 4758 aSig >>= 2; 4759 bSig >>= 2; 4760 } 4761 do { 4762 alternateASig = aSig; 4763 ++q; 4764 aSig -= bSig; 4765 } while ( 0 <= (int64_t) aSig ); 4766 sigMean = aSig + alternateASig; 4767 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4768 aSig = alternateASig; 4769 } 4770 zSign = ( (int64_t) aSig < 0 ); 4771 if ( zSign ) aSig = - aSig; 4772 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 4773 4774 } 4775 4776 /*---------------------------------------------------------------------------- 4777 | Returns the binary log of the double-precision floating-point value `a'. 4778 | The operation is performed according to the IEC/IEEE Standard for Binary 4779 | Floating-Point Arithmetic. 4780 *----------------------------------------------------------------------------*/ 4781 float64 float64_log2(float64 a, float_status *status) 4782 { 4783 flag aSign, zSign; 4784 int aExp; 4785 uint64_t aSig, aSig0, aSig1, zSig, i; 4786 a = float64_squash_input_denormal(a, status); 4787 4788 aSig = extractFloat64Frac( a ); 4789 aExp = extractFloat64Exp( a ); 4790 aSign = extractFloat64Sign( a ); 4791 4792 if ( aExp == 0 ) { 4793 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4794 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4795 } 4796 if ( aSign ) { 4797 float_raise(float_flag_invalid, status); 4798 return float64_default_nan(status); 4799 } 4800 if ( aExp == 0x7FF ) { 4801 if (aSig) { 4802 return propagateFloat64NaN(a, float64_zero, status); 4803 } 4804 return a; 4805 } 4806 4807 aExp -= 0x3FF; 4808 aSig |= LIT64( 0x0010000000000000 ); 4809 zSign = aExp < 0; 4810 zSig = (uint64_t)aExp << 52; 4811 for (i = 1LL << 51; i > 0; i >>= 1) { 4812 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4813 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4814 if ( aSig & LIT64( 0x0020000000000000 ) ) { 4815 aSig >>= 1; 4816 zSig |= i; 4817 } 
4818 } 4819 4820 if ( zSign ) 4821 zSig = -zSig; 4822 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 4823 } 4824 4825 /*---------------------------------------------------------------------------- 4826 | Returns 1 if the double-precision floating-point value `a' is equal to the 4827 | corresponding value `b', and 0 otherwise. The invalid exception is raised 4828 | if either operand is a NaN. Otherwise, the comparison is performed 4829 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4830 *----------------------------------------------------------------------------*/ 4831 4832 int float64_eq(float64 a, float64 b, float_status *status) 4833 { 4834 uint64_t av, bv; 4835 a = float64_squash_input_denormal(a, status); 4836 b = float64_squash_input_denormal(b, status); 4837 4838 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4839 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4840 ) { 4841 float_raise(float_flag_invalid, status); 4842 return 0; 4843 } 4844 av = float64_val(a); 4845 bv = float64_val(b); 4846 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4847 4848 } 4849 4850 /*---------------------------------------------------------------------------- 4851 | Returns 1 if the double-precision floating-point value `a' is less than or 4852 | equal to the corresponding value `b', and 0 otherwise. The invalid 4853 | exception is raised if either operand is a NaN. The comparison is performed 4854 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4855 *----------------------------------------------------------------------------*/ 4856 4857 int float64_le(float64 a, float64 b, float_status *status) 4858 { 4859 flag aSign, bSign; 4860 uint64_t av, bv; 4861 a = float64_squash_input_denormal(a, status); 4862 b = float64_squash_input_denormal(b, status); 4863 4864 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4865 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4866 ) { 4867 float_raise(float_flag_invalid, status); 4868 return 0; 4869 } 4870 aSign = extractFloat64Sign( a ); 4871 bSign = extractFloat64Sign( b ); 4872 av = float64_val(a); 4873 bv = float64_val(b); 4874 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4875 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4876 4877 } 4878 4879 /*---------------------------------------------------------------------------- 4880 | Returns 1 if the double-precision floating-point value `a' is less than 4881 | the corresponding value `b', and 0 otherwise. The invalid exception is 4882 | raised if either operand is a NaN. The comparison is performed according 4883 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4884 *----------------------------------------------------------------------------*/ 4885 4886 int float64_lt(float64 a, float64 b, float_status *status) 4887 { 4888 flag aSign, bSign; 4889 uint64_t av, bv; 4890 4891 a = float64_squash_input_denormal(a, status); 4892 b = float64_squash_input_denormal(b, status); 4893 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4894 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4895 ) { 4896 float_raise(float_flag_invalid, status); 4897 return 0; 4898 } 4899 aSign = extractFloat64Sign( a ); 4900 bSign = extractFloat64Sign( b ); 4901 av = float64_val(a); 4902 bv = float64_val(b); 4903 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4904 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4905 4906 } 4907 4908 /*---------------------------------------------------------------------------- 4909 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4910 | be compared, and 0 otherwise. The invalid exception is raised if either 4911 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4912 | Standard for Binary Floating-Point Arithmetic. 4913 *----------------------------------------------------------------------------*/ 4914 4915 int float64_unordered(float64 a, float64 b, float_status *status) 4916 { 4917 a = float64_squash_input_denormal(a, status); 4918 b = float64_squash_input_denormal(b, status); 4919 4920 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4921 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4922 ) { 4923 float_raise(float_flag_invalid, status); 4924 return 1; 4925 } 4926 return 0; 4927 } 4928 4929 /*---------------------------------------------------------------------------- 4930 | Returns 1 if the double-precision floating-point value `a' is equal to the 4931 | corresponding value `b', and 0 otherwise. 
Quiet NaNs do not cause an 4932 | exception.The comparison is performed according to the IEC/IEEE Standard 4933 | for Binary Floating-Point Arithmetic. 4934 *----------------------------------------------------------------------------*/ 4935 4936 int float64_eq_quiet(float64 a, float64 b, float_status *status) 4937 { 4938 uint64_t av, bv; 4939 a = float64_squash_input_denormal(a, status); 4940 b = float64_squash_input_denormal(b, status); 4941 4942 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4943 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4944 ) { 4945 if (float64_is_signaling_nan(a, status) 4946 || float64_is_signaling_nan(b, status)) { 4947 float_raise(float_flag_invalid, status); 4948 } 4949 return 0; 4950 } 4951 av = float64_val(a); 4952 bv = float64_val(b); 4953 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4954 4955 } 4956 4957 /*---------------------------------------------------------------------------- 4958 | Returns 1 if the double-precision floating-point value `a' is less than or 4959 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4960 | cause an exception. Otherwise, the comparison is performed according to the 4961 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4962 *----------------------------------------------------------------------------*/ 4963 4964 int float64_le_quiet(float64 a, float64 b, float_status *status) 4965 { 4966 flag aSign, bSign; 4967 uint64_t av, bv; 4968 a = float64_squash_input_denormal(a, status); 4969 b = float64_squash_input_denormal(b, status); 4970 4971 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4972 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4973 ) { 4974 if (float64_is_signaling_nan(a, status) 4975 || float64_is_signaling_nan(b, status)) { 4976 float_raise(float_flag_invalid, status); 4977 } 4978 return 0; 4979 } 4980 aSign = extractFloat64Sign( a ); 4981 bSign = extractFloat64Sign( b ); 4982 av = float64_val(a); 4983 bv = float64_val(b); 4984 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4985 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4986 4987 } 4988 4989 /*---------------------------------------------------------------------------- 4990 | Returns 1 if the double-precision floating-point value `a' is less than 4991 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4992 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4993 | Standard for Binary Floating-Point Arithmetic. 
4994 *----------------------------------------------------------------------------*/ 4995 4996 int float64_lt_quiet(float64 a, float64 b, float_status *status) 4997 { 4998 flag aSign, bSign; 4999 uint64_t av, bv; 5000 a = float64_squash_input_denormal(a, status); 5001 b = float64_squash_input_denormal(b, status); 5002 5003 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5004 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5005 ) { 5006 if (float64_is_signaling_nan(a, status) 5007 || float64_is_signaling_nan(b, status)) { 5008 float_raise(float_flag_invalid, status); 5009 } 5010 return 0; 5011 } 5012 aSign = extractFloat64Sign( a ); 5013 bSign = extractFloat64Sign( b ); 5014 av = float64_val(a); 5015 bv = float64_val(b); 5016 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 5017 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 5018 5019 } 5020 5021 /*---------------------------------------------------------------------------- 5022 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 5023 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 5024 | comparison is performed according to the IEC/IEEE Standard for Binary 5025 | Floating-Point Arithmetic. 
5026 *----------------------------------------------------------------------------*/ 5027 5028 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 5029 { 5030 a = float64_squash_input_denormal(a, status); 5031 b = float64_squash_input_denormal(b, status); 5032 5033 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5034 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5035 ) { 5036 if (float64_is_signaling_nan(a, status) 5037 || float64_is_signaling_nan(b, status)) { 5038 float_raise(float_flag_invalid, status); 5039 } 5040 return 1; 5041 } 5042 return 0; 5043 } 5044 5045 /*---------------------------------------------------------------------------- 5046 | Returns the result of converting the extended double-precision floating- 5047 | point value `a' to the 32-bit two's complement integer format. The 5048 | conversion is performed according to the IEC/IEEE Standard for Binary 5049 | Floating-Point Arithmetic---which means in particular that the conversion 5050 | is rounded according to the current rounding mode. If `a' is a NaN, the 5051 | largest positive integer is returned. Otherwise, if the conversion 5052 | overflows, the largest integer with the same sign as `a' is returned. 
5053 *----------------------------------------------------------------------------*/ 5054 5055 int32_t floatx80_to_int32(floatx80 a, float_status *status) 5056 { 5057 flag aSign; 5058 int32_t aExp, shiftCount; 5059 uint64_t aSig; 5060 5061 if (floatx80_invalid_encoding(a)) { 5062 float_raise(float_flag_invalid, status); 5063 return 1 << 31; 5064 } 5065 aSig = extractFloatx80Frac( a ); 5066 aExp = extractFloatx80Exp( a ); 5067 aSign = extractFloatx80Sign( a ); 5068 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5069 shiftCount = 0x4037 - aExp; 5070 if ( shiftCount <= 0 ) shiftCount = 1; 5071 shift64RightJamming( aSig, shiftCount, &aSig ); 5072 return roundAndPackInt32(aSign, aSig, status); 5073 5074 } 5075 5076 /*---------------------------------------------------------------------------- 5077 | Returns the result of converting the extended double-precision floating- 5078 | point value `a' to the 32-bit two's complement integer format. The 5079 | conversion is performed according to the IEC/IEEE Standard for Binary 5080 | Floating-Point Arithmetic, except that the conversion is always rounded 5081 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5082 | Otherwise, if the conversion overflows, the largest integer with the same 5083 | sign as `a' is returned. 
5084 *----------------------------------------------------------------------------*/ 5085 5086 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 5087 { 5088 flag aSign; 5089 int32_t aExp, shiftCount; 5090 uint64_t aSig, savedASig; 5091 int32_t z; 5092 5093 if (floatx80_invalid_encoding(a)) { 5094 float_raise(float_flag_invalid, status); 5095 return 1 << 31; 5096 } 5097 aSig = extractFloatx80Frac( a ); 5098 aExp = extractFloatx80Exp( a ); 5099 aSign = extractFloatx80Sign( a ); 5100 if ( 0x401E < aExp ) { 5101 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5102 goto invalid; 5103 } 5104 else if ( aExp < 0x3FFF ) { 5105 if (aExp || aSig) { 5106 status->float_exception_flags |= float_flag_inexact; 5107 } 5108 return 0; 5109 } 5110 shiftCount = 0x403E - aExp; 5111 savedASig = aSig; 5112 aSig >>= shiftCount; 5113 z = aSig; 5114 if ( aSign ) z = - z; 5115 if ( ( z < 0 ) ^ aSign ) { 5116 invalid: 5117 float_raise(float_flag_invalid, status); 5118 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5119 } 5120 if ( ( aSig<<shiftCount ) != savedASig ) { 5121 status->float_exception_flags |= float_flag_inexact; 5122 } 5123 return z; 5124 5125 } 5126 5127 /*---------------------------------------------------------------------------- 5128 | Returns the result of converting the extended double-precision floating- 5129 | point value `a' to the 64-bit two's complement integer format. The 5130 | conversion is performed according to the IEC/IEEE Standard for Binary 5131 | Floating-Point Arithmetic---which means in particular that the conversion 5132 | is rounded according to the current rounding mode. If `a' is a NaN, 5133 | the largest positive integer is returned. Otherwise, if the conversion 5134 | overflows, the largest integer with the same sign as `a' is returned. 
5135 *----------------------------------------------------------------------------*/ 5136 5137 int64_t floatx80_to_int64(floatx80 a, float_status *status) 5138 { 5139 flag aSign; 5140 int32_t aExp, shiftCount; 5141 uint64_t aSig, aSigExtra; 5142 5143 if (floatx80_invalid_encoding(a)) { 5144 float_raise(float_flag_invalid, status); 5145 return 1ULL << 63; 5146 } 5147 aSig = extractFloatx80Frac( a ); 5148 aExp = extractFloatx80Exp( a ); 5149 aSign = extractFloatx80Sign( a ); 5150 shiftCount = 0x403E - aExp; 5151 if ( shiftCount <= 0 ) { 5152 if ( shiftCount ) { 5153 float_raise(float_flag_invalid, status); 5154 if (!aSign || floatx80_is_any_nan(a)) { 5155 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5156 } 5157 return (int64_t) LIT64( 0x8000000000000000 ); 5158 } 5159 aSigExtra = 0; 5160 } 5161 else { 5162 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 5163 } 5164 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 5165 5166 } 5167 5168 /*---------------------------------------------------------------------------- 5169 | Returns the result of converting the extended double-precision floating- 5170 | point value `a' to the 64-bit two's complement integer format. The 5171 | conversion is performed according to the IEC/IEEE Standard for Binary 5172 | Floating-Point Arithmetic, except that the conversion is always rounded 5173 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5174 | Otherwise, if the conversion overflows, the largest integer with the same 5175 | sign as `a' is returned. 
5176 *----------------------------------------------------------------------------*/ 5177 5178 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 5179 { 5180 flag aSign; 5181 int32_t aExp, shiftCount; 5182 uint64_t aSig; 5183 int64_t z; 5184 5185 if (floatx80_invalid_encoding(a)) { 5186 float_raise(float_flag_invalid, status); 5187 return 1ULL << 63; 5188 } 5189 aSig = extractFloatx80Frac( a ); 5190 aExp = extractFloatx80Exp( a ); 5191 aSign = extractFloatx80Sign( a ); 5192 shiftCount = aExp - 0x403E; 5193 if ( 0 <= shiftCount ) { 5194 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 5195 if ( ( a.high != 0xC03E ) || aSig ) { 5196 float_raise(float_flag_invalid, status); 5197 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 5198 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5199 } 5200 } 5201 return (int64_t) LIT64( 0x8000000000000000 ); 5202 } 5203 else if ( aExp < 0x3FFF ) { 5204 if (aExp | aSig) { 5205 status->float_exception_flags |= float_flag_inexact; 5206 } 5207 return 0; 5208 } 5209 z = aSig>>( - shiftCount ); 5210 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 5211 status->float_exception_flags |= float_flag_inexact; 5212 } 5213 if ( aSign ) z = - z; 5214 return z; 5215 5216 } 5217 5218 /*---------------------------------------------------------------------------- 5219 | Returns the result of converting the extended double-precision floating- 5220 | point value `a' to the single-precision floating-point format. The 5221 | conversion is performed according to the IEC/IEEE Standard for Binary 5222 | Floating-Point Arithmetic. 
5223 *----------------------------------------------------------------------------*/ 5224 5225 float32 floatx80_to_float32(floatx80 a, float_status *status) 5226 { 5227 flag aSign; 5228 int32_t aExp; 5229 uint64_t aSig; 5230 5231 if (floatx80_invalid_encoding(a)) { 5232 float_raise(float_flag_invalid, status); 5233 return float32_default_nan(status); 5234 } 5235 aSig = extractFloatx80Frac( a ); 5236 aExp = extractFloatx80Exp( a ); 5237 aSign = extractFloatx80Sign( a ); 5238 if ( aExp == 0x7FFF ) { 5239 if ( (uint64_t) ( aSig<<1 ) ) { 5240 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 5241 } 5242 return packFloat32( aSign, 0xFF, 0 ); 5243 } 5244 shift64RightJamming( aSig, 33, &aSig ); 5245 if ( aExp || aSig ) aExp -= 0x3F81; 5246 return roundAndPackFloat32(aSign, aExp, aSig, status); 5247 5248 } 5249 5250 /*---------------------------------------------------------------------------- 5251 | Returns the result of converting the extended double-precision floating- 5252 | point value `a' to the double-precision floating-point format. The 5253 | conversion is performed according to the IEC/IEEE Standard for Binary 5254 | Floating-Point Arithmetic. 
5255 *----------------------------------------------------------------------------*/ 5256 5257 float64 floatx80_to_float64(floatx80 a, float_status *status) 5258 { 5259 flag aSign; 5260 int32_t aExp; 5261 uint64_t aSig, zSig; 5262 5263 if (floatx80_invalid_encoding(a)) { 5264 float_raise(float_flag_invalid, status); 5265 return float64_default_nan(status); 5266 } 5267 aSig = extractFloatx80Frac( a ); 5268 aExp = extractFloatx80Exp( a ); 5269 aSign = extractFloatx80Sign( a ); 5270 if ( aExp == 0x7FFF ) { 5271 if ( (uint64_t) ( aSig<<1 ) ) { 5272 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 5273 } 5274 return packFloat64( aSign, 0x7FF, 0 ); 5275 } 5276 shift64RightJamming( aSig, 1, &zSig ); 5277 if ( aExp || aSig ) aExp -= 0x3C01; 5278 return roundAndPackFloat64(aSign, aExp, zSig, status); 5279 5280 } 5281 5282 /*---------------------------------------------------------------------------- 5283 | Returns the result of converting the extended double-precision floating- 5284 | point value `a' to the quadruple-precision floating-point format. The 5285 | conversion is performed according to the IEC/IEEE Standard for Binary 5286 | Floating-Point Arithmetic. 
5287 *----------------------------------------------------------------------------*/ 5288 5289 float128 floatx80_to_float128(floatx80 a, float_status *status) 5290 { 5291 flag aSign; 5292 int aExp; 5293 uint64_t aSig, zSig0, zSig1; 5294 5295 if (floatx80_invalid_encoding(a)) { 5296 float_raise(float_flag_invalid, status); 5297 return float128_default_nan(status); 5298 } 5299 aSig = extractFloatx80Frac( a ); 5300 aExp = extractFloatx80Exp( a ); 5301 aSign = extractFloatx80Sign( a ); 5302 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5303 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 5304 } 5305 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5306 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5307 5308 } 5309 5310 /*---------------------------------------------------------------------------- 5311 | Rounds the extended double-precision floating-point value `a' 5312 | to the precision provided by floatx80_rounding_precision and returns the 5313 | result as an extended double-precision floating-point value. 5314 | The operation is performed according to the IEC/IEEE Standard for Binary 5315 | Floating-Point Arithmetic. 5316 *----------------------------------------------------------------------------*/ 5317 5318 floatx80 floatx80_round(floatx80 a, float_status *status) 5319 { 5320 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5321 extractFloatx80Sign(a), 5322 extractFloatx80Exp(a), 5323 extractFloatx80Frac(a), 0, status); 5324 } 5325 5326 /*---------------------------------------------------------------------------- 5327 | Rounds the extended double-precision floating-point value `a' to an integer, 5328 | and returns the result as an extended quadruple-precision floating-point 5329 | value. The operation is performed according to the IEC/IEEE Standard for 5330 | Binary Floating-Point Arithmetic. 
5331 *----------------------------------------------------------------------------*/ 5332 5333 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 5334 { 5335 flag aSign; 5336 int32_t aExp; 5337 uint64_t lastBitMask, roundBitsMask; 5338 floatx80 z; 5339 5340 if (floatx80_invalid_encoding(a)) { 5341 float_raise(float_flag_invalid, status); 5342 return floatx80_default_nan(status); 5343 } 5344 aExp = extractFloatx80Exp( a ); 5345 if ( 0x403E <= aExp ) { 5346 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 5347 return propagateFloatx80NaN(a, a, status); 5348 } 5349 return a; 5350 } 5351 if ( aExp < 0x3FFF ) { 5352 if ( ( aExp == 0 ) 5353 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 5354 return a; 5355 } 5356 status->float_exception_flags |= float_flag_inexact; 5357 aSign = extractFloatx80Sign( a ); 5358 switch (status->float_rounding_mode) { 5359 case float_round_nearest_even: 5360 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 5361 ) { 5362 return 5363 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5364 } 5365 break; 5366 case float_round_ties_away: 5367 if (aExp == 0x3FFE) { 5368 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 5369 } 5370 break; 5371 case float_round_down: 5372 return 5373 aSign ? 5374 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 5375 : packFloatx80( 0, 0, 0 ); 5376 case float_round_up: 5377 return 5378 aSign ? 
packFloatx80( 1, 0, 0 ) 5379 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5380 } 5381 return packFloatx80( aSign, 0, 0 ); 5382 } 5383 lastBitMask = 1; 5384 lastBitMask <<= 0x403E - aExp; 5385 roundBitsMask = lastBitMask - 1; 5386 z = a; 5387 switch (status->float_rounding_mode) { 5388 case float_round_nearest_even: 5389 z.low += lastBitMask>>1; 5390 if ((z.low & roundBitsMask) == 0) { 5391 z.low &= ~lastBitMask; 5392 } 5393 break; 5394 case float_round_ties_away: 5395 z.low += lastBitMask >> 1; 5396 break; 5397 case float_round_to_zero: 5398 break; 5399 case float_round_up: 5400 if (!extractFloatx80Sign(z)) { 5401 z.low += roundBitsMask; 5402 } 5403 break; 5404 case float_round_down: 5405 if (extractFloatx80Sign(z)) { 5406 z.low += roundBitsMask; 5407 } 5408 break; 5409 default: 5410 abort(); 5411 } 5412 z.low &= ~ roundBitsMask; 5413 if ( z.low == 0 ) { 5414 ++z.high; 5415 z.low = LIT64( 0x8000000000000000 ); 5416 } 5417 if (z.low != a.low) { 5418 status->float_exception_flags |= float_flag_inexact; 5419 } 5420 return z; 5421 5422 } 5423 5424 /*---------------------------------------------------------------------------- 5425 | Returns the result of adding the absolute values of the extended double- 5426 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 5427 | negated before being returned. `zSign' is ignored if the result is a NaN. 5428 | The addition is performed according to the IEC/IEEE Standard for Binary 5429 | Floating-Point Arithmetic. 
5430 *----------------------------------------------------------------------------*/ 5431 5432 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5433 float_status *status) 5434 { 5435 int32_t aExp, bExp, zExp; 5436 uint64_t aSig, bSig, zSig0, zSig1; 5437 int32_t expDiff; 5438 5439 aSig = extractFloatx80Frac( a ); 5440 aExp = extractFloatx80Exp( a ); 5441 bSig = extractFloatx80Frac( b ); 5442 bExp = extractFloatx80Exp( b ); 5443 expDiff = aExp - bExp; 5444 if ( 0 < expDiff ) { 5445 if ( aExp == 0x7FFF ) { 5446 if ((uint64_t)(aSig << 1)) { 5447 return propagateFloatx80NaN(a, b, status); 5448 } 5449 return a; 5450 } 5451 if ( bExp == 0 ) --expDiff; 5452 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5453 zExp = aExp; 5454 } 5455 else if ( expDiff < 0 ) { 5456 if ( bExp == 0x7FFF ) { 5457 if ((uint64_t)(bSig << 1)) { 5458 return propagateFloatx80NaN(a, b, status); 5459 } 5460 return packFloatx80(zSign, 5461 floatx80_infinity_high, 5462 floatx80_infinity_low); 5463 } 5464 if ( aExp == 0 ) ++expDiff; 5465 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5466 zExp = bExp; 5467 } 5468 else { 5469 if ( aExp == 0x7FFF ) { 5470 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5471 return propagateFloatx80NaN(a, b, status); 5472 } 5473 return a; 5474 } 5475 zSig1 = 0; 5476 zSig0 = aSig + bSig; 5477 if ( aExp == 0 ) { 5478 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 5479 goto roundAndPack; 5480 } 5481 zExp = aExp; 5482 goto shiftRight1; 5483 } 5484 zSig0 = aSig + bSig; 5485 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 5486 shiftRight1: 5487 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5488 zSig0 |= LIT64( 0x8000000000000000 ); 5489 ++zExp; 5490 roundAndPack: 5491 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5492 zSign, zExp, zSig0, zSig1, status); 5493 } 5494 5495 /*---------------------------------------------------------------------------- 5496 | Returns the result of subtracting the absolute 
values of the extended 5497 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 5498 | difference is negated before being returned. `zSign' is ignored if the 5499 | result is a NaN. The subtraction is performed according to the IEC/IEEE 5500 | Standard for Binary Floating-Point Arithmetic. 5501 *----------------------------------------------------------------------------*/ 5502 5503 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5504 float_status *status) 5505 { 5506 int32_t aExp, bExp, zExp; 5507 uint64_t aSig, bSig, zSig0, zSig1; 5508 int32_t expDiff; 5509 5510 aSig = extractFloatx80Frac( a ); 5511 aExp = extractFloatx80Exp( a ); 5512 bSig = extractFloatx80Frac( b ); 5513 bExp = extractFloatx80Exp( b ); 5514 expDiff = aExp - bExp; 5515 if ( 0 < expDiff ) goto aExpBigger; 5516 if ( expDiff < 0 ) goto bExpBigger; 5517 if ( aExp == 0x7FFF ) { 5518 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5519 return propagateFloatx80NaN(a, b, status); 5520 } 5521 float_raise(float_flag_invalid, status); 5522 return floatx80_default_nan(status); 5523 } 5524 if ( aExp == 0 ) { 5525 aExp = 1; 5526 bExp = 1; 5527 } 5528 zSig1 = 0; 5529 if ( bSig < aSig ) goto aBigger; 5530 if ( aSig < bSig ) goto bBigger; 5531 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 5532 bExpBigger: 5533 if ( bExp == 0x7FFF ) { 5534 if ((uint64_t)(bSig << 1)) { 5535 return propagateFloatx80NaN(a, b, status); 5536 } 5537 return packFloatx80(zSign ^ 1, floatx80_infinity_high, 5538 floatx80_infinity_low); 5539 } 5540 if ( aExp == 0 ) ++expDiff; 5541 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5542 bBigger: 5543 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 5544 zExp = bExp; 5545 zSign ^= 1; 5546 goto normalizeRoundAndPack; 5547 aExpBigger: 5548 if ( aExp == 0x7FFF ) { 5549 if ((uint64_t)(aSig << 1)) { 5550 return propagateFloatx80NaN(a, b, status); 5551 } 5552 return a; 5553 } 5554 if ( bExp == 0 ) --expDiff; 5555 
shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5556 aBigger: 5557 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 5558 zExp = aExp; 5559 normalizeRoundAndPack: 5560 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 5561 zSign, zExp, zSig0, zSig1, status); 5562 } 5563 5564 /*---------------------------------------------------------------------------- 5565 | Returns the result of adding the extended double-precision floating-point 5566 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5567 | Standard for Binary Floating-Point Arithmetic. 5568 *----------------------------------------------------------------------------*/ 5569 5570 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 5571 { 5572 flag aSign, bSign; 5573 5574 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5575 float_raise(float_flag_invalid, status); 5576 return floatx80_default_nan(status); 5577 } 5578 aSign = extractFloatx80Sign( a ); 5579 bSign = extractFloatx80Sign( b ); 5580 if ( aSign == bSign ) { 5581 return addFloatx80Sigs(a, b, aSign, status); 5582 } 5583 else { 5584 return subFloatx80Sigs(a, b, aSign, status); 5585 } 5586 5587 } 5588 5589 /*---------------------------------------------------------------------------- 5590 | Returns the result of subtracting the extended double-precision floating- 5591 | point values `a' and `b'. The operation is performed according to the 5592 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5593 *----------------------------------------------------------------------------*/ 5594 5595 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5596 { 5597 flag aSign, bSign; 5598 5599 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5600 float_raise(float_flag_invalid, status); 5601 return floatx80_default_nan(status); 5602 } 5603 aSign = extractFloatx80Sign( a ); 5604 bSign = extractFloatx80Sign( b ); 5605 if ( aSign == bSign ) { 5606 return subFloatx80Sigs(a, b, aSign, status); 5607 } 5608 else { 5609 return addFloatx80Sigs(a, b, aSign, status); 5610 } 5611 5612 } 5613 5614 /*---------------------------------------------------------------------------- 5615 | Returns the result of multiplying the extended double-precision floating- 5616 | point values `a' and `b'. The operation is performed according to the 5617 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5618 *----------------------------------------------------------------------------*/ 5619 5620 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5621 { 5622 flag aSign, bSign, zSign; 5623 int32_t aExp, bExp, zExp; 5624 uint64_t aSig, bSig, zSig0, zSig1; 5625 5626 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5627 float_raise(float_flag_invalid, status); 5628 return floatx80_default_nan(status); 5629 } 5630 aSig = extractFloatx80Frac( a ); 5631 aExp = extractFloatx80Exp( a ); 5632 aSign = extractFloatx80Sign( a ); 5633 bSig = extractFloatx80Frac( b ); 5634 bExp = extractFloatx80Exp( b ); 5635 bSign = extractFloatx80Sign( b ); 5636 zSign = aSign ^ bSign; 5637 if ( aExp == 0x7FFF ) { 5638 if ( (uint64_t) ( aSig<<1 ) 5639 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5640 return propagateFloatx80NaN(a, b, status); 5641 } 5642 if ( ( bExp | bSig ) == 0 ) goto invalid; 5643 return packFloatx80(zSign, floatx80_infinity_high, 5644 floatx80_infinity_low); 5645 } 5646 if ( bExp == 0x7FFF ) { 5647 if ((uint64_t)(bSig << 
1)) { 5648 return propagateFloatx80NaN(a, b, status); 5649 } 5650 if ( ( aExp | aSig ) == 0 ) { 5651 invalid: 5652 float_raise(float_flag_invalid, status); 5653 return floatx80_default_nan(status); 5654 } 5655 return packFloatx80(zSign, floatx80_infinity_high, 5656 floatx80_infinity_low); 5657 } 5658 if ( aExp == 0 ) { 5659 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5660 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5661 } 5662 if ( bExp == 0 ) { 5663 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5664 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5665 } 5666 zExp = aExp + bExp - 0x3FFE; 5667 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 5668 if ( 0 < (int64_t) zSig0 ) { 5669 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5670 --zExp; 5671 } 5672 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5673 zSign, zExp, zSig0, zSig1, status); 5674 } 5675 5676 /*---------------------------------------------------------------------------- 5677 | Returns the result of dividing the extended double-precision floating-point 5678 | value `a' by the corresponding value `b'. The operation is performed 5679 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5680 *----------------------------------------------------------------------------*/ 5681 5682 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 5683 { 5684 flag aSign, bSign, zSign; 5685 int32_t aExp, bExp, zExp; 5686 uint64_t aSig, bSig, zSig0, zSig1; 5687 uint64_t rem0, rem1, rem2, term0, term1, term2; 5688 5689 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5690 float_raise(float_flag_invalid, status); 5691 return floatx80_default_nan(status); 5692 } 5693 aSig = extractFloatx80Frac( a ); 5694 aExp = extractFloatx80Exp( a ); 5695 aSign = extractFloatx80Sign( a ); 5696 bSig = extractFloatx80Frac( b ); 5697 bExp = extractFloatx80Exp( b ); 5698 bSign = extractFloatx80Sign( b ); 5699 zSign = aSign ^ bSign; 5700 if ( aExp == 0x7FFF ) { 5701 if ((uint64_t)(aSig << 1)) { 5702 return propagateFloatx80NaN(a, b, status); 5703 } 5704 if ( bExp == 0x7FFF ) { 5705 if ((uint64_t)(bSig << 1)) { 5706 return propagateFloatx80NaN(a, b, status); 5707 } 5708 goto invalid; 5709 } 5710 return packFloatx80(zSign, floatx80_infinity_high, 5711 floatx80_infinity_low); 5712 } 5713 if ( bExp == 0x7FFF ) { 5714 if ((uint64_t)(bSig << 1)) { 5715 return propagateFloatx80NaN(a, b, status); 5716 } 5717 return packFloatx80( zSign, 0, 0 ); 5718 } 5719 if ( bExp == 0 ) { 5720 if ( bSig == 0 ) { 5721 if ( ( aExp | aSig ) == 0 ) { 5722 invalid: 5723 float_raise(float_flag_invalid, status); 5724 return floatx80_default_nan(status); 5725 } 5726 float_raise(float_flag_divbyzero, status); 5727 return packFloatx80(zSign, floatx80_infinity_high, 5728 floatx80_infinity_low); 5729 } 5730 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5731 } 5732 if ( aExp == 0 ) { 5733 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5734 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5735 } 5736 zExp = aExp - bExp + 0x3FFE; 5737 rem1 = 0; 5738 if ( bSig <= aSig ) { 5739 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 5740 ++zExp; 5741 } 5742 zSig0 = estimateDiv128To64( aSig, 
rem1, bSig ); 5743 mul64To128( bSig, zSig0, &term0, &term1 ); 5744 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 5745 while ( (int64_t) rem0 < 0 ) { 5746 --zSig0; 5747 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 5748 } 5749 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 5750 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 5751 mul64To128( bSig, zSig1, &term1, &term2 ); 5752 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5753 while ( (int64_t) rem1 < 0 ) { 5754 --zSig1; 5755 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5756 } 5757 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5758 } 5759 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5760 zSign, zExp, zSig0, zSig1, status); 5761 } 5762 5763 /*---------------------------------------------------------------------------- 5764 | Returns the remainder of the extended double-precision floating-point value 5765 | `a' with respect to the corresponding value `b'. The operation is performed 5766 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5767 *----------------------------------------------------------------------------*/ 5768 5769 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 5770 { 5771 flag aSign, zSign; 5772 int32_t aExp, bExp, expDiff; 5773 uint64_t aSig0, aSig1, bSig; 5774 uint64_t q, term0, term1, alternateASig0, alternateASig1; 5775 5776 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5777 float_raise(float_flag_invalid, status); 5778 return floatx80_default_nan(status); 5779 } 5780 aSig0 = extractFloatx80Frac( a ); 5781 aExp = extractFloatx80Exp( a ); 5782 aSign = extractFloatx80Sign( a ); 5783 bSig = extractFloatx80Frac( b ); 5784 bExp = extractFloatx80Exp( b ); 5785 if ( aExp == 0x7FFF ) { 5786 if ( (uint64_t) ( aSig0<<1 ) 5787 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5788 return propagateFloatx80NaN(a, b, status); 5789 } 5790 goto invalid; 5791 } 5792 if ( bExp == 0x7FFF ) { 5793 if ((uint64_t)(bSig << 1)) { 5794 return propagateFloatx80NaN(a, b, status); 5795 } 5796 return a; 5797 } 5798 if ( bExp == 0 ) { 5799 if ( bSig == 0 ) { 5800 invalid: 5801 float_raise(float_flag_invalid, status); 5802 return floatx80_default_nan(status); 5803 } 5804 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5805 } 5806 if ( aExp == 0 ) { 5807 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5808 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5809 } 5810 bSig |= LIT64( 0x8000000000000000 ); 5811 zSign = aSign; 5812 expDiff = aExp - bExp; 5813 aSig1 = 0; 5814 if ( expDiff < 0 ) { 5815 if ( expDiff < -1 ) return a; 5816 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 5817 expDiff = 0; 5818 } 5819 q = ( bSig <= aSig0 ); 5820 if ( q ) aSig0 -= bSig; 5821 expDiff -= 64; 5822 while ( 0 < expDiff ) { 5823 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5824 q = ( 2 < q ) ? 
q - 2 : 0; 5825 mul64To128( bSig, q, &term0, &term1 ); 5826 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5827 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 5828 expDiff -= 62; 5829 } 5830 expDiff += 64; 5831 if ( 0 < expDiff ) { 5832 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5833 q = ( 2 < q ) ? q - 2 : 0; 5834 q >>= 64 - expDiff; 5835 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 5836 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5837 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 5838 while ( le128( term0, term1, aSig0, aSig1 ) ) { 5839 ++q; 5840 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5841 } 5842 } 5843 else { 5844 term1 = 0; 5845 term0 = bSig; 5846 } 5847 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 5848 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5849 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5850 && ( q & 1 ) ) 5851 ) { 5852 aSig0 = alternateASig0; 5853 aSig1 = alternateASig1; 5854 zSign = ! zSign; 5855 } 5856 return 5857 normalizeRoundAndPackFloatx80( 5858 80, zSign, bExp + expDiff, aSig0, aSig1, status); 5859 5860 } 5861 5862 /*---------------------------------------------------------------------------- 5863 | Returns the square root of the extended double-precision floating-point 5864 | value `a'. The operation is performed according to the IEC/IEEE Standard 5865 | for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

floatx80 floatx80_sqrt(floatx80 a, float_status *status)
{
    flag aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    /* An unsupported encoding raises invalid and yields the default NaN. */
    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        /* NaN propagates; +infinity is returned as is; -infinity is invalid. */
        if ((uint64_t)(aSig0 << 1)) {
            return propagateFloatx80NaN(a, a, status);
        }
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* Negative zero is returned unchanged; other negatives are invalid. */
        if ( ( aExp | aSig0 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    /* Halve the unbiased exponent, then re-apply the bias. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
    /* Seed from estimateSqrt32(), then refine with a division estimate. */
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* rem = a - zSig0^2; step zSig0 down while the remainder is negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low half of the root, estimated from the remainder. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    /* Only when the low bits of the estimate are small is it checked exactly. */
    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Any non-zero remainder is recorded as a sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= doubleZSig0;
    /* The result sign is always 0 here; negative inputs returned earlier. */
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                0, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is equal
| to the corresponding value `b', and 0 otherwise. The invalid exception is
| raised if either operand is a NaN. Otherwise, the comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
{

    /* Unsupported encodings and NaNs (exponent 0x7FFF, non-zero fraction) */
    /* in either operand raise invalid and compare unequal.                */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
        || (extractFloatx80Exp(a) == 0x7FFF
            && (uint64_t) (extractFloatx80Frac(a) << 1))
        || (extractFloatx80Exp(b) == 0x7FFF
            && (uint64_t) (extractFloatx80Frac(b) << 1))
       ) {
        float_raise(float_flag_invalid, status);
        return 0;
    }
    /* Equal when bit-identical, or when both are zeros of either sign */
    /* (the sign bit is shifted out of the 16-bit high word).          */
    return
           ( a.low == b.low )
        && (    ( a.high == b.high )
             || (    ( a.low == 0 )
                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
           );

}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is
| less than or equal to the corresponding value `b', and 0 otherwise. The
| invalid exception is raised if either operand is a NaN. The comparison is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
5967 *----------------------------------------------------------------------------*/ 5968 5969 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 5970 { 5971 flag aSign, bSign; 5972 5973 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5974 || (extractFloatx80Exp(a) == 0x7FFF 5975 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5976 || (extractFloatx80Exp(b) == 0x7FFF 5977 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5978 ) { 5979 float_raise(float_flag_invalid, status); 5980 return 0; 5981 } 5982 aSign = extractFloatx80Sign( a ); 5983 bSign = extractFloatx80Sign( b ); 5984 if ( aSign != bSign ) { 5985 return 5986 aSign 5987 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5988 == 0 ); 5989 } 5990 return 5991 aSign ? le128( b.high, b.low, a.high, a.low ) 5992 : le128( a.high, a.low, b.high, b.low ); 5993 5994 } 5995 5996 /*---------------------------------------------------------------------------- 5997 | Returns 1 if the extended double-precision floating-point value `a' is 5998 | less than the corresponding value `b', and 0 otherwise. The invalid 5999 | exception is raised if either operand is a NaN. The comparison is performed 6000 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6001 *----------------------------------------------------------------------------*/ 6002 6003 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 6004 { 6005 flag aSign, bSign; 6006 6007 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6008 || (extractFloatx80Exp(a) == 0x7FFF 6009 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6010 || (extractFloatx80Exp(b) == 0x7FFF 6011 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6012 ) { 6013 float_raise(float_flag_invalid, status); 6014 return 0; 6015 } 6016 aSign = extractFloatx80Sign( a ); 6017 bSign = extractFloatx80Sign( b ); 6018 if ( aSign != bSign ) { 6019 return 6020 aSign 6021 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6022 != 0 ); 6023 } 6024 return 6025 aSign ? lt128( b.high, b.low, a.high, a.low ) 6026 : lt128( a.high, a.low, b.high, b.low ); 6027 6028 } 6029 6030 /*---------------------------------------------------------------------------- 6031 | Returns 1 if the extended double-precision floating-point values `a' and `b' 6032 | cannot be compared, and 0 otherwise. The invalid exception is raised if 6033 | either operand is a NaN. The comparison is performed according to the 6034 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6035 *----------------------------------------------------------------------------*/ 6036 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 6037 { 6038 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6039 || (extractFloatx80Exp(a) == 0x7FFF 6040 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6041 || (extractFloatx80Exp(b) == 0x7FFF 6042 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6043 ) { 6044 float_raise(float_flag_invalid, status); 6045 return 1; 6046 } 6047 return 0; 6048 } 6049 6050 /*---------------------------------------------------------------------------- 6051 | Returns 1 if the extended double-precision floating-point value `a' is 6052 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 6053 | cause an exception. The comparison is performed according to the IEC/IEEE 6054 | Standard for Binary Floating-Point Arithmetic. 6055 *----------------------------------------------------------------------------*/ 6056 6057 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 6058 { 6059 6060 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6061 float_raise(float_flag_invalid, status); 6062 return 0; 6063 } 6064 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6065 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6066 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6067 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6068 ) { 6069 if (floatx80_is_signaling_nan(a, status) 6070 || floatx80_is_signaling_nan(b, status)) { 6071 float_raise(float_flag_invalid, status); 6072 } 6073 return 0; 6074 } 6075 return 6076 ( a.low == b.low ) 6077 && ( ( a.high == b.high ) 6078 || ( ( a.low == 0 ) 6079 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6080 ); 6081 6082 } 6083 6084 /*---------------------------------------------------------------------------- 6085 | Returns 1 if the extended double-precision floating-point value `a' is less 6086 | than or equal to the corresponding value `b', and 0 
otherwise. Quiet NaNs 6087 | do not cause an exception. Otherwise, the comparison is performed according 6088 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6089 *----------------------------------------------------------------------------*/ 6090 6091 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status) 6092 { 6093 flag aSign, bSign; 6094 6095 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6096 float_raise(float_flag_invalid, status); 6097 return 0; 6098 } 6099 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6100 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6101 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6102 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6103 ) { 6104 if (floatx80_is_signaling_nan(a, status) 6105 || floatx80_is_signaling_nan(b, status)) { 6106 float_raise(float_flag_invalid, status); 6107 } 6108 return 0; 6109 } 6110 aSign = extractFloatx80Sign( a ); 6111 bSign = extractFloatx80Sign( b ); 6112 if ( aSign != bSign ) { 6113 return 6114 aSign 6115 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6116 == 0 ); 6117 } 6118 return 6119 aSign ? le128( b.high, b.low, a.high, a.low ) 6120 : le128( a.high, a.low, b.high, b.low ); 6121 6122 } 6123 6124 /*---------------------------------------------------------------------------- 6125 | Returns 1 if the extended double-precision floating-point value `a' is less 6126 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 6127 | an exception. Otherwise, the comparison is performed according to the 6128 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6129 *----------------------------------------------------------------------------*/ 6130 6131 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 6132 { 6133 flag aSign, bSign; 6134 6135 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6136 float_raise(float_flag_invalid, status); 6137 return 0; 6138 } 6139 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6140 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6141 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6142 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6143 ) { 6144 if (floatx80_is_signaling_nan(a, status) 6145 || floatx80_is_signaling_nan(b, status)) { 6146 float_raise(float_flag_invalid, status); 6147 } 6148 return 0; 6149 } 6150 aSign = extractFloatx80Sign( a ); 6151 bSign = extractFloatx80Sign( b ); 6152 if ( aSign != bSign ) { 6153 return 6154 aSign 6155 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6156 != 0 ); 6157 } 6158 return 6159 aSign ? lt128( b.high, b.low, a.high, a.low ) 6160 : lt128( a.high, a.low, b.high, b.low ); 6161 6162 } 6163 6164 /*---------------------------------------------------------------------------- 6165 | Returns 1 if the extended double-precision floating-point values `a' and `b' 6166 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 6167 | The comparison is performed according to the IEC/IEEE Standard for Binary 6168 | Floating-Point Arithmetic. 
6169 *----------------------------------------------------------------------------*/ 6170 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 6171 { 6172 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6173 float_raise(float_flag_invalid, status); 6174 return 1; 6175 } 6176 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6177 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6178 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6179 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6180 ) { 6181 if (floatx80_is_signaling_nan(a, status) 6182 || floatx80_is_signaling_nan(b, status)) { 6183 float_raise(float_flag_invalid, status); 6184 } 6185 return 1; 6186 } 6187 return 0; 6188 } 6189 6190 /*---------------------------------------------------------------------------- 6191 | Returns the result of converting the quadruple-precision floating-point 6192 | value `a' to the 32-bit two's complement integer format. The conversion 6193 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6194 | Arithmetic---which means in particular that the conversion is rounded 6195 | according to the current rounding mode. If `a' is a NaN, the largest 6196 | positive integer is returned. Otherwise, if the conversion overflows, the 6197 | largest integer with the same sign as `a' is returned. 
6198 *----------------------------------------------------------------------------*/ 6199 6200 int32_t float128_to_int32(float128 a, float_status *status) 6201 { 6202 flag aSign; 6203 int32_t aExp, shiftCount; 6204 uint64_t aSig0, aSig1; 6205 6206 aSig1 = extractFloat128Frac1( a ); 6207 aSig0 = extractFloat128Frac0( a ); 6208 aExp = extractFloat128Exp( a ); 6209 aSign = extractFloat128Sign( a ); 6210 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 6211 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6212 aSig0 |= ( aSig1 != 0 ); 6213 shiftCount = 0x4028 - aExp; 6214 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 6215 return roundAndPackInt32(aSign, aSig0, status); 6216 6217 } 6218 6219 /*---------------------------------------------------------------------------- 6220 | Returns the result of converting the quadruple-precision floating-point 6221 | value `a' to the 32-bit two's complement integer format. The conversion 6222 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6223 | Arithmetic, except that the conversion is always rounded toward zero. If 6224 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 6225 | conversion overflows, the largest integer with the same sign as `a' is 6226 | returned. 
6227 *----------------------------------------------------------------------------*/ 6228 6229 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 6230 { 6231 flag aSign; 6232 int32_t aExp, shiftCount; 6233 uint64_t aSig0, aSig1, savedASig; 6234 int32_t z; 6235 6236 aSig1 = extractFloat128Frac1( a ); 6237 aSig0 = extractFloat128Frac0( a ); 6238 aExp = extractFloat128Exp( a ); 6239 aSign = extractFloat128Sign( a ); 6240 aSig0 |= ( aSig1 != 0 ); 6241 if ( 0x401E < aExp ) { 6242 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 6243 goto invalid; 6244 } 6245 else if ( aExp < 0x3FFF ) { 6246 if (aExp || aSig0) { 6247 status->float_exception_flags |= float_flag_inexact; 6248 } 6249 return 0; 6250 } 6251 aSig0 |= LIT64( 0x0001000000000000 ); 6252 shiftCount = 0x402F - aExp; 6253 savedASig = aSig0; 6254 aSig0 >>= shiftCount; 6255 z = aSig0; 6256 if ( aSign ) z = - z; 6257 if ( ( z < 0 ) ^ aSign ) { 6258 invalid: 6259 float_raise(float_flag_invalid, status); 6260 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 6261 } 6262 if ( ( aSig0<<shiftCount ) != savedASig ) { 6263 status->float_exception_flags |= float_flag_inexact; 6264 } 6265 return z; 6266 6267 } 6268 6269 /*---------------------------------------------------------------------------- 6270 | Returns the result of converting the quadruple-precision floating-point 6271 | value `a' to the 64-bit two's complement integer format. The conversion 6272 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6273 | Arithmetic---which means in particular that the conversion is rounded 6274 | according to the current rounding mode. If `a' is a NaN, the largest 6275 | positive integer is returned. Otherwise, if the conversion overflows, the 6276 | largest integer with the same sign as `a' is returned. 
6277 *----------------------------------------------------------------------------*/ 6278 6279 int64_t float128_to_int64(float128 a, float_status *status) 6280 { 6281 flag aSign; 6282 int32_t aExp, shiftCount; 6283 uint64_t aSig0, aSig1; 6284 6285 aSig1 = extractFloat128Frac1( a ); 6286 aSig0 = extractFloat128Frac0( a ); 6287 aExp = extractFloat128Exp( a ); 6288 aSign = extractFloat128Sign( a ); 6289 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6290 shiftCount = 0x402F - aExp; 6291 if ( shiftCount <= 0 ) { 6292 if ( 0x403E < aExp ) { 6293 float_raise(float_flag_invalid, status); 6294 if ( ! aSign 6295 || ( ( aExp == 0x7FFF ) 6296 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 6297 ) 6298 ) { 6299 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6300 } 6301 return (int64_t) LIT64( 0x8000000000000000 ); 6302 } 6303 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 6304 } 6305 else { 6306 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 6307 } 6308 return roundAndPackInt64(aSign, aSig0, aSig1, status); 6309 6310 } 6311 6312 /*---------------------------------------------------------------------------- 6313 | Returns the result of converting the quadruple-precision floating-point 6314 | value `a' to the 64-bit two's complement integer format. The conversion 6315 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6316 | Arithmetic, except that the conversion is always rounded toward zero. 6317 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 6318 | the conversion overflows, the largest integer with the same sign as `a' is 6319 | returned. 
*----------------------------------------------------------------------------*/

int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;
    int64_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Non-zero exponent: make the implicit integer bit explicit. */
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
    /* Positive: the high word moves left and bits of aSig1 enter the result. */
    /* Zero or negative: the integer part comes from aSig0 alone.            */
    shiftCount = aExp - 0x402F;
    if ( 0 < shiftCount ) {
        if ( 0x403E <= aExp ) {
            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
            /* The only accepted case has sign set, exponent 0x403E, a zero */
            /* high fraction and a low word below 2^49; it returns the most */
            /* negative integer, inexact if the low word is non-zero.       */
            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
                if (aSig1) {
                    status->float_exception_flags |= float_flag_inexact;
                }
            }
            else {
                /* Overflow, infinity or NaN: invalid.  Positive operands */
                /* and NaNs saturate to the maximum, the rest fall through */
                /* to the minimum below.                                   */
                float_raise(float_flag_invalid, status);
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
                }
            }
            return (int64_t) LIT64( 0x8000000000000000 );
        }
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
        /* Bits of aSig1 left below the binary point were discarded. */
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
            status->float_exception_flags |= float_flag_inexact;
        }
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Magnitude below 1: result 0, inexact unless `a' is zero. */
            if ( aExp | aSig0 | aSig1 ) {
                status->float_exception_flags |= float_flag_inexact;
            }
            return 0;
        }
        z = aSig0>>( - shiftCount );
        /* Inexact if any bit of aSig1, or any bit shifted out of aSig0, */
        /* is set.                                                       */
        if (    aSig1
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
            status->float_exception_flags |= float_flag_inexact;
        }
    }
    if ( aSign ) z = - z;
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point value
| `a' to the 64-bit unsigned integer format.
The conversion is 6378 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6379 | Arithmetic---which means in particular that the conversion is rounded 6380 | according to the current rounding mode. If `a' is a NaN, the largest 6381 | positive integer is returned. If the conversion overflows, the 6382 | largest unsigned integer is returned. If 'a' is negative, the value is 6383 | rounded and zero is returned; negative values that do not round to zero 6384 | will raise the inexact exception. 6385 *----------------------------------------------------------------------------*/ 6386 6387 uint64_t float128_to_uint64(float128 a, float_status *status) 6388 { 6389 flag aSign; 6390 int aExp; 6391 int shiftCount; 6392 uint64_t aSig0, aSig1; 6393 6394 aSig0 = extractFloat128Frac0(a); 6395 aSig1 = extractFloat128Frac1(a); 6396 aExp = extractFloat128Exp(a); 6397 aSign = extractFloat128Sign(a); 6398 if (aSign && (aExp > 0x3FFE)) { 6399 float_raise(float_flag_invalid, status); 6400 if (float128_is_any_nan(a)) { 6401 return LIT64(0xFFFFFFFFFFFFFFFF); 6402 } else { 6403 return 0; 6404 } 6405 } 6406 if (aExp) { 6407 aSig0 |= LIT64(0x0001000000000000); 6408 } 6409 shiftCount = 0x402F - aExp; 6410 if (shiftCount <= 0) { 6411 if (0x403E < aExp) { 6412 float_raise(float_flag_invalid, status); 6413 return LIT64(0xFFFFFFFFFFFFFFFF); 6414 } 6415 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 6416 } else { 6417 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 6418 } 6419 return roundAndPackUint64(aSign, aSig0, aSig1, status); 6420 } 6421 6422 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 6423 { 6424 uint64_t v; 6425 signed char current_rounding_mode = status->float_rounding_mode; 6426 6427 set_float_rounding_mode(float_round_to_zero, status); 6428 v = float128_to_uint64(a, status); 6429 set_float_rounding_mode(current_rounding_mode, status); 6430 6431 return v; 6432 } 6433 6434 
/*---------------------------------------------------------------------------- 6435 | Returns the result of converting the quadruple-precision floating-point 6436 | value `a' to the 32-bit unsigned integer format. The conversion 6437 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6438 | Arithmetic except that the conversion is always rounded toward zero. 6439 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6440 | if the conversion overflows, the largest unsigned integer is returned. 6441 | If 'a' is negative, the value is rounded and zero is returned; negative 6442 | values that do not round to zero will raise the inexact exception. 6443 *----------------------------------------------------------------------------*/ 6444 6445 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6446 { 6447 uint64_t v; 6448 uint32_t res; 6449 int old_exc_flags = get_float_exception_flags(status); 6450 6451 v = float128_to_uint64_round_to_zero(a, status); 6452 if (v > 0xffffffff) { 6453 res = 0xffffffff; 6454 } else { 6455 return v; 6456 } 6457 set_float_exception_flags(old_exc_flags, status); 6458 float_raise(float_flag_invalid, status); 6459 return res; 6460 } 6461 6462 /*---------------------------------------------------------------------------- 6463 | Returns the result of converting the quadruple-precision floating-point 6464 | value `a' to the single-precision floating-point format. The conversion 6465 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6466 | Arithmetic. 
6467 *----------------------------------------------------------------------------*/ 6468 6469 float32 float128_to_float32(float128 a, float_status *status) 6470 { 6471 flag aSign; 6472 int32_t aExp; 6473 uint64_t aSig0, aSig1; 6474 uint32_t zSig; 6475 6476 aSig1 = extractFloat128Frac1( a ); 6477 aSig0 = extractFloat128Frac0( a ); 6478 aExp = extractFloat128Exp( a ); 6479 aSign = extractFloat128Sign( a ); 6480 if ( aExp == 0x7FFF ) { 6481 if ( aSig0 | aSig1 ) { 6482 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6483 } 6484 return packFloat32( aSign, 0xFF, 0 ); 6485 } 6486 aSig0 |= ( aSig1 != 0 ); 6487 shift64RightJamming( aSig0, 18, &aSig0 ); 6488 zSig = aSig0; 6489 if ( aExp || zSig ) { 6490 zSig |= 0x40000000; 6491 aExp -= 0x3F81; 6492 } 6493 return roundAndPackFloat32(aSign, aExp, zSig, status); 6494 6495 } 6496 6497 /*---------------------------------------------------------------------------- 6498 | Returns the result of converting the quadruple-precision floating-point 6499 | value `a' to the double-precision floating-point format. The conversion 6500 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6501 | Arithmetic. 
6502 *----------------------------------------------------------------------------*/ 6503 6504 float64 float128_to_float64(float128 a, float_status *status) 6505 { 6506 flag aSign; 6507 int32_t aExp; 6508 uint64_t aSig0, aSig1; 6509 6510 aSig1 = extractFloat128Frac1( a ); 6511 aSig0 = extractFloat128Frac0( a ); 6512 aExp = extractFloat128Exp( a ); 6513 aSign = extractFloat128Sign( a ); 6514 if ( aExp == 0x7FFF ) { 6515 if ( aSig0 | aSig1 ) { 6516 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6517 } 6518 return packFloat64( aSign, 0x7FF, 0 ); 6519 } 6520 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6521 aSig0 |= ( aSig1 != 0 ); 6522 if ( aExp || aSig0 ) { 6523 aSig0 |= LIT64( 0x4000000000000000 ); 6524 aExp -= 0x3C01; 6525 } 6526 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6527 6528 } 6529 6530 /*---------------------------------------------------------------------------- 6531 | Returns the result of converting the quadruple-precision floating-point 6532 | value `a' to the extended double-precision floating-point format. The 6533 | conversion is performed according to the IEC/IEEE Standard for Binary 6534 | Floating-Point Arithmetic. 
6535 *----------------------------------------------------------------------------*/ 6536 6537 floatx80 float128_to_floatx80(float128 a, float_status *status) 6538 { 6539 flag aSign; 6540 int32_t aExp; 6541 uint64_t aSig0, aSig1; 6542 6543 aSig1 = extractFloat128Frac1( a ); 6544 aSig0 = extractFloat128Frac0( a ); 6545 aExp = extractFloat128Exp( a ); 6546 aSign = extractFloat128Sign( a ); 6547 if ( aExp == 0x7FFF ) { 6548 if ( aSig0 | aSig1 ) { 6549 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 6550 } 6551 return packFloatx80(aSign, floatx80_infinity_high, 6552 floatx80_infinity_low); 6553 } 6554 if ( aExp == 0 ) { 6555 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6556 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6557 } 6558 else { 6559 aSig0 |= LIT64( 0x0001000000000000 ); 6560 } 6561 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6562 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6563 6564 } 6565 6566 /*---------------------------------------------------------------------------- 6567 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6568 | returns the result as a quadruple-precision floating-point value. The 6569 | operation is performed according to the IEC/IEEE Standard for Binary 6570 | Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float128 float128_round_to_int(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        /* Every fractional bit, if any, lies in the low word. */
        if ( 0x406F <= aExp ) {
            /* No fractional bits remain; this also covers infinity/NaN. */
            if (    ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            return a;
        }
        /* Shifted in two steps so that aExp == 0x402F (64 in total) gives */
        /* 0 rather than an out-of-range shift; 0 means the units bit is   */
        /* the least significant bit of the high word.                     */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                /* Add half a unit; on an exact tie clear the units bit. */
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /* The half bit is the top bit of the low word. */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            /* Only positive values move away from zero. */
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            /* Only negative values move away from zero. */
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        /* Drop the fractional bits. */
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Magnitude below 1: zeros are returned unchanged; anything */
            /* else is inexact and becomes a signed 0 or 1.              */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            status->float_exception_flags |= float_flag_inexact;
            aSign = extractFloat128Sign( a );
            switch (status->float_rounding_mode) {
             case float_round_nearest_even:
                /* Exponent 0x3FFE with a non-zero fraction rounds to 1. */
                if (    ( aExp == 0x3FFE )
                     && (   extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                /* Exponent 0x3FFE rounds to 1 regardless of the fraction. */
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
             case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
             case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );
            }
            /* Remaining cases, including modes with no case above. */
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* The units bit lies in the high word; the low word is fraction. */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            /* Exact tie: clear the units bit. */
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                /* A non-zero low word acts as a sticky bit for the add. */
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        z.high &= ~ roundBitsMask;
    }
    /* Any change to the value means the rounding was inexact. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the quadruple-precision
| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
| before being returned. `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
6710 *----------------------------------------------------------------------------*/ 6711 6712 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign, 6713 float_status *status) 6714 { 6715 int32_t aExp, bExp, zExp; 6716 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6717 int32_t expDiff; 6718 6719 aSig1 = extractFloat128Frac1( a ); 6720 aSig0 = extractFloat128Frac0( a ); 6721 aExp = extractFloat128Exp( a ); 6722 bSig1 = extractFloat128Frac1( b ); 6723 bSig0 = extractFloat128Frac0( b ); 6724 bExp = extractFloat128Exp( b ); 6725 expDiff = aExp - bExp; 6726 if ( 0 < expDiff ) { 6727 if ( aExp == 0x7FFF ) { 6728 if (aSig0 | aSig1) { 6729 return propagateFloat128NaN(a, b, status); 6730 } 6731 return a; 6732 } 6733 if ( bExp == 0 ) { 6734 --expDiff; 6735 } 6736 else { 6737 bSig0 |= LIT64( 0x0001000000000000 ); 6738 } 6739 shift128ExtraRightJamming( 6740 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 6741 zExp = aExp; 6742 } 6743 else if ( expDiff < 0 ) { 6744 if ( bExp == 0x7FFF ) { 6745 if (bSig0 | bSig1) { 6746 return propagateFloat128NaN(a, b, status); 6747 } 6748 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6749 } 6750 if ( aExp == 0 ) { 6751 ++expDiff; 6752 } 6753 else { 6754 aSig0 |= LIT64( 0x0001000000000000 ); 6755 } 6756 shift128ExtraRightJamming( 6757 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 6758 zExp = bExp; 6759 } 6760 else { 6761 if ( aExp == 0x7FFF ) { 6762 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6763 return propagateFloat128NaN(a, b, status); 6764 } 6765 return a; 6766 } 6767 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6768 if ( aExp == 0 ) { 6769 if (status->flush_to_zero) { 6770 if (zSig0 | zSig1) { 6771 float_raise(float_flag_output_denormal, status); 6772 } 6773 return packFloat128(zSign, 0, 0, 0); 6774 } 6775 return packFloat128( zSign, 0, zSig0, zSig1 ); 6776 } 6777 zSig2 = 0; 6778 zSig0 |= LIT64( 0x0002000000000000 ); 6779 zExp = aExp; 6780 goto shiftRight1; 6781 } 6782 aSig0 |= LIT64( 
0x0001000000000000 ); 6783 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6784 --zExp; 6785 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 6786 ++zExp; 6787 shiftRight1: 6788 shift128ExtraRightJamming( 6789 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6790 roundAndPack: 6791 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6792 6793 } 6794 6795 /*---------------------------------------------------------------------------- 6796 | Returns the result of subtracting the absolute values of the quadruple- 6797 | precision floating-point values `a' and `b'. If `zSign' is 1, the 6798 | difference is negated before being returned. `zSign' is ignored if the 6799 | result is a NaN. The subtraction is performed according to the IEC/IEEE 6800 | Standard for Binary Floating-Point Arithmetic. 6801 *----------------------------------------------------------------------------*/ 6802 6803 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign, 6804 float_status *status) 6805 { 6806 int32_t aExp, bExp, zExp; 6807 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 6808 int32_t expDiff; 6809 6810 aSig1 = extractFloat128Frac1( a ); 6811 aSig0 = extractFloat128Frac0( a ); 6812 aExp = extractFloat128Exp( a ); 6813 bSig1 = extractFloat128Frac1( b ); 6814 bSig0 = extractFloat128Frac0( b ); 6815 bExp = extractFloat128Exp( b ); 6816 expDiff = aExp - bExp; 6817 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6818 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 6819 if ( 0 < expDiff ) goto aExpBigger; 6820 if ( expDiff < 0 ) goto bExpBigger; 6821 if ( aExp == 0x7FFF ) { 6822 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6823 return propagateFloat128NaN(a, b, status); 6824 } 6825 float_raise(float_flag_invalid, status); 6826 return float128_default_nan(status); 6827 } 6828 if ( aExp == 0 ) { 6829 aExp = 1; 6830 bExp = 1; 6831 } 6832 if ( bSig0 < aSig0 ) goto aBigger; 6833 if ( aSig0 < bSig0 ) goto bBigger; 6834 if ( bSig1 < 
aSig1 ) goto aBigger; 6835 if ( aSig1 < bSig1 ) goto bBigger; 6836 return packFloat128(status->float_rounding_mode == float_round_down, 6837 0, 0, 0); 6838 bExpBigger: 6839 if ( bExp == 0x7FFF ) { 6840 if (bSig0 | bSig1) { 6841 return propagateFloat128NaN(a, b, status); 6842 } 6843 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 6844 } 6845 if ( aExp == 0 ) { 6846 ++expDiff; 6847 } 6848 else { 6849 aSig0 |= LIT64( 0x4000000000000000 ); 6850 } 6851 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6852 bSig0 |= LIT64( 0x4000000000000000 ); 6853 bBigger: 6854 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6855 zExp = bExp; 6856 zSign ^= 1; 6857 goto normalizeRoundAndPack; 6858 aExpBigger: 6859 if ( aExp == 0x7FFF ) { 6860 if (aSig0 | aSig1) { 6861 return propagateFloat128NaN(a, b, status); 6862 } 6863 return a; 6864 } 6865 if ( bExp == 0 ) { 6866 --expDiff; 6867 } 6868 else { 6869 bSig0 |= LIT64( 0x4000000000000000 ); 6870 } 6871 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 6872 aSig0 |= LIT64( 0x4000000000000000 ); 6873 aBigger: 6874 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6875 zExp = aExp; 6876 normalizeRoundAndPack: 6877 --zExp; 6878 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, 6879 status); 6880 6881 } 6882 6883 /*---------------------------------------------------------------------------- 6884 | Returns the result of adding the quadruple-precision floating-point values 6885 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 6886 | for Binary Floating-Point Arithmetic. 
6887 *----------------------------------------------------------------------------*/ 6888 6889 float128 float128_add(float128 a, float128 b, float_status *status) 6890 { 6891 flag aSign, bSign; 6892 6893 aSign = extractFloat128Sign( a ); 6894 bSign = extractFloat128Sign( b ); 6895 if ( aSign == bSign ) { 6896 return addFloat128Sigs(a, b, aSign, status); 6897 } 6898 else { 6899 return subFloat128Sigs(a, b, aSign, status); 6900 } 6901 6902 } 6903 6904 /*---------------------------------------------------------------------------- 6905 | Returns the result of subtracting the quadruple-precision floating-point 6906 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6907 | Standard for Binary Floating-Point Arithmetic. 6908 *----------------------------------------------------------------------------*/ 6909 6910 float128 float128_sub(float128 a, float128 b, float_status *status) 6911 { 6912 flag aSign, bSign; 6913 6914 aSign = extractFloat128Sign( a ); 6915 bSign = extractFloat128Sign( b ); 6916 if ( aSign == bSign ) { 6917 return subFloat128Sigs(a, b, aSign, status); 6918 } 6919 else { 6920 return addFloat128Sigs(a, b, aSign, status); 6921 } 6922 6923 } 6924 6925 /*---------------------------------------------------------------------------- 6926 | Returns the result of multiplying the quadruple-precision floating-point 6927 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6928 | Standard for Binary Floating-Point Arithmetic. 
6929 *----------------------------------------------------------------------------*/ 6930 6931 float128 float128_mul(float128 a, float128 b, float_status *status) 6932 { 6933 flag aSign, bSign, zSign; 6934 int32_t aExp, bExp, zExp; 6935 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 6936 6937 aSig1 = extractFloat128Frac1( a ); 6938 aSig0 = extractFloat128Frac0( a ); 6939 aExp = extractFloat128Exp( a ); 6940 aSign = extractFloat128Sign( a ); 6941 bSig1 = extractFloat128Frac1( b ); 6942 bSig0 = extractFloat128Frac0( b ); 6943 bExp = extractFloat128Exp( b ); 6944 bSign = extractFloat128Sign( b ); 6945 zSign = aSign ^ bSign; 6946 if ( aExp == 0x7FFF ) { 6947 if ( ( aSig0 | aSig1 ) 6948 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6949 return propagateFloat128NaN(a, b, status); 6950 } 6951 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 6952 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6953 } 6954 if ( bExp == 0x7FFF ) { 6955 if (bSig0 | bSig1) { 6956 return propagateFloat128NaN(a, b, status); 6957 } 6958 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6959 invalid: 6960 float_raise(float_flag_invalid, status); 6961 return float128_default_nan(status); 6962 } 6963 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6964 } 6965 if ( aExp == 0 ) { 6966 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6967 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6968 } 6969 if ( bExp == 0 ) { 6970 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6971 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6972 } 6973 zExp = aExp + bExp - 0x4000; 6974 aSig0 |= LIT64( 0x0001000000000000 ); 6975 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 6976 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 6977 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6978 zSig2 |= ( zSig3 != 0 ); 6979 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 6980 shift128ExtraRightJamming( 6981 
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6982 ++zExp; 6983 } 6984 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6985 6986 } 6987 6988 /*---------------------------------------------------------------------------- 6989 | Returns the result of dividing the quadruple-precision floating-point value 6990 | `a' by the corresponding value `b'. The operation is performed according to 6991 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6992 *----------------------------------------------------------------------------*/ 6993 6994 float128 float128_div(float128 a, float128 b, float_status *status) 6995 { 6996 flag aSign, bSign, zSign; 6997 int32_t aExp, bExp, zExp; 6998 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6999 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 7000 7001 aSig1 = extractFloat128Frac1( a ); 7002 aSig0 = extractFloat128Frac0( a ); 7003 aExp = extractFloat128Exp( a ); 7004 aSign = extractFloat128Sign( a ); 7005 bSig1 = extractFloat128Frac1( b ); 7006 bSig0 = extractFloat128Frac0( b ); 7007 bExp = extractFloat128Exp( b ); 7008 bSign = extractFloat128Sign( b ); 7009 zSign = aSign ^ bSign; 7010 if ( aExp == 0x7FFF ) { 7011 if (aSig0 | aSig1) { 7012 return propagateFloat128NaN(a, b, status); 7013 } 7014 if ( bExp == 0x7FFF ) { 7015 if (bSig0 | bSig1) { 7016 return propagateFloat128NaN(a, b, status); 7017 } 7018 goto invalid; 7019 } 7020 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7021 } 7022 if ( bExp == 0x7FFF ) { 7023 if (bSig0 | bSig1) { 7024 return propagateFloat128NaN(a, b, status); 7025 } 7026 return packFloat128( zSign, 0, 0, 0 ); 7027 } 7028 if ( bExp == 0 ) { 7029 if ( ( bSig0 | bSig1 ) == 0 ) { 7030 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 7031 invalid: 7032 float_raise(float_flag_invalid, status); 7033 return float128_default_nan(status); 7034 } 7035 float_raise(float_flag_divbyzero, status); 7036 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7037 } 7038 normalizeFloat128Subnormal( 
bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 7039 } 7040 if ( aExp == 0 ) { 7041 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 7042 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7043 } 7044 zExp = aExp - bExp + 0x3FFD; 7045 shortShift128Left( 7046 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 7047 shortShift128Left( 7048 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 7049 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 7050 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 7051 ++zExp; 7052 } 7053 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7054 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 7055 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 7056 while ( (int64_t) rem0 < 0 ) { 7057 --zSig0; 7058 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 7059 } 7060 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 7061 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 7062 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 7063 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 7064 while ( (int64_t) rem1 < 0 ) { 7065 --zSig1; 7066 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 7067 } 7068 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 7069 } 7070 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 7071 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 7072 7073 } 7074 7075 /*---------------------------------------------------------------------------- 7076 | Returns the remainder of the quadruple-precision floating-point value `a' 7077 | with respect to the corresponding value `b'. The operation is performed 7078 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7079 *----------------------------------------------------------------------------*/ 7080 7081 float128 float128_rem(float128 a, float128 b, float_status *status) 7082 { 7083 flag aSign, zSign; 7084 int32_t aExp, bExp, expDiff; 7085 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 7086 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 7087 int64_t sigMean0; 7088 7089 aSig1 = extractFloat128Frac1( a ); 7090 aSig0 = extractFloat128Frac0( a ); 7091 aExp = extractFloat128Exp( a ); 7092 aSign = extractFloat128Sign( a ); 7093 bSig1 = extractFloat128Frac1( b ); 7094 bSig0 = extractFloat128Frac0( b ); 7095 bExp = extractFloat128Exp( b ); 7096 if ( aExp == 0x7FFF ) { 7097 if ( ( aSig0 | aSig1 ) 7098 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 7099 return propagateFloat128NaN(a, b, status); 7100 } 7101 goto invalid; 7102 } 7103 if ( bExp == 0x7FFF ) { 7104 if (bSig0 | bSig1) { 7105 return propagateFloat128NaN(a, b, status); 7106 } 7107 return a; 7108 } 7109 if ( bExp == 0 ) { 7110 if ( ( bSig0 | bSig1 ) == 0 ) { 7111 invalid: 7112 float_raise(float_flag_invalid, status); 7113 return float128_default_nan(status); 7114 } 7115 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 7116 } 7117 if ( aExp == 0 ) { 7118 if ( ( aSig0 | aSig1 ) == 0 ) return a; 7119 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7120 } 7121 expDiff = aExp - bExp; 7122 if ( expDiff < -1 ) return a; 7123 shortShift128Left( 7124 aSig0 | LIT64( 0x0001000000000000 ), 7125 aSig1, 7126 15 - ( expDiff < 0 ), 7127 &aSig0, 7128 &aSig1 7129 ); 7130 shortShift128Left( 7131 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 7132 q = le128( bSig0, bSig1, aSig0, aSig1 ); 7133 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 7134 expDiff -= 64; 7135 while ( 0 < expDiff ) { 7136 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7137 q = ( 4 < q ) ? 
q - 4 : 0; 7138 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 7139 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 7140 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 7141 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 7142 expDiff -= 61; 7143 } 7144 if ( -64 < expDiff ) { 7145 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7146 q = ( 4 < q ) ? q - 4 : 0; 7147 q >>= - expDiff; 7148 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 7149 expDiff += 52; 7150 if ( expDiff < 0 ) { 7151 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 7152 } 7153 else { 7154 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 7155 } 7156 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 7157 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 7158 } 7159 else { 7160 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 7161 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 7162 } 7163 do { 7164 alternateASig0 = aSig0; 7165 alternateASig1 = aSig1; 7166 ++q; 7167 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 7168 } while ( 0 <= (int64_t) aSig0 ); 7169 add128( 7170 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 7171 if ( ( sigMean0 < 0 ) 7172 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 7173 aSig0 = alternateASig0; 7174 aSig1 = alternateASig1; 7175 } 7176 zSign = ( (int64_t) aSig0 < 0 ); 7177 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 7178 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 7179 status); 7180 } 7181 7182 /*---------------------------------------------------------------------------- 7183 | Returns the square root of the quadruple-precision floating-point value `a'. 7184 | The operation is performed according to the IEC/IEEE Standard for Binary 7185 | Floating-Point Arithmetic. 
7186 *----------------------------------------------------------------------------*/ 7187 7188 float128 float128_sqrt(float128 a, float_status *status) 7189 { 7190 flag aSign; 7191 int32_t aExp, zExp; 7192 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 7193 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 7194 7195 aSig1 = extractFloat128Frac1( a ); 7196 aSig0 = extractFloat128Frac0( a ); 7197 aExp = extractFloat128Exp( a ); 7198 aSign = extractFloat128Sign( a ); 7199 if ( aExp == 0x7FFF ) { 7200 if (aSig0 | aSig1) { 7201 return propagateFloat128NaN(a, a, status); 7202 } 7203 if ( ! aSign ) return a; 7204 goto invalid; 7205 } 7206 if ( aSign ) { 7207 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 7208 invalid: 7209 float_raise(float_flag_invalid, status); 7210 return float128_default_nan(status); 7211 } 7212 if ( aExp == 0 ) { 7213 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 7214 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7215 } 7216 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 7217 aSig0 |= LIT64( 0x0001000000000000 ); 7218 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 7219 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 7220 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 7221 doubleZSig0 = zSig0<<1; 7222 mul64To128( zSig0, zSig0, &term0, &term1 ); 7223 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 7224 while ( (int64_t) rem0 < 0 ) { 7225 --zSig0; 7226 doubleZSig0 -= 2; 7227 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 7228 } 7229 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 7230 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 7231 if ( zSig1 == 0 ) zSig1 = 1; 7232 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 7233 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 7234 mul64To128( zSig1, zSig1, &term2, &term3 ); 7235 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 7236 while ( (int64_t) rem1 < 0 ) { 7237 --zSig1; 7238 
shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 7239 term3 |= 1; 7240 term2 |= doubleZSig0; 7241 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 7242 } 7243 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 7244 } 7245 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 7246 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 7247 7248 } 7249 7250 /*---------------------------------------------------------------------------- 7251 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 7252 | the corresponding value `b', and 0 otherwise. The invalid exception is 7253 | raised if either operand is a NaN. Otherwise, the comparison is performed 7254 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7255 *----------------------------------------------------------------------------*/ 7256 7257 int float128_eq(float128 a, float128 b, float_status *status) 7258 { 7259 7260 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7261 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7262 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7263 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7264 ) { 7265 float_raise(float_flag_invalid, status); 7266 return 0; 7267 } 7268 return 7269 ( a.low == b.low ) 7270 && ( ( a.high == b.high ) 7271 || ( ( a.low == 0 ) 7272 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 7273 ); 7274 7275 } 7276 7277 /*---------------------------------------------------------------------------- 7278 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7279 | or equal to the corresponding value `b', and 0 otherwise. The invalid 7280 | exception is raised if either operand is a NaN. The comparison is performed 7281 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7282 *----------------------------------------------------------------------------*/ 7283 7284 int float128_le(float128 a, float128 b, float_status *status) 7285 { 7286 flag aSign, bSign; 7287 7288 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7289 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7290 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7291 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7292 ) { 7293 float_raise(float_flag_invalid, status); 7294 return 0; 7295 } 7296 aSign = extractFloat128Sign( a ); 7297 bSign = extractFloat128Sign( b ); 7298 if ( aSign != bSign ) { 7299 return 7300 aSign 7301 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7302 == 0 ); 7303 } 7304 return 7305 aSign ? le128( b.high, b.low, a.high, a.low ) 7306 : le128( a.high, a.low, b.high, b.low ); 7307 7308 } 7309 7310 /*---------------------------------------------------------------------------- 7311 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7312 | the corresponding value `b', and 0 otherwise. The invalid exception is 7313 | raised if either operand is a NaN. The comparison is performed according 7314 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7315 *----------------------------------------------------------------------------*/ 7316 7317 int float128_lt(float128 a, float128 b, float_status *status) 7318 { 7319 flag aSign, bSign; 7320 7321 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7322 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7323 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7324 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7325 ) { 7326 float_raise(float_flag_invalid, status); 7327 return 0; 7328 } 7329 aSign = extractFloat128Sign( a ); 7330 bSign = extractFloat128Sign( b ); 7331 if ( aSign != bSign ) { 7332 return 7333 aSign 7334 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7335 != 0 ); 7336 } 7337 return 7338 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 7339 : lt128( a.high, a.low, b.high, b.low ); 7340 7341 } 7342 7343 /*---------------------------------------------------------------------------- 7344 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7345 | be compared, and 0 otherwise. The invalid exception is raised if either 7346 | operand is a NaN. The comparison is performed according to the IEC/IEEE 7347 | Standard for Binary Floating-Point Arithmetic. 7348 *----------------------------------------------------------------------------*/ 7349 7350 int float128_unordered(float128 a, float128 b, float_status *status) 7351 { 7352 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7353 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7354 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7355 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7356 ) { 7357 float_raise(float_flag_invalid, status); 7358 return 1; 7359 } 7360 return 0; 7361 } 7362 7363 /*---------------------------------------------------------------------------- 7364 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 7365 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7366 | exception. The comparison is performed according to the IEC/IEEE Standard 7367 | for Binary Floating-Point Arithmetic. 
7368 *----------------------------------------------------------------------------*/ 7369 7370 int float128_eq_quiet(float128 a, float128 b, float_status *status) 7371 { 7372 7373 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7374 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7375 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7376 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7377 ) { 7378 if (float128_is_signaling_nan(a, status) 7379 || float128_is_signaling_nan(b, status)) { 7380 float_raise(float_flag_invalid, status); 7381 } 7382 return 0; 7383 } 7384 return 7385 ( a.low == b.low ) 7386 && ( ( a.high == b.high ) 7387 || ( ( a.low == 0 ) 7388 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 7389 ); 7390 7391 } 7392 7393 /*---------------------------------------------------------------------------- 7394 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7395 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 7396 | cause an exception. Otherwise, the comparison is performed according to the 7397 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7398 *----------------------------------------------------------------------------*/ 7399 7400 int float128_le_quiet(float128 a, float128 b, float_status *status) 7401 { 7402 flag aSign, bSign; 7403 7404 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7405 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7406 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7407 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7408 ) { 7409 if (float128_is_signaling_nan(a, status) 7410 || float128_is_signaling_nan(b, status)) { 7411 float_raise(float_flag_invalid, status); 7412 } 7413 return 0; 7414 } 7415 aSign = extractFloat128Sign( a ); 7416 bSign = extractFloat128Sign( b ); 7417 if ( aSign != bSign ) { 7418 return 7419 aSign 7420 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7421 == 0 ); 7422 } 7423 return 7424 aSign ? le128( b.high, b.low, a.high, a.low ) 7425 : le128( a.high, a.low, b.high, b.low ); 7426 7427 } 7428 7429 /*---------------------------------------------------------------------------- 7430 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7431 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7432 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 7433 | Standard for Binary Floating-Point Arithmetic. 
7434 *----------------------------------------------------------------------------*/ 7435 7436 int float128_lt_quiet(float128 a, float128 b, float_status *status) 7437 { 7438 flag aSign, bSign; 7439 7440 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7441 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7442 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7443 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7444 ) { 7445 if (float128_is_signaling_nan(a, status) 7446 || float128_is_signaling_nan(b, status)) { 7447 float_raise(float_flag_invalid, status); 7448 } 7449 return 0; 7450 } 7451 aSign = extractFloat128Sign( a ); 7452 bSign = extractFloat128Sign( b ); 7453 if ( aSign != bSign ) { 7454 return 7455 aSign 7456 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7457 != 0 ); 7458 } 7459 return 7460 aSign ? lt128( b.high, b.low, a.high, a.low ) 7461 : lt128( a.high, a.low, b.high, b.low ); 7462 7463 } 7464 7465 /*---------------------------------------------------------------------------- 7466 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7467 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 7468 | comparison is performed according to the IEC/IEEE Standard for Binary 7469 | Floating-Point Arithmetic. 
7470 *----------------------------------------------------------------------------*/ 7471 7472 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 7473 { 7474 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7475 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7476 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7477 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7478 ) { 7479 if (float128_is_signaling_nan(a, status) 7480 || float128_is_signaling_nan(b, status)) { 7481 float_raise(float_flag_invalid, status); 7482 } 7483 return 1; 7484 } 7485 return 0; 7486 } 7487 7488 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 7489 int is_quiet, float_status *status) 7490 { 7491 flag aSign, bSign; 7492 7493 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 7494 float_raise(float_flag_invalid, status); 7495 return float_relation_unordered; 7496 } 7497 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 7498 ( extractFloatx80Frac( a )<<1 ) ) || 7499 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 7500 ( extractFloatx80Frac( b )<<1 ) )) { 7501 if (!is_quiet || 7502 floatx80_is_signaling_nan(a, status) || 7503 floatx80_is_signaling_nan(b, status)) { 7504 float_raise(float_flag_invalid, status); 7505 } 7506 return float_relation_unordered; 7507 } 7508 aSign = extractFloatx80Sign( a ); 7509 bSign = extractFloatx80Sign( b ); 7510 if ( aSign != bSign ) { 7511 7512 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 7513 ( ( a.low | b.low ) == 0 ) ) { 7514 /* zero case */ 7515 return float_relation_equal; 7516 } else { 7517 return 1 - (2 * aSign); 7518 } 7519 } else { 7520 if (a.low == b.low && a.high == b.high) { 7521 return float_relation_equal; 7522 } else { 7523 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7524 } 7525 } 7526 } 7527 7528 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7529 { 7530 return floatx80_compare_internal(a, b, 0, status); 7531 } 7532 
7533 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 7534 { 7535 return floatx80_compare_internal(a, b, 1, status); 7536 } 7537 7538 static inline int float128_compare_internal(float128 a, float128 b, 7539 int is_quiet, float_status *status) 7540 { 7541 flag aSign, bSign; 7542 7543 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7544 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7545 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7546 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7547 if (!is_quiet || 7548 float128_is_signaling_nan(a, status) || 7549 float128_is_signaling_nan(b, status)) { 7550 float_raise(float_flag_invalid, status); 7551 } 7552 return float_relation_unordered; 7553 } 7554 aSign = extractFloat128Sign( a ); 7555 bSign = extractFloat128Sign( b ); 7556 if ( aSign != bSign ) { 7557 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7558 /* zero case */ 7559 return float_relation_equal; 7560 } else { 7561 return 1 - (2 * aSign); 7562 } 7563 } else { 7564 if (a.low == b.low && a.high == b.high) { 7565 return float_relation_equal; 7566 } else { 7567 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7568 } 7569 } 7570 } 7571 7572 int float128_compare(float128 a, float128 b, float_status *status) 7573 { 7574 return float128_compare_internal(a, b, 0, status); 7575 } 7576 7577 int float128_compare_quiet(float128 a, float128 b, float_status *status) 7578 { 7579 return float128_compare_internal(a, b, 1, status); 7580 } 7581 7582 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7583 { 7584 flag aSign; 7585 int32_t aExp; 7586 uint64_t aSig; 7587 7588 if (floatx80_invalid_encoding(a)) { 7589 float_raise(float_flag_invalid, status); 7590 return floatx80_default_nan(status); 7591 } 7592 aSig = extractFloatx80Frac( a ); 7593 aExp = extractFloatx80Exp( a ); 7594 aSign = extractFloatx80Sign( a ); 7595 7596 if ( aExp == 0x7FFF ) { 7597 if ( aSig<<1 ) { 7598 return 
propagateFloatx80NaN(a, a, status); 7599 } 7600 return a; 7601 } 7602 7603 if (aExp == 0) { 7604 if (aSig == 0) { 7605 return a; 7606 } 7607 aExp++; 7608 } 7609 7610 if (n > 0x10000) { 7611 n = 0x10000; 7612 } else if (n < -0x10000) { 7613 n = -0x10000; 7614 } 7615 7616 aExp += n; 7617 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7618 aSign, aExp, aSig, 0, status); 7619 } 7620 7621 float128 float128_scalbn(float128 a, int n, float_status *status) 7622 { 7623 flag aSign; 7624 int32_t aExp; 7625 uint64_t aSig0, aSig1; 7626 7627 aSig1 = extractFloat128Frac1( a ); 7628 aSig0 = extractFloat128Frac0( a ); 7629 aExp = extractFloat128Exp( a ); 7630 aSign = extractFloat128Sign( a ); 7631 if ( aExp == 0x7FFF ) { 7632 if ( aSig0 | aSig1 ) { 7633 return propagateFloat128NaN(a, a, status); 7634 } 7635 return a; 7636 } 7637 if (aExp != 0) { 7638 aSig0 |= LIT64( 0x0001000000000000 ); 7639 } else if (aSig0 == 0 && aSig1 == 0) { 7640 return a; 7641 } else { 7642 aExp++; 7643 } 7644 7645 if (n > 0x10000) { 7646 n = 0x10000; 7647 } else if (n < -0x10000) { 7648 n = -0x10000; 7649 } 7650 7651 aExp += n - 1; 7652 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7653 , status); 7654 7655 } 7656