1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include <math.h> 87 #include "qemu/bitops.h" 88 #include "fpu/softfloat.h" 89 90 /* We only need stdlib for abort() */ 91 92 /*---------------------------------------------------------------------------- 93 | Primitive arithmetic functions, including multi-word arithmetic, and 94 | division and square root approximations. (Can be specialized to target if 95 | desired.) 96 *----------------------------------------------------------------------------*/ 97 #include "fpu/softfloat-macros.h" 98 99 /* 100 * Hardfloat 101 * 102 * Fast emulation of guest FP instructions is challenging for two reasons. 103 * First, FP instruction semantics are similar but not identical, particularly 104 * when handling NaNs. Second, emulating at reasonable speed the guest FP 105 * exception flags is not trivial: reading the host's flags register with a 106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp], 107 * and trapping on every FP exception is not fast nor pleasant to work with. 108 * 109 * We address these challenges by leveraging the host FPU for a subset of the 110 * operations. 
To do this we expand on the idea presented in this paper: 111 * 112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a 113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615. 114 * 115 * The idea is thus to leverage the host FPU to (1) compute FP operations 116 * and (2) identify whether FP exceptions occurred while avoiding 117 * expensive exception flag register accesses. 118 * 119 * An important optimization shown in the paper is that given that exception 120 * flags are rarely cleared by the guest, we can avoid recomputing some flags. 121 * This is particularly useful for the inexact flag, which is very frequently 122 * raised in floating-point workloads. 123 * 124 * We optimize the code further by deferring to soft-fp whenever FP exception 125 * detection might get hairy. Two examples: (1) when at least one operand is 126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result 127 * and the result is < the minimum normal. 
128 */ 129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \ 130 static inline void name(soft_t *a, float_status *s) \ 131 { \ 132 if (unlikely(soft_t ## _is_denormal(*a))) { \ 133 *a = soft_t ## _set_sign(soft_t ## _zero, \ 134 soft_t ## _is_neg(*a)); \ 135 s->float_exception_flags |= float_flag_input_denormal; \ 136 } \ 137 } 138 139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32) 140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64) 141 #undef GEN_INPUT_FLUSH__NOCHECK 142 143 #define GEN_INPUT_FLUSH1(name, soft_t) \ 144 static inline void name(soft_t *a, float_status *s) \ 145 { \ 146 if (likely(!s->flush_inputs_to_zero)) { \ 147 return; \ 148 } \ 149 soft_t ## _input_flush__nocheck(a, s); \ 150 } 151 152 GEN_INPUT_FLUSH1(float32_input_flush1, float32) 153 GEN_INPUT_FLUSH1(float64_input_flush1, float64) 154 #undef GEN_INPUT_FLUSH1 155 156 #define GEN_INPUT_FLUSH2(name, soft_t) \ 157 static inline void name(soft_t *a, soft_t *b, float_status *s) \ 158 { \ 159 if (likely(!s->flush_inputs_to_zero)) { \ 160 return; \ 161 } \ 162 soft_t ## _input_flush__nocheck(a, s); \ 163 soft_t ## _input_flush__nocheck(b, s); \ 164 } 165 166 GEN_INPUT_FLUSH2(float32_input_flush2, float32) 167 GEN_INPUT_FLUSH2(float64_input_flush2, float64) 168 #undef GEN_INPUT_FLUSH2 169 170 #define GEN_INPUT_FLUSH3(name, soft_t) \ 171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \ 172 { \ 173 if (likely(!s->flush_inputs_to_zero)) { \ 174 return; \ 175 } \ 176 soft_t ## _input_flush__nocheck(a, s); \ 177 soft_t ## _input_flush__nocheck(b, s); \ 178 soft_t ## _input_flush__nocheck(c, s); \ 179 } 180 181 GEN_INPUT_FLUSH3(float32_input_flush3, float32) 182 GEN_INPUT_FLUSH3(float64_input_flush3, float64) 183 #undef GEN_INPUT_FLUSH3 184 185 /* 186 * Choose whether to use fpclassify or float32/64_* primitives in the generated 187 * hardfloat functions. Each combination of number of inputs and float size 188 * gets its own value. 
189 */ 190 #if defined(__x86_64__) 191 # define QEMU_HARDFLOAT_1F32_USE_FP 0 192 # define QEMU_HARDFLOAT_1F64_USE_FP 1 193 # define QEMU_HARDFLOAT_2F32_USE_FP 0 194 # define QEMU_HARDFLOAT_2F64_USE_FP 1 195 # define QEMU_HARDFLOAT_3F32_USE_FP 0 196 # define QEMU_HARDFLOAT_3F64_USE_FP 1 197 #else 198 # define QEMU_HARDFLOAT_1F32_USE_FP 0 199 # define QEMU_HARDFLOAT_1F64_USE_FP 0 200 # define QEMU_HARDFLOAT_2F32_USE_FP 0 201 # define QEMU_HARDFLOAT_2F64_USE_FP 0 202 # define QEMU_HARDFLOAT_3F32_USE_FP 0 203 # define QEMU_HARDFLOAT_3F64_USE_FP 0 204 #endif 205 206 /* 207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over 208 * float{32,64}_is_infinity when !USE_FP. 209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup. 210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%. 211 */ 212 #if defined(__x86_64__) || defined(__aarch64__) 213 # define QEMU_HARDFLOAT_USE_ISINF 1 214 #else 215 # define QEMU_HARDFLOAT_USE_ISINF 0 216 #endif 217 218 /* 219 * Some targets clear the FP flags before most FP operations. This prevents 220 * the use of hardfloat, since hardfloat relies on the inexact flag being 221 * already set. 222 */ 223 #if defined(TARGET_PPC) || defined(__FAST_MATH__) 224 # if defined(__FAST_MATH__) 225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \ 226 IEEE implementation 227 # endif 228 # define QEMU_NO_HARDFLOAT 1 229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN 230 #else 231 # define QEMU_NO_HARDFLOAT 0 232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline)) 233 #endif 234 235 static inline bool can_use_fpu(const float_status *s) 236 { 237 if (QEMU_NO_HARDFLOAT) { 238 return false; 239 } 240 return likely(s->float_exception_flags & float_flag_inexact && 241 s->float_rounding_mode == float_round_nearest_even); 242 } 243 244 /* 245 * Hardfloat generation functions. Each operation can have two flavors: 246 * either using softfloat primitives (e.g. 
float32_is_zero_or_normal) for 247 * most condition checks, or native ones (e.g. fpclassify). 248 * 249 * The flavor is chosen by the callers. Instead of using macros, we rely on the 250 * compiler to propagate constants and inline everything into the callers. 251 * 252 * We only generate functions for operations with two inputs, since only 253 * these are common enough to justify consolidating them into common code. 254 */ 255 256 typedef union { 257 float32 s; 258 float h; 259 } union_float32; 260 261 typedef union { 262 float64 s; 263 double h; 264 } union_float64; 265 266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b); 267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b); 268 269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s); 270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s); 271 typedef float (*hard_f32_op2_fn)(float a, float b); 272 typedef double (*hard_f64_op2_fn)(double a, double b); 273 274 /* 2-input is-zero-or-normal */ 275 static inline bool f32_is_zon2(union_float32 a, union_float32 b) 276 { 277 if (QEMU_HARDFLOAT_2F32_USE_FP) { 278 /* 279 * Not using a temp variable for consecutive fpclassify calls ends up 280 * generating faster code. 
281 */ 282 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 283 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO); 284 } 285 return float32_is_zero_or_normal(a.s) && 286 float32_is_zero_or_normal(b.s); 287 } 288 289 static inline bool f64_is_zon2(union_float64 a, union_float64 b) 290 { 291 if (QEMU_HARDFLOAT_2F64_USE_FP) { 292 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 293 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO); 294 } 295 return float64_is_zero_or_normal(a.s) && 296 float64_is_zero_or_normal(b.s); 297 } 298 299 /* 3-input is-zero-or-normal */ 300 static inline 301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c) 302 { 303 if (QEMU_HARDFLOAT_3F32_USE_FP) { 304 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 305 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) && 306 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO); 307 } 308 return float32_is_zero_or_normal(a.s) && 309 float32_is_zero_or_normal(b.s) && 310 float32_is_zero_or_normal(c.s); 311 } 312 313 static inline 314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c) 315 { 316 if (QEMU_HARDFLOAT_3F64_USE_FP) { 317 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 318 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) && 319 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO); 320 } 321 return float64_is_zero_or_normal(a.s) && 322 float64_is_zero_or_normal(b.s) && 323 float64_is_zero_or_normal(c.s); 324 } 325 326 static inline bool f32_is_inf(union_float32 a) 327 { 328 if (QEMU_HARDFLOAT_USE_ISINF) { 329 return isinf(a.h); 330 } 331 return float32_is_infinity(a.s); 332 } 333 334 static inline bool f64_is_inf(union_float64 a) 335 { 336 if (QEMU_HARDFLOAT_USE_ISINF) { 337 return isinf(a.h); 338 } 339 return float64_is_infinity(a.s); 340 } 341 342 /* Note: @fast_test and @post can be NULL */ 343 static inline 
float32 344 float32_gen2(float32 xa, float32 xb, float_status *s, 345 hard_f32_op2_fn hard, soft_f32_op2_fn soft, 346 f32_check_fn pre, f32_check_fn post, 347 f32_check_fn fast_test, soft_f32_op2_fn fast_op) 348 { 349 union_float32 ua, ub, ur; 350 351 ua.s = xa; 352 ub.s = xb; 353 354 if (unlikely(!can_use_fpu(s))) { 355 goto soft; 356 } 357 358 float32_input_flush2(&ua.s, &ub.s, s); 359 if (unlikely(!pre(ua, ub))) { 360 goto soft; 361 } 362 if (fast_test && fast_test(ua, ub)) { 363 return fast_op(ua.s, ub.s, s); 364 } 365 366 ur.h = hard(ua.h, ub.h); 367 if (unlikely(f32_is_inf(ur))) { 368 s->float_exception_flags |= float_flag_overflow; 369 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) { 370 if (post == NULL || post(ua, ub)) { 371 goto soft; 372 } 373 } 374 return ur.s; 375 376 soft: 377 return soft(ua.s, ub.s, s); 378 } 379 380 static inline float64 381 float64_gen2(float64 xa, float64 xb, float_status *s, 382 hard_f64_op2_fn hard, soft_f64_op2_fn soft, 383 f64_check_fn pre, f64_check_fn post, 384 f64_check_fn fast_test, soft_f64_op2_fn fast_op) 385 { 386 union_float64 ua, ub, ur; 387 388 ua.s = xa; 389 ub.s = xb; 390 391 if (unlikely(!can_use_fpu(s))) { 392 goto soft; 393 } 394 395 float64_input_flush2(&ua.s, &ub.s, s); 396 if (unlikely(!pre(ua, ub))) { 397 goto soft; 398 } 399 if (fast_test && fast_test(ua, ub)) { 400 return fast_op(ua.s, ub.s, s); 401 } 402 403 ur.h = hard(ua.h, ub.h); 404 if (unlikely(f64_is_inf(ur))) { 405 s->float_exception_flags |= float_flag_overflow; 406 } else if (unlikely(fabs(ur.h) <= DBL_MIN)) { 407 if (post == NULL || post(ua, ub)) { 408 goto soft; 409 } 410 } 411 return ur.s; 412 413 soft: 414 return soft(ua.s, ub.s, s); 415 } 416 417 /*---------------------------------------------------------------------------- 418 | Returns the fraction bits of the half-precision floating-point value `a'. 
419 *----------------------------------------------------------------------------*/ 420 421 static inline uint32_t extractFloat16Frac(float16 a) 422 { 423 return float16_val(a) & 0x3ff; 424 } 425 426 /*---------------------------------------------------------------------------- 427 | Returns the exponent bits of the half-precision floating-point value `a'. 428 *----------------------------------------------------------------------------*/ 429 430 static inline int extractFloat16Exp(float16 a) 431 { 432 return (float16_val(a) >> 10) & 0x1f; 433 } 434 435 /*---------------------------------------------------------------------------- 436 | Returns the fraction bits of the single-precision floating-point value `a'. 437 *----------------------------------------------------------------------------*/ 438 439 static inline uint32_t extractFloat32Frac(float32 a) 440 { 441 return float32_val(a) & 0x007FFFFF; 442 } 443 444 /*---------------------------------------------------------------------------- 445 | Returns the exponent bits of the single-precision floating-point value `a'. 446 *----------------------------------------------------------------------------*/ 447 448 static inline int extractFloat32Exp(float32 a) 449 { 450 return (float32_val(a) >> 23) & 0xFF; 451 } 452 453 /*---------------------------------------------------------------------------- 454 | Returns the sign bit of the single-precision floating-point value `a'. 455 *----------------------------------------------------------------------------*/ 456 457 static inline flag extractFloat32Sign(float32 a) 458 { 459 return float32_val(a) >> 31; 460 } 461 462 /*---------------------------------------------------------------------------- 463 | Returns the fraction bits of the double-precision floating-point value `a'. 
464 *----------------------------------------------------------------------------*/ 465 466 static inline uint64_t extractFloat64Frac(float64 a) 467 { 468 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF); 469 } 470 471 /*---------------------------------------------------------------------------- 472 | Returns the exponent bits of the double-precision floating-point value `a'. 473 *----------------------------------------------------------------------------*/ 474 475 static inline int extractFloat64Exp(float64 a) 476 { 477 return (float64_val(a) >> 52) & 0x7FF; 478 } 479 480 /*---------------------------------------------------------------------------- 481 | Returns the sign bit of the double-precision floating-point value `a'. 482 *----------------------------------------------------------------------------*/ 483 484 static inline flag extractFloat64Sign(float64 a) 485 { 486 return float64_val(a) >> 63; 487 } 488 489 /* 490 * Classify a floating point number. Everything above float_class_qnan 491 * is a NaN so cls >= float_class_qnan is any NaN. 492 */ 493 494 typedef enum __attribute__ ((__packed__)) { 495 float_class_unclassified, 496 float_class_zero, 497 float_class_normal, 498 float_class_inf, 499 float_class_qnan, /* all NaNs from here */ 500 float_class_snan, 501 } FloatClass; 502 503 /* Simple helpers for checking if, or what kind of, NaN we have */ 504 static inline __attribute__((unused)) bool is_nan(FloatClass c) 505 { 506 return unlikely(c >= float_class_qnan); 507 } 508 509 static inline __attribute__((unused)) bool is_snan(FloatClass c) 510 { 511 return c == float_class_snan; 512 } 513 514 static inline __attribute__((unused)) bool is_qnan(FloatClass c) 515 { 516 return c == float_class_qnan; 517 } 518 519 /* 520 * Structure holding all of the decomposed parts of a float. The 521 * exponent is unbiased and the fraction is normalized. All 522 * calculations are done with a 64 bit fraction and then rounded as 523 * appropriate for the final format. 
524 * 525 * Thanks to the packed FloatClass a decent compiler should be able to 526 * fit the whole structure into registers and avoid using the stack 527 * for parameter passing. 528 */ 529 530 typedef struct { 531 uint64_t frac; 532 int32_t exp; 533 FloatClass cls; 534 bool sign; 535 } FloatParts; 536 537 #define DECOMPOSED_BINARY_POINT (64 - 2) 538 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) 539 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1) 540 541 /* Structure holding all of the relevant parameters for a format. 542 * exp_size: the size of the exponent field 543 * exp_bias: the offset applied to the exponent field 544 * exp_max: the maximum normalised exponent 545 * frac_size: the size of the fraction field 546 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 547 * The following are computed based the size of fraction 548 * frac_lsb: least significant bit of fraction 549 * frac_lsbm1: the bit below the least significant bit (for rounding) 550 * round_mask/roundeven_mask: masks used for rounding 551 * The following optional modifiers are available: 552 * arm_althp: handle ARM Alternative Half Precision 553 */ 554 typedef struct { 555 int exp_size; 556 int exp_bias; 557 int exp_max; 558 int frac_size; 559 int frac_shift; 560 uint64_t frac_lsb; 561 uint64_t frac_lsbm1; 562 uint64_t round_mask; 563 uint64_t roundeven_mask; 564 bool arm_althp; 565 } FloatFmt; 566 567 /* Expand fields based on the size of exponent and fraction */ 568 #define FLOAT_PARAMS(E, F) \ 569 .exp_size = E, \ 570 .exp_bias = ((1 << E) - 1) >> 1, \ 571 .exp_max = (1 << E) - 1, \ 572 .frac_size = F, \ 573 .frac_shift = DECOMPOSED_BINARY_POINT - F, \ 574 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \ 575 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \ 576 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \ 577 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1 578 579 static const FloatFmt 
float16_params = { 580 FLOAT_PARAMS(5, 10) 581 }; 582 583 static const FloatFmt float16_params_ahp = { 584 FLOAT_PARAMS(5, 10), 585 .arm_althp = true 586 }; 587 588 static const FloatFmt float32_params = { 589 FLOAT_PARAMS(8, 23) 590 }; 591 592 static const FloatFmt float64_params = { 593 FLOAT_PARAMS(11, 52) 594 }; 595 596 /* Unpack a float to parts, but do not canonicalize. */ 597 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) 598 { 599 const int sign_pos = fmt.frac_size + fmt.exp_size; 600 601 return (FloatParts) { 602 .cls = float_class_unclassified, 603 .sign = extract64(raw, sign_pos, 1), 604 .exp = extract64(raw, fmt.frac_size, fmt.exp_size), 605 .frac = extract64(raw, 0, fmt.frac_size), 606 }; 607 } 608 609 static inline FloatParts float16_unpack_raw(float16 f) 610 { 611 return unpack_raw(float16_params, f); 612 } 613 614 static inline FloatParts float32_unpack_raw(float32 f) 615 { 616 return unpack_raw(float32_params, f); 617 } 618 619 static inline FloatParts float64_unpack_raw(float64 f) 620 { 621 return unpack_raw(float64_params, f); 622 } 623 624 /* Pack a float from parts, but do not canonicalize. 
*/ 625 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p) 626 { 627 const int sign_pos = fmt.frac_size + fmt.exp_size; 628 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp); 629 return deposit64(ret, sign_pos, 1, p.sign); 630 } 631 632 static inline float16 float16_pack_raw(FloatParts p) 633 { 634 return make_float16(pack_raw(float16_params, p)); 635 } 636 637 static inline float32 float32_pack_raw(FloatParts p) 638 { 639 return make_float32(pack_raw(float32_params, p)); 640 } 641 642 static inline float64 float64_pack_raw(FloatParts p) 643 { 644 return make_float64(pack_raw(float64_params, p)); 645 } 646 647 /*---------------------------------------------------------------------------- 648 | Functions and definitions to determine: (1) whether tininess for underflow 649 | is detected before or after rounding by default, (2) what (if anything) 650 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 651 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 652 | are propagated from function inputs to output. These details are target- 653 | specific. 654 *----------------------------------------------------------------------------*/ 655 #include "softfloat-specialize.h" 656 657 /* Canonicalize EXP and FRAC, setting CLS. */ 658 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm, 659 float_status *status) 660 { 661 if (part.exp == parm->exp_max && !parm->arm_althp) { 662 if (part.frac == 0) { 663 part.cls = float_class_inf; 664 } else { 665 part.frac <<= parm->frac_shift; 666 part.cls = (parts_is_snan_frac(part.frac, status) 667 ? 
float_class_snan : float_class_qnan); 668 } 669 } else if (part.exp == 0) { 670 if (likely(part.frac == 0)) { 671 part.cls = float_class_zero; 672 } else if (status->flush_inputs_to_zero) { 673 float_raise(float_flag_input_denormal, status); 674 part.cls = float_class_zero; 675 part.frac = 0; 676 } else { 677 int shift = clz64(part.frac) - 1; 678 part.cls = float_class_normal; 679 part.exp = parm->frac_shift - parm->exp_bias - shift + 1; 680 part.frac <<= shift; 681 } 682 } else { 683 part.cls = float_class_normal; 684 part.exp -= parm->exp_bias; 685 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift); 686 } 687 return part; 688 } 689 690 /* Round and uncanonicalize a floating-point number by parts. There 691 * are FRAC_SHIFT bits that may require rounding at the bottom of the 692 * fraction; these bits will be removed. The exponent will be biased 693 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0]. 694 */ 695 696 static FloatParts round_canonical(FloatParts p, float_status *s, 697 const FloatFmt *parm) 698 { 699 const uint64_t frac_lsbm1 = parm->frac_lsbm1; 700 const uint64_t round_mask = parm->round_mask; 701 const uint64_t roundeven_mask = parm->roundeven_mask; 702 const int exp_max = parm->exp_max; 703 const int frac_shift = parm->frac_shift; 704 uint64_t frac, inc; 705 int exp, flags = 0; 706 bool overflow_norm; 707 708 frac = p.frac; 709 exp = p.exp; 710 711 switch (p.cls) { 712 case float_class_normal: 713 switch (s->float_rounding_mode) { 714 case float_round_nearest_even: 715 overflow_norm = false; 716 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 717 break; 718 case float_round_ties_away: 719 overflow_norm = false; 720 inc = frac_lsbm1; 721 break; 722 case float_round_to_zero: 723 overflow_norm = true; 724 inc = 0; 725 break; 726 case float_round_up: 727 inc = p.sign ? 0 : round_mask; 728 overflow_norm = p.sign; 729 break; 730 case float_round_down: 731 inc = p.sign ? 
round_mask : 0; 732 overflow_norm = !p.sign; 733 break; 734 default: 735 g_assert_not_reached(); 736 } 737 738 exp += parm->exp_bias; 739 if (likely(exp > 0)) { 740 if (frac & round_mask) { 741 flags |= float_flag_inexact; 742 frac += inc; 743 if (frac & DECOMPOSED_OVERFLOW_BIT) { 744 frac >>= 1; 745 exp++; 746 } 747 } 748 frac >>= frac_shift; 749 750 if (parm->arm_althp) { 751 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */ 752 if (unlikely(exp > exp_max)) { 753 /* Overflow. Return the maximum normal. */ 754 flags = float_flag_invalid; 755 exp = exp_max; 756 frac = -1; 757 } 758 } else if (unlikely(exp >= exp_max)) { 759 flags |= float_flag_overflow | float_flag_inexact; 760 if (overflow_norm) { 761 exp = exp_max - 1; 762 frac = -1; 763 } else { 764 p.cls = float_class_inf; 765 goto do_inf; 766 } 767 } 768 } else if (s->flush_to_zero) { 769 flags |= float_flag_output_denormal; 770 p.cls = float_class_zero; 771 goto do_zero; 772 } else { 773 bool is_tiny = (s->float_detect_tininess 774 == float_tininess_before_rounding) 775 || (exp < 0) 776 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT); 777 778 shift64RightJamming(frac, 1 - exp, &frac); 779 if (frac & round_mask) { 780 /* Need to recompute round-to-even. */ 781 if (s->float_rounding_mode == float_round_nearest_even) { 782 inc = ((frac & roundeven_mask) != frac_lsbm1 783 ? frac_lsbm1 : 0); 784 } 785 flags |= float_flag_inexact; 786 frac += inc; 787 } 788 789 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 
1 : 0); 790 frac >>= frac_shift; 791 792 if (is_tiny && (flags & float_flag_inexact)) { 793 flags |= float_flag_underflow; 794 } 795 if (exp == 0 && frac == 0) { 796 p.cls = float_class_zero; 797 } 798 } 799 break; 800 801 case float_class_zero: 802 do_zero: 803 exp = 0; 804 frac = 0; 805 break; 806 807 case float_class_inf: 808 do_inf: 809 assert(!parm->arm_althp); 810 exp = exp_max; 811 frac = 0; 812 break; 813 814 case float_class_qnan: 815 case float_class_snan: 816 assert(!parm->arm_althp); 817 exp = exp_max; 818 frac >>= parm->frac_shift; 819 break; 820 821 default: 822 g_assert_not_reached(); 823 } 824 825 float_raise(flags, s); 826 p.exp = exp; 827 p.frac = frac; 828 return p; 829 } 830 831 /* Explicit FloatFmt version */ 832 static FloatParts float16a_unpack_canonical(float16 f, float_status *s, 833 const FloatFmt *params) 834 { 835 return sf_canonicalize(float16_unpack_raw(f), params, s); 836 } 837 838 static FloatParts float16_unpack_canonical(float16 f, float_status *s) 839 { 840 return float16a_unpack_canonical(f, s, &float16_params); 841 } 842 843 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s, 844 const FloatFmt *params) 845 { 846 return float16_pack_raw(round_canonical(p, s, params)); 847 } 848 849 static float16 float16_round_pack_canonical(FloatParts p, float_status *s) 850 { 851 return float16a_round_pack_canonical(p, s, &float16_params); 852 } 853 854 static FloatParts float32_unpack_canonical(float32 f, float_status *s) 855 { 856 return sf_canonicalize(float32_unpack_raw(f), &float32_params, s); 857 } 858 859 static float32 float32_round_pack_canonical(FloatParts p, float_status *s) 860 { 861 return float32_pack_raw(round_canonical(p, s, &float32_params)); 862 } 863 864 static FloatParts float64_unpack_canonical(float64 f, float_status *s) 865 { 866 return sf_canonicalize(float64_unpack_raw(f), &float64_params, s); 867 } 868 869 static float64 float64_round_pack_canonical(FloatParts p, float_status *s) 870 { 871 
return float64_pack_raw(round_canonical(p, s, &float64_params)); 872 } 873 874 static FloatParts return_nan(FloatParts a, float_status *s) 875 { 876 switch (a.cls) { 877 case float_class_snan: 878 s->float_exception_flags |= float_flag_invalid; 879 a = parts_silence_nan(a, s); 880 /* fall through */ 881 case float_class_qnan: 882 if (s->default_nan_mode) { 883 return parts_default_nan(s); 884 } 885 break; 886 887 default: 888 g_assert_not_reached(); 889 } 890 return a; 891 } 892 893 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s) 894 { 895 if (is_snan(a.cls) || is_snan(b.cls)) { 896 s->float_exception_flags |= float_flag_invalid; 897 } 898 899 if (s->default_nan_mode) { 900 return parts_default_nan(s); 901 } else { 902 if (pickNaN(a.cls, b.cls, 903 a.frac > b.frac || 904 (a.frac == b.frac && a.sign < b.sign))) { 905 a = b; 906 } 907 if (is_snan(a.cls)) { 908 return parts_silence_nan(a, s); 909 } 910 } 911 return a; 912 } 913 914 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c, 915 bool inf_zero, float_status *s) 916 { 917 int which; 918 919 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) { 920 s->float_exception_flags |= float_flag_invalid; 921 } 922 923 which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s); 924 925 if (s->default_nan_mode) { 926 /* Note that this check is after pickNaNMulAdd so that function 927 * has an opportunity to set the Invalid flag. 928 */ 929 which = 3; 930 } 931 932 switch (which) { 933 case 0: 934 break; 935 case 1: 936 a = b; 937 break; 938 case 2: 939 a = c; 940 break; 941 case 3: 942 return parts_default_nan(s); 943 default: 944 g_assert_not_reached(); 945 } 946 947 if (is_snan(a.cls)) { 948 return parts_silence_nan(a, s); 949 } 950 return a; 951 } 952 953 /* 954 * Returns the result of adding or subtracting the values of the 955 * floating-point values `a' and `b'. 
The operation is performed 956 * according to the IEC/IEEE Standard for Binary Floating-Point 957 * Arithmetic. 958 */ 959 960 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract, 961 float_status *s) 962 { 963 bool a_sign = a.sign; 964 bool b_sign = b.sign ^ subtract; 965 966 if (a_sign != b_sign) { 967 /* Subtraction */ 968 969 if (a.cls == float_class_normal && b.cls == float_class_normal) { 970 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) { 971 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 972 a.frac = a.frac - b.frac; 973 } else { 974 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 975 a.frac = b.frac - a.frac; 976 a.exp = b.exp; 977 a_sign ^= 1; 978 } 979 980 if (a.frac == 0) { 981 a.cls = float_class_zero; 982 a.sign = s->float_rounding_mode == float_round_down; 983 } else { 984 int shift = clz64(a.frac) - 1; 985 a.frac = a.frac << shift; 986 a.exp = a.exp - shift; 987 a.sign = a_sign; 988 } 989 return a; 990 } 991 if (is_nan(a.cls) || is_nan(b.cls)) { 992 return pick_nan(a, b, s); 993 } 994 if (a.cls == float_class_inf) { 995 if (b.cls == float_class_inf) { 996 float_raise(float_flag_invalid, s); 997 return parts_default_nan(s); 998 } 999 return a; 1000 } 1001 if (a.cls == float_class_zero && b.cls == float_class_zero) { 1002 a.sign = s->float_rounding_mode == float_round_down; 1003 return a; 1004 } 1005 if (a.cls == float_class_zero || b.cls == float_class_inf) { 1006 b.sign = a_sign ^ 1; 1007 return b; 1008 } 1009 if (b.cls == float_class_zero) { 1010 return a; 1011 } 1012 } else { 1013 /* Addition */ 1014 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1015 if (a.exp > b.exp) { 1016 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 1017 } else if (a.exp < b.exp) { 1018 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 1019 a.exp = b.exp; 1020 } 1021 a.frac += b.frac; 1022 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 1023 shift64RightJamming(a.frac, 1, &a.frac); 1024 a.exp += 1; 1025 } 
1026 return a; 1027 } 1028 if (is_nan(a.cls) || is_nan(b.cls)) { 1029 return pick_nan(a, b, s); 1030 } 1031 if (a.cls == float_class_inf || b.cls == float_class_zero) { 1032 return a; 1033 } 1034 if (b.cls == float_class_inf || a.cls == float_class_zero) { 1035 b.sign = b_sign; 1036 return b; 1037 } 1038 } 1039 g_assert_not_reached(); 1040 } 1041 1042 /* 1043 * Returns the result of adding or subtracting the floating-point 1044 * values `a' and `b'. The operation is performed according to the 1045 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1046 */ 1047 1048 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status) 1049 { 1050 FloatParts pa = float16_unpack_canonical(a, status); 1051 FloatParts pb = float16_unpack_canonical(b, status); 1052 FloatParts pr = addsub_floats(pa, pb, false, status); 1053 1054 return float16_round_pack_canonical(pr, status); 1055 } 1056 1057 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status) 1058 { 1059 FloatParts pa = float16_unpack_canonical(a, status); 1060 FloatParts pb = float16_unpack_canonical(b, status); 1061 FloatParts pr = addsub_floats(pa, pb, true, status); 1062 1063 return float16_round_pack_canonical(pr, status); 1064 } 1065 1066 static float32 QEMU_SOFTFLOAT_ATTR 1067 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status) 1068 { 1069 FloatParts pa = float32_unpack_canonical(a, status); 1070 FloatParts pb = float32_unpack_canonical(b, status); 1071 FloatParts pr = addsub_floats(pa, pb, subtract, status); 1072 1073 return float32_round_pack_canonical(pr, status); 1074 } 1075 1076 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status) 1077 { 1078 return soft_f32_addsub(a, b, false, status); 1079 } 1080 1081 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status) 1082 { 1083 return soft_f32_addsub(a, b, true, status); 1084 } 1085 1086 static float64 QEMU_SOFTFLOAT_ATTR 1087 soft_f64_addsub(float64 a, 
float64 b, bool subtract, float_status *status) 1088 { 1089 FloatParts pa = float64_unpack_canonical(a, status); 1090 FloatParts pb = float64_unpack_canonical(b, status); 1091 FloatParts pr = addsub_floats(pa, pb, subtract, status); 1092 1093 return float64_round_pack_canonical(pr, status); 1094 } 1095 1096 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status) 1097 { 1098 return soft_f64_addsub(a, b, false, status); 1099 } 1100 1101 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status) 1102 { 1103 return soft_f64_addsub(a, b, true, status); 1104 } 1105 1106 static float hard_f32_add(float a, float b) 1107 { 1108 return a + b; 1109 } 1110 1111 static float hard_f32_sub(float a, float b) 1112 { 1113 return a - b; 1114 } 1115 1116 static double hard_f64_add(double a, double b) 1117 { 1118 return a + b; 1119 } 1120 1121 static double hard_f64_sub(double a, double b) 1122 { 1123 return a - b; 1124 } 1125 1126 static bool f32_addsub_post(union_float32 a, union_float32 b) 1127 { 1128 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1129 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1130 } 1131 return !(float32_is_zero(a.s) && float32_is_zero(b.s)); 1132 } 1133 1134 static bool f64_addsub_post(union_float64 a, union_float64 b) 1135 { 1136 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1137 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1138 } else { 1139 return !(float64_is_zero(a.s) && float64_is_zero(b.s)); 1140 } 1141 } 1142 1143 static float32 float32_addsub(float32 a, float32 b, float_status *s, 1144 hard_f32_op2_fn hard, soft_f32_op2_fn soft) 1145 { 1146 return float32_gen2(a, b, s, hard, soft, 1147 f32_is_zon2, f32_addsub_post, NULL, NULL); 1148 } 1149 1150 static float64 float64_addsub(float64 a, float64 b, float_status *s, 1151 hard_f64_op2_fn hard, soft_f64_op2_fn soft) 1152 { 1153 return float64_gen2(a, b, s, hard, soft, 1154 f64_is_zon2, f64_addsub_post, NULL, NULL); 1155 } 1156 1157 float32 
QEMU_FLATTEN 1158 float32_add(float32 a, float32 b, float_status *s) 1159 { 1160 return float32_addsub(a, b, s, hard_f32_add, soft_f32_add); 1161 } 1162 1163 float32 QEMU_FLATTEN 1164 float32_sub(float32 a, float32 b, float_status *s) 1165 { 1166 return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub); 1167 } 1168 1169 float64 QEMU_FLATTEN 1170 float64_add(float64 a, float64 b, float_status *s) 1171 { 1172 return float64_addsub(a, b, s, hard_f64_add, soft_f64_add); 1173 } 1174 1175 float64 QEMU_FLATTEN 1176 float64_sub(float64 a, float64 b, float_status *s) 1177 { 1178 return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub); 1179 } 1180 1181 /* 1182 * Returns the result of multiplying the floating-point values `a' and 1183 * `b'. The operation is performed according to the IEC/IEEE Standard 1184 * for Binary Floating-Point Arithmetic. 1185 */ 1186 1187 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s) 1188 { 1189 bool sign = a.sign ^ b.sign; 1190 1191 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1192 uint64_t hi, lo; 1193 int exp = a.exp + b.exp; 1194 1195 mul64To128(a.frac, b.frac, &hi, &lo); 1196 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1197 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1198 shift64RightJamming(lo, 1, &lo); 1199 exp += 1; 1200 } 1201 1202 /* Re-use a */ 1203 a.exp = exp; 1204 a.sign = sign; 1205 a.frac = lo; 1206 return a; 1207 } 1208 /* handle all the NaN cases */ 1209 if (is_nan(a.cls) || is_nan(b.cls)) { 1210 return pick_nan(a, b, s); 1211 } 1212 /* Inf * Zero == NaN */ 1213 if ((a.cls == float_class_inf && b.cls == float_class_zero) || 1214 (a.cls == float_class_zero && b.cls == float_class_inf)) { 1215 s->float_exception_flags |= float_flag_invalid; 1216 return parts_default_nan(s); 1217 } 1218 /* Multiply by 0 or Inf */ 1219 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1220 a.sign = sign; 1221 return a; 1222 } 1223 if (b.cls == float_class_inf || b.cls == 
float_class_zero) { 1224 b.sign = sign; 1225 return b; 1226 } 1227 g_assert_not_reached(); 1228 } 1229 1230 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status) 1231 { 1232 FloatParts pa = float16_unpack_canonical(a, status); 1233 FloatParts pb = float16_unpack_canonical(b, status); 1234 FloatParts pr = mul_floats(pa, pb, status); 1235 1236 return float16_round_pack_canonical(pr, status); 1237 } 1238 1239 static float32 QEMU_SOFTFLOAT_ATTR 1240 soft_f32_mul(float32 a, float32 b, float_status *status) 1241 { 1242 FloatParts pa = float32_unpack_canonical(a, status); 1243 FloatParts pb = float32_unpack_canonical(b, status); 1244 FloatParts pr = mul_floats(pa, pb, status); 1245 1246 return float32_round_pack_canonical(pr, status); 1247 } 1248 1249 static float64 QEMU_SOFTFLOAT_ATTR 1250 soft_f64_mul(float64 a, float64 b, float_status *status) 1251 { 1252 FloatParts pa = float64_unpack_canonical(a, status); 1253 FloatParts pb = float64_unpack_canonical(b, status); 1254 FloatParts pr = mul_floats(pa, pb, status); 1255 1256 return float64_round_pack_canonical(pr, status); 1257 } 1258 1259 static float hard_f32_mul(float a, float b) 1260 { 1261 return a * b; 1262 } 1263 1264 static double hard_f64_mul(double a, double b) 1265 { 1266 return a * b; 1267 } 1268 1269 static bool f32_mul_fast_test(union_float32 a, union_float32 b) 1270 { 1271 return float32_is_zero(a.s) || float32_is_zero(b.s); 1272 } 1273 1274 static bool f64_mul_fast_test(union_float64 a, union_float64 b) 1275 { 1276 return float64_is_zero(a.s) || float64_is_zero(b.s); 1277 } 1278 1279 static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s) 1280 { 1281 bool signbit = float32_is_neg(a) ^ float32_is_neg(b); 1282 1283 return float32_set_sign(float32_zero, signbit); 1284 } 1285 1286 static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s) 1287 { 1288 bool signbit = float64_is_neg(a) ^ float64_is_neg(b); 1289 1290 return float64_set_sign(float64_zero, signbit); 
1291 } 1292 1293 float32 QEMU_FLATTEN 1294 float32_mul(float32 a, float32 b, float_status *s) 1295 { 1296 return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul, 1297 f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op); 1298 } 1299 1300 float64 QEMU_FLATTEN 1301 float64_mul(float64 a, float64 b, float_status *s) 1302 { 1303 return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul, 1304 f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op); 1305 } 1306 1307 /* 1308 * Returns the result of multiplying the floating-point values `a' and 1309 * `b' then adding 'c', with no intermediate rounding step after the 1310 * multiplication. The operation is performed according to the 1311 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008. 1312 * The flags argument allows the caller to select negation of the 1313 * addend, the intermediate product, or the final result. (The 1314 * difference between this and having the caller do a separate 1315 * negation is that negating externally will flip the sign bit on 1316 * NaNs.) 1317 */ 1318 1319 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c, 1320 int flags, float_status *s) 1321 { 1322 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) == 1323 ((1 << float_class_inf) | (1 << float_class_zero)); 1324 bool p_sign; 1325 bool sign_flip = flags & float_muladd_negate_result; 1326 FloatClass p_class; 1327 uint64_t hi, lo; 1328 int p_exp; 1329 1330 /* It is implementation-defined whether the cases of (0,inf,qnan) 1331 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 1332 * they return if they do), so we have to hand this information 1333 * off to the target-specific pick-a-NaN routine. 
1334 */ 1335 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) { 1336 return pick_nan_muladd(a, b, c, inf_zero, s); 1337 } 1338 1339 if (inf_zero) { 1340 s->float_exception_flags |= float_flag_invalid; 1341 return parts_default_nan(s); 1342 } 1343 1344 if (flags & float_muladd_negate_c) { 1345 c.sign ^= 1; 1346 } 1347 1348 p_sign = a.sign ^ b.sign; 1349 1350 if (flags & float_muladd_negate_product) { 1351 p_sign ^= 1; 1352 } 1353 1354 if (a.cls == float_class_inf || b.cls == float_class_inf) { 1355 p_class = float_class_inf; 1356 } else if (a.cls == float_class_zero || b.cls == float_class_zero) { 1357 p_class = float_class_zero; 1358 } else { 1359 p_class = float_class_normal; 1360 } 1361 1362 if (c.cls == float_class_inf) { 1363 if (p_class == float_class_inf && p_sign != c.sign) { 1364 s->float_exception_flags |= float_flag_invalid; 1365 return parts_default_nan(s); 1366 } else { 1367 a.cls = float_class_inf; 1368 a.sign = c.sign ^ sign_flip; 1369 return a; 1370 } 1371 } 1372 1373 if (p_class == float_class_inf) { 1374 a.cls = float_class_inf; 1375 a.sign = p_sign ^ sign_flip; 1376 return a; 1377 } 1378 1379 if (p_class == float_class_zero) { 1380 if (c.cls == float_class_zero) { 1381 if (p_sign != c.sign) { 1382 p_sign = s->float_rounding_mode == float_round_down; 1383 } 1384 c.sign = p_sign; 1385 } else if (flags & float_muladd_halve_result) { 1386 c.exp -= 1; 1387 } 1388 c.sign ^= sign_flip; 1389 return c; 1390 } 1391 1392 /* a & b should be normals now... */ 1393 assert(a.cls == float_class_normal && 1394 b.cls == float_class_normal); 1395 1396 p_exp = a.exp + b.exp; 1397 1398 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit 1399 * result. 
1400 */ 1401 mul64To128(a.frac, b.frac, &hi, &lo); 1402 /* binary point now at bit 124 */ 1403 1404 /* check for overflow */ 1405 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) { 1406 shift128RightJamming(hi, lo, 1, &hi, &lo); 1407 p_exp += 1; 1408 } 1409 1410 /* + add/sub */ 1411 if (c.cls == float_class_zero) { 1412 /* move binary point back to 62 */ 1413 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1414 } else { 1415 int exp_diff = p_exp - c.exp; 1416 if (p_sign == c.sign) { 1417 /* Addition */ 1418 if (exp_diff <= 0) { 1419 shift128RightJamming(hi, lo, 1420 DECOMPOSED_BINARY_POINT - exp_diff, 1421 &hi, &lo); 1422 lo += c.frac; 1423 p_exp = c.exp; 1424 } else { 1425 uint64_t c_hi, c_lo; 1426 /* shift c to the same binary point as the product (124) */ 1427 c_hi = c.frac >> 2; 1428 c_lo = 0; 1429 shift128RightJamming(c_hi, c_lo, 1430 exp_diff, 1431 &c_hi, &c_lo); 1432 add128(hi, lo, c_hi, c_lo, &hi, &lo); 1433 /* move binary point back to 62 */ 1434 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1435 } 1436 1437 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1438 shift64RightJamming(lo, 1, &lo); 1439 p_exp += 1; 1440 } 1441 1442 } else { 1443 /* Subtraction */ 1444 uint64_t c_hi, c_lo; 1445 /* make C binary point match product at bit 124 */ 1446 c_hi = c.frac >> 2; 1447 c_lo = 0; 1448 1449 if (exp_diff <= 0) { 1450 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo); 1451 if (exp_diff == 0 1452 && 1453 (hi > c_hi || (hi == c_hi && lo >= c_lo))) { 1454 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1455 } else { 1456 sub128(c_hi, c_lo, hi, lo, &hi, &lo); 1457 p_sign ^= 1; 1458 p_exp = c.exp; 1459 } 1460 } else { 1461 shift128RightJamming(c_hi, c_lo, 1462 exp_diff, 1463 &c_hi, &c_lo); 1464 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1465 } 1466 1467 if (hi == 0 && lo == 0) { 1468 a.cls = float_class_zero; 1469 a.sign = s->float_rounding_mode == float_round_down; 1470 a.sign ^= sign_flip; 1471 return a; 1472 } else { 1473 int shift; 1474 if (hi 
!= 0) { 1475 shift = clz64(hi); 1476 } else { 1477 shift = clz64(lo) + 64; 1478 } 1479 /* Normalizing to a binary point of 124 is the 1480 correct adjust for the exponent. However since we're 1481 shifting, we might as well put the binary point back 1482 at 62 where we really want it. Therefore shift as 1483 if we're leaving 1 bit at the top of the word, but 1484 adjust the exponent as if we're leaving 3 bits. */ 1485 shift -= 1; 1486 if (shift >= 64) { 1487 lo = lo << (shift - 64); 1488 } else { 1489 hi = (hi << shift) | (lo >> (64 - shift)); 1490 lo = hi | ((lo << shift) != 0); 1491 } 1492 p_exp -= shift - 2; 1493 } 1494 } 1495 } 1496 1497 if (flags & float_muladd_halve_result) { 1498 p_exp -= 1; 1499 } 1500 1501 /* finally prepare our result */ 1502 a.cls = float_class_normal; 1503 a.sign = p_sign ^ sign_flip; 1504 a.exp = p_exp; 1505 a.frac = lo; 1506 1507 return a; 1508 } 1509 1510 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c, 1511 int flags, float_status *status) 1512 { 1513 FloatParts pa = float16_unpack_canonical(a, status); 1514 FloatParts pb = float16_unpack_canonical(b, status); 1515 FloatParts pc = float16_unpack_canonical(c, status); 1516 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1517 1518 return float16_round_pack_canonical(pr, status); 1519 } 1520 1521 static float32 QEMU_SOFTFLOAT_ATTR 1522 soft_f32_muladd(float32 a, float32 b, float32 c, int flags, 1523 float_status *status) 1524 { 1525 FloatParts pa = float32_unpack_canonical(a, status); 1526 FloatParts pb = float32_unpack_canonical(b, status); 1527 FloatParts pc = float32_unpack_canonical(c, status); 1528 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1529 1530 return float32_round_pack_canonical(pr, status); 1531 } 1532 1533 static float64 QEMU_SOFTFLOAT_ATTR 1534 soft_f64_muladd(float64 a, float64 b, float64 c, int flags, 1535 float_status *status) 1536 { 1537 FloatParts pa = float64_unpack_canonical(a, status); 1538 FloatParts pb = 
float64_unpack_canonical(b, status); 1539 FloatParts pc = float64_unpack_canonical(c, status); 1540 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1541 1542 return float64_round_pack_canonical(pr, status); 1543 } 1544 1545 float32 QEMU_FLATTEN 1546 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s) 1547 { 1548 union_float32 ua, ub, uc, ur; 1549 1550 ua.s = xa; 1551 ub.s = xb; 1552 uc.s = xc; 1553 1554 if (unlikely(!can_use_fpu(s))) { 1555 goto soft; 1556 } 1557 if (unlikely(flags & float_muladd_halve_result)) { 1558 goto soft; 1559 } 1560 1561 float32_input_flush3(&ua.s, &ub.s, &uc.s, s); 1562 if (unlikely(!f32_is_zon3(ua, ub, uc))) { 1563 goto soft; 1564 } 1565 /* 1566 * When (a || b) == 0, there's no need to check for under/over flow, 1567 * since we know the addend is (normal || 0) and the product is 0. 1568 */ 1569 if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) { 1570 union_float32 up; 1571 bool prod_sign; 1572 1573 prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s); 1574 prod_sign ^= !!(flags & float_muladd_negate_product); 1575 up.s = float32_set_sign(float32_zero, prod_sign); 1576 1577 if (flags & float_muladd_negate_c) { 1578 uc.h = -uc.h; 1579 } 1580 ur.h = up.h + uc.h; 1581 } else { 1582 if (flags & float_muladd_negate_product) { 1583 ua.h = -ua.h; 1584 } 1585 if (flags & float_muladd_negate_c) { 1586 uc.h = -uc.h; 1587 } 1588 1589 ur.h = fmaf(ua.h, ub.h, uc.h); 1590 1591 if (unlikely(f32_is_inf(ur))) { 1592 s->float_exception_flags |= float_flag_overflow; 1593 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) { 1594 goto soft; 1595 } 1596 } 1597 if (flags & float_muladd_negate_result) { 1598 return float32_chs(ur.s); 1599 } 1600 return ur.s; 1601 1602 soft: 1603 return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s); 1604 } 1605 1606 float64 QEMU_FLATTEN 1607 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s) 1608 { 1609 union_float64 ua, ub, uc, ur; 1610 1611 ua.s = xa; 1612 ub.s = xb; 
1613 uc.s = xc; 1614 1615 if (unlikely(!can_use_fpu(s))) { 1616 goto soft; 1617 } 1618 if (unlikely(flags & float_muladd_halve_result)) { 1619 goto soft; 1620 } 1621 1622 float64_input_flush3(&ua.s, &ub.s, &uc.s, s); 1623 if (unlikely(!f64_is_zon3(ua, ub, uc))) { 1624 goto soft; 1625 } 1626 /* 1627 * When (a || b) == 0, there's no need to check for under/over flow, 1628 * since we know the addend is (normal || 0) and the product is 0. 1629 */ 1630 if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) { 1631 union_float64 up; 1632 bool prod_sign; 1633 1634 prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s); 1635 prod_sign ^= !!(flags & float_muladd_negate_product); 1636 up.s = float64_set_sign(float64_zero, prod_sign); 1637 1638 if (flags & float_muladd_negate_c) { 1639 uc.h = -uc.h; 1640 } 1641 ur.h = up.h + uc.h; 1642 } else { 1643 if (flags & float_muladd_negate_product) { 1644 ua.h = -ua.h; 1645 } 1646 if (flags & float_muladd_negate_c) { 1647 uc.h = -uc.h; 1648 } 1649 1650 ur.h = fma(ua.h, ub.h, uc.h); 1651 1652 if (unlikely(f64_is_inf(ur))) { 1653 s->float_exception_flags |= float_flag_overflow; 1654 } else if (unlikely(fabs(ur.h) <= FLT_MIN)) { 1655 goto soft; 1656 } 1657 } 1658 if (flags & float_muladd_negate_result) { 1659 return float64_chs(ur.s); 1660 } 1661 return ur.s; 1662 1663 soft: 1664 return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s); 1665 } 1666 1667 /* 1668 * Returns the result of dividing the floating-point value `a' by the 1669 * corresponding value `b'. The operation is performed according to 1670 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*/

/*
 * Decomposed-form division helper.  Normal / normal is computed with a
 * 128-by-64-bit divide; every other combination of classes is resolved
 * by the special-case chain below.  The result has not been rounded.
 */
static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t n0, n1, q, r;
        int exp = a.exp - b.exp;

        /*
         * We want a 2*N / N-bit division to produce exactly an N-bit
         * result, so that we do not lose any precision and so that we
         * do not have to renormalize afterward.  If A.frac < B.frac,
         * then division would produce an (N-1)-bit result; shift A left
         * by one to produce an N-bit result, and decrement the
         * exponent to match.
         *
         * The udiv_qrnnd algorithm that we're using requires normalization,
         * i.e. the msb of the denominator must be set.  Since we know that
         * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
         * by one (more), and the remainder must be shifted right by one.
         */
        if (a.frac < b.frac) {
            exp -= 1;
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
        } else {
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
        }
        q = udiv_qrnnd(&r, n1, n0, b.frac << 1);

        /*
         * Set lsb if there is a remainder, to set inexact.
         * As mentioned above, to find the actual value of the remainder we
         * would need to shift right, but (1) we are only concerned about
         * non-zero-ness, and (2) the remainder will always be even because
         * both inputs to the division primitive are even.
         */
        a.frac = q | (r != 0);
        a.sign = sign;
        a.exp = exp;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return pick_nan(a, b, s);
    }
    /* 0/0 or Inf/Inf */
    if (a.cls == b.cls
        &&
        (a.cls == float_class_inf || a.cls == float_class_zero)) {
        s->float_exception_flags |= float_flag_invalid;
        return parts_default_nan(s);
    }
    /* Inf / x or 0 / x */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    /* Div 0 => Inf */
    if (b.cls == float_class_zero) {
        s->float_exception_flags |= float_flag_divbyzero;
        a.cls = float_class_inf;
        a.sign = sign;
        return a;
    }
    /* Div by Inf */
    if (b.cls == float_class_inf) {
        a.cls = float_class_zero;
        a.sign = sign;
        return a;
    }
    g_assert_not_reached();
}

/* float16 division: unpack, divide in decomposed form, round and repack. */
float16 float16_div(float16 a, float16 b, float_status *status)
{
    FloatParts pa = float16_unpack_canonical(a, status);
    FloatParts pb = float16_unpack_canonical(b, status);
    FloatParts pr = div_floats(pa, pb, status);

    return float16_round_pack_canonical(pr, status);
}

/* Software float32 division, passed to float32_gen2() by float32_div(). */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_div(float32 a, float32 b, float_status *status)
{
    FloatParts pa = float32_unpack_canonical(a, status);
    FloatParts pb = float32_unpack_canonical(b, status);
    FloatParts pr = div_floats(pa, pb, status);

    return float32_round_pack_canonical(pr, status);
}

/* Software float64 division, passed to float64_gen2() by float64_div(). */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_div(float64 a, float64 b, float_status *status)
{
    FloatParts pa = float64_unpack_canonical(a, status);
    FloatParts pb = float64_unpack_canonical(b, status);
    FloatParts pr = div_floats(pa, pb, status);

    return float64_round_pack_canonical(pr, status);
}

/* Host floating-point division, passed to the gen2 helpers. */
static float hard_f32_div(float a, float b)
{
    return a / b;
}

static double hard_f64_div(double a, double b)
{
    return a / b;
}

/*
 * Pre-check callbacks for division: true when the dividend is zero or
 * normal and the divisor is normal (a zero divisor is rejected).
 */
static bool f32_div_pre(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
}

static bool f64_div_pre(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
}

/*
 * Post-check callbacks for division: true when the dividend is
 * non-zero.  The divisor is not examined.
 * NOTE(review): how float32_gen2/float64_gen2 use this result is
 * defined outside this section -- confirm there.
 */
static bool f32_div_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float32_is_zero(a.s);
}

static bool f64_div_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float64_is_zero(a.s);
}

/* Public float32 division: host divide with soft_f32_div as fallback. */
float32 QEMU_FLATTEN
float32_div(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
                        f32_div_pre, f32_div_post, NULL, NULL);
}

/* Public float64 division: host divide with soft_f64_div as fallback. */
float64 QEMU_FLATTEN
float64_div(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
                        f64_div_pre, f64_div_post, NULL, NULL);
}

/*
 * Float to Float conversions
 *
 * Returns the result of converting one float format to another. The
 * conversion is performed according to the IEC/IEEE Standard for
 * Binary Floating-Point Arithmetic.
 *
 * The float_to_float helper only needs to take care of raising
 * invalid exceptions and handling the conversion on NaNs.
1842 */ 1843 1844 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf, 1845 float_status *s) 1846 { 1847 if (dstf->arm_althp) { 1848 switch (a.cls) { 1849 case float_class_qnan: 1850 case float_class_snan: 1851 /* There is no NaN in the destination format. Raise Invalid 1852 * and return a zero with the sign of the input NaN. 1853 */ 1854 s->float_exception_flags |= float_flag_invalid; 1855 a.cls = float_class_zero; 1856 a.frac = 0; 1857 a.exp = 0; 1858 break; 1859 1860 case float_class_inf: 1861 /* There is no Inf in the destination format. Raise Invalid 1862 * and return the maximum normal with the correct sign. 1863 */ 1864 s->float_exception_flags |= float_flag_invalid; 1865 a.cls = float_class_normal; 1866 a.exp = dstf->exp_max; 1867 a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift; 1868 break; 1869 1870 default: 1871 break; 1872 } 1873 } else if (is_nan(a.cls)) { 1874 if (is_snan(a.cls)) { 1875 s->float_exception_flags |= float_flag_invalid; 1876 a = parts_silence_nan(a, s); 1877 } 1878 if (s->default_nan_mode) { 1879 return parts_default_nan(s); 1880 } 1881 } 1882 return a; 1883 } 1884 1885 float32 float16_to_float32(float16 a, bool ieee, float_status *s) 1886 { 1887 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1888 FloatParts p = float16a_unpack_canonical(a, s, fmt16); 1889 FloatParts pr = float_to_float(p, &float32_params, s); 1890 return float32_round_pack_canonical(pr, s); 1891 } 1892 1893 float64 float16_to_float64(float16 a, bool ieee, float_status *s) 1894 { 1895 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1896 FloatParts p = float16a_unpack_canonical(a, s, fmt16); 1897 FloatParts pr = float_to_float(p, &float64_params, s); 1898 return float64_round_pack_canonical(pr, s); 1899 } 1900 1901 float16 float32_to_float16(float32 a, bool ieee, float_status *s) 1902 { 1903 const FloatFmt *fmt16 = ieee ? 
&float16_params : &float16_params_ahp; 1904 FloatParts p = float32_unpack_canonical(a, s); 1905 FloatParts pr = float_to_float(p, fmt16, s); 1906 return float16a_round_pack_canonical(pr, s, fmt16); 1907 } 1908 1909 float64 float32_to_float64(float32 a, float_status *s) 1910 { 1911 FloatParts p = float32_unpack_canonical(a, s); 1912 FloatParts pr = float_to_float(p, &float64_params, s); 1913 return float64_round_pack_canonical(pr, s); 1914 } 1915 1916 float16 float64_to_float16(float64 a, bool ieee, float_status *s) 1917 { 1918 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1919 FloatParts p = float64_unpack_canonical(a, s); 1920 FloatParts pr = float_to_float(p, fmt16, s); 1921 return float16a_round_pack_canonical(pr, s, fmt16); 1922 } 1923 1924 float32 float64_to_float32(float64 a, float_status *s) 1925 { 1926 FloatParts p = float64_unpack_canonical(a, s); 1927 FloatParts pr = float_to_float(p, &float32_params, s); 1928 return float32_round_pack_canonical(pr, s); 1929 } 1930 1931 /* 1932 * Rounds the floating-point value `a' to an integer, and returns the 1933 * result as a floating-point value. The operation is performed 1934 * according to the IEC/IEEE Standard for Binary Floating-Point 1935 * Arithmetic. 
*/

/*
 * Decomposed-form round-to-integral helper.
 *
 * `rmode' selects the rounding mode and `scale' is added to the
 * exponent of a normal input before rounding (clamped to
 * +/-0x10000).  Inexact is raised whenever fraction bits are
 * discarded.  NaNs are handed to return_nan(); zeros and infinities
 * are returned unchanged.
 */
static FloatParts round_to_int(FloatParts a, int rmode,
                               int scale, float_status *s)
{
    switch (a.cls) {
    case float_class_qnan:
    case float_class_snan:
        return return_nan(a, s);

    case float_class_zero:
    case float_class_inf:
        /* already "integral" */
        break;

    case float_class_normal:
        /* Clamp the scale before applying it to the exponent. */
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        a.exp += scale;

        if (a.exp >= DECOMPOSED_BINARY_POINT) {
            /* already integral */
            break;
        }
        if (a.exp < 0) {
            bool one;
            /* all fractional */
            s->float_exception_flags |= float_flag_inexact;
            /* Decide whether the magnitude rounds up to one or down to 0. */
            switch (rmode) {
            case float_round_nearest_even:
                one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_ties_away:
                one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_to_zero:
                one = false;
                break;
            case float_round_up:
                one = !a.sign;
                break;
            case float_round_down:
                one = a.sign;
                break;
            default:
                g_assert_not_reached();
            }

            if (one) {
                a.frac = DECOMPOSED_IMPLICIT_BIT;
                a.exp = 0;
            } else {
                a.cls = float_class_zero;
            }
        } else {
            /*
             * frac_lsb is the fraction bit with weight one, frac_lsbm1
             * the bit with weight one half; rnd_mask covers every bit
             * below frac_lsb and rnd_even_mask adds frac_lsb itself.
             */
            uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
            uint64_t frac_lsbm1 = frac_lsb >> 1;
            uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
            uint64_t rnd_mask = rnd_even_mask >> 1;
            uint64_t inc;

            switch (rmode) {
            case float_round_nearest_even:
                /* No increment for an exact tie with an even integer part. */
                inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
                break;
            case float_round_ties_away:
                inc = frac_lsbm1;
                break;
            case float_round_to_zero:
                inc = 0;
                break;
            case float_round_up:
                inc = a.sign ? 0 : rnd_mask;
                break;
            case float_round_down:
                inc = a.sign ? rnd_mask : 0;
                break;
            default:
                g_assert_not_reached();
            }

            if (a.frac & rnd_mask) {
                s->float_exception_flags |= float_flag_inexact;
                a.frac += inc;
                a.frac &= ~rnd_mask;
                /* The increment may carry out of the implicit bit. */
                if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
                    a.frac >>= 1;
                    a.exp++;
                }
            }
        }
        break;
    default:
        g_assert_not_reached();
    }
    return a;
}

/* Round a float16 to an integral value using the current rounding mode. */
float16 float16_round_to_int(float16 a, float_status *s)
{
    FloatParts pa = float16_unpack_canonical(a, s);
    FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float16_round_pack_canonical(pr, s);
}

/* Round a float32 to an integral value using the current rounding mode. */
float32 float32_round_to_int(float32 a, float_status *s)
{
    FloatParts pa = float32_unpack_canonical(a, s);
    FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float32_round_pack_canonical(pr, s);
}

/* Round a float64 to an integral value using the current rounding mode. */
float64 float64_round_to_int(float64 a, float_status *s)
{
    FloatParts pa = float64_unpack_canonical(a, s);
    FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float64_round_pack_canonical(pr, s);
}

/*
 * Returns the result of converting the floating-point value `a' to
 * the two's complement integer format. The conversion is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic---which means in particular that the conversion is
 * rounded according to the current rounding mode. If `a' is a NaN,
 * the largest positive integer is returned. Otherwise, if the
 * conversion overflows, the largest integer with the same sign as `a'
 * is returned.
2063 */ 2064 2065 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale, 2066 int64_t min, int64_t max, 2067 float_status *s) 2068 { 2069 uint64_t r; 2070 int orig_flags = get_float_exception_flags(s); 2071 FloatParts p = round_to_int(in, rmode, scale, s); 2072 2073 switch (p.cls) { 2074 case float_class_snan: 2075 case float_class_qnan: 2076 s->float_exception_flags = orig_flags | float_flag_invalid; 2077 return max; 2078 case float_class_inf: 2079 s->float_exception_flags = orig_flags | float_flag_invalid; 2080 return p.sign ? min : max; 2081 case float_class_zero: 2082 return 0; 2083 case float_class_normal: 2084 if (p.exp < DECOMPOSED_BINARY_POINT) { 2085 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2086 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 2087 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 2088 } else { 2089 r = UINT64_MAX; 2090 } 2091 if (p.sign) { 2092 if (r <= -(uint64_t) min) { 2093 return -r; 2094 } else { 2095 s->float_exception_flags = orig_flags | float_flag_invalid; 2096 return min; 2097 } 2098 } else { 2099 if (r <= max) { 2100 return r; 2101 } else { 2102 s->float_exception_flags = orig_flags | float_flag_invalid; 2103 return max; 2104 } 2105 } 2106 default: 2107 g_assert_not_reached(); 2108 } 2109 } 2110 2111 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale, 2112 float_status *s) 2113 { 2114 return round_to_int_and_pack(float16_unpack_canonical(a, s), 2115 rmode, scale, INT16_MIN, INT16_MAX, s); 2116 } 2117 2118 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale, 2119 float_status *s) 2120 { 2121 return round_to_int_and_pack(float16_unpack_canonical(a, s), 2122 rmode, scale, INT32_MIN, INT32_MAX, s); 2123 } 2124 2125 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale, 2126 float_status *s) 2127 { 2128 return round_to_int_and_pack(float16_unpack_canonical(a, s), 2129 rmode, scale, INT64_MIN, INT64_MAX, s); 2130 } 2131 2132 int16_t float32_to_int16_scalbn(float32 a, int 
rmode, int scale, 2133 float_status *s) 2134 { 2135 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2136 rmode, scale, INT16_MIN, INT16_MAX, s); 2137 } 2138 2139 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale, 2140 float_status *s) 2141 { 2142 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2143 rmode, scale, INT32_MIN, INT32_MAX, s); 2144 } 2145 2146 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale, 2147 float_status *s) 2148 { 2149 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2150 rmode, scale, INT64_MIN, INT64_MAX, s); 2151 } 2152 2153 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale, 2154 float_status *s) 2155 { 2156 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2157 rmode, scale, INT16_MIN, INT16_MAX, s); 2158 } 2159 2160 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale, 2161 float_status *s) 2162 { 2163 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2164 rmode, scale, INT32_MIN, INT32_MAX, s); 2165 } 2166 2167 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale, 2168 float_status *s) 2169 { 2170 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2171 rmode, scale, INT64_MIN, INT64_MAX, s); 2172 } 2173 2174 int16_t float16_to_int16(float16 a, float_status *s) 2175 { 2176 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2177 } 2178 2179 int32_t float16_to_int32(float16 a, float_status *s) 2180 { 2181 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2182 } 2183 2184 int64_t float16_to_int64(float16 a, float_status *s) 2185 { 2186 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2187 } 2188 2189 int16_t float32_to_int16(float32 a, float_status *s) 2190 { 2191 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2192 } 2193 2194 int32_t float32_to_int32(float32 a, float_status *s) 2195 { 2196 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, 
s); 2197 } 2198 2199 int64_t float32_to_int64(float32 a, float_status *s) 2200 { 2201 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2202 } 2203 2204 int16_t float64_to_int16(float64 a, float_status *s) 2205 { 2206 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2207 } 2208 2209 int32_t float64_to_int32(float64 a, float_status *s) 2210 { 2211 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2212 } 2213 2214 int64_t float64_to_int64(float64 a, float_status *s) 2215 { 2216 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2217 } 2218 2219 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s) 2220 { 2221 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s); 2222 } 2223 2224 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s) 2225 { 2226 return float16_to_int32_scalbn(a, float_round_to_zero, 0, s); 2227 } 2228 2229 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s) 2230 { 2231 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s); 2232 } 2233 2234 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s) 2235 { 2236 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s); 2237 } 2238 2239 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s) 2240 { 2241 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s); 2242 } 2243 2244 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s) 2245 { 2246 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s); 2247 } 2248 2249 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s) 2250 { 2251 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s); 2252 } 2253 2254 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s) 2255 { 2256 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s); 2257 } 2258 2259 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s) 2260 { 2261 return float64_to_int64_scalbn(a, 
float_round_to_zero, 0, s); 2262 } 2263 2264 /* 2265 * Returns the result of converting the floating-point value `a' to 2266 * the unsigned integer format. The conversion is performed according 2267 * to the IEC/IEEE Standard for Binary Floating-Point 2268 * Arithmetic---which means in particular that the conversion is 2269 * rounded according to the current rounding mode. If `a' is a NaN, 2270 * the largest unsigned integer is returned. Otherwise, if the 2271 * conversion overflows, the largest unsigned integer is returned. If 2272 * the 'a' is negative, the result is rounded and zero is returned; 2273 * values that do not round to zero will raise the inexact exception 2274 * flag. 2275 */ 2276 2277 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale, 2278 uint64_t max, float_status *s) 2279 { 2280 int orig_flags = get_float_exception_flags(s); 2281 FloatParts p = round_to_int(in, rmode, scale, s); 2282 uint64_t r; 2283 2284 switch (p.cls) { 2285 case float_class_snan: 2286 case float_class_qnan: 2287 s->float_exception_flags = orig_flags | float_flag_invalid; 2288 return max; 2289 case float_class_inf: 2290 s->float_exception_flags = orig_flags | float_flag_invalid; 2291 return p.sign ? 0 : max; 2292 case float_class_zero: 2293 return 0; 2294 case float_class_normal: 2295 if (p.sign) { 2296 s->float_exception_flags = orig_flags | float_flag_invalid; 2297 return 0; 2298 } 2299 2300 if (p.exp < DECOMPOSED_BINARY_POINT) { 2301 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2302 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 2303 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 2304 } else { 2305 s->float_exception_flags = orig_flags | float_flag_invalid; 2306 return max; 2307 } 2308 2309 /* For uint64 this will never trip, but if p.exp is too large 2310 * to shift a decomposed fraction we shall have exited via the 2311 * 3rd leg above. 
2312 */ 2313 if (r > max) { 2314 s->float_exception_flags = orig_flags | float_flag_invalid; 2315 return max; 2316 } 2317 return r; 2318 default: 2319 g_assert_not_reached(); 2320 } 2321 } 2322 2323 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale, 2324 float_status *s) 2325 { 2326 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2327 rmode, scale, UINT16_MAX, s); 2328 } 2329 2330 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale, 2331 float_status *s) 2332 { 2333 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2334 rmode, scale, UINT32_MAX, s); 2335 } 2336 2337 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale, 2338 float_status *s) 2339 { 2340 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2341 rmode, scale, UINT64_MAX, s); 2342 } 2343 2344 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale, 2345 float_status *s) 2346 { 2347 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2348 rmode, scale, UINT16_MAX, s); 2349 } 2350 2351 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale, 2352 float_status *s) 2353 { 2354 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2355 rmode, scale, UINT32_MAX, s); 2356 } 2357 2358 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale, 2359 float_status *s) 2360 { 2361 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2362 rmode, scale, UINT64_MAX, s); 2363 } 2364 2365 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale, 2366 float_status *s) 2367 { 2368 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2369 rmode, scale, UINT16_MAX, s); 2370 } 2371 2372 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale, 2373 float_status *s) 2374 { 2375 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2376 rmode, scale, UINT32_MAX, s); 2377 } 2378 2379 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale, 2380 
float_status *s) 2381 { 2382 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2383 rmode, scale, UINT64_MAX, s); 2384 } 2385 2386 uint16_t float16_to_uint16(float16 a, float_status *s) 2387 { 2388 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2389 } 2390 2391 uint32_t float16_to_uint32(float16 a, float_status *s) 2392 { 2393 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2394 } 2395 2396 uint64_t float16_to_uint64(float16 a, float_status *s) 2397 { 2398 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2399 } 2400 2401 uint16_t float32_to_uint16(float32 a, float_status *s) 2402 { 2403 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2404 } 2405 2406 uint32_t float32_to_uint32(float32 a, float_status *s) 2407 { 2408 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2409 } 2410 2411 uint64_t float32_to_uint64(float32 a, float_status *s) 2412 { 2413 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2414 } 2415 2416 uint16_t float64_to_uint16(float64 a, float_status *s) 2417 { 2418 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2419 } 2420 2421 uint32_t float64_to_uint32(float64 a, float_status *s) 2422 { 2423 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2424 } 2425 2426 uint64_t float64_to_uint64(float64 a, float_status *s) 2427 { 2428 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2429 } 2430 2431 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s) 2432 { 2433 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2434 } 2435 2436 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s) 2437 { 2438 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2439 } 2440 2441 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s) 2442 { 2443 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2444 } 2445 2446 uint16_t 
float32_to_uint16_round_to_zero(float32 a, float_status *s) 2447 { 2448 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2449 } 2450 2451 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s) 2452 { 2453 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2454 } 2455 2456 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s) 2457 { 2458 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2459 } 2460 2461 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s) 2462 { 2463 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2464 } 2465 2466 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s) 2467 { 2468 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2469 } 2470 2471 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s) 2472 { 2473 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2474 } 2475 2476 /* 2477 * Integer to float conversions 2478 * 2479 * Returns the result of converting the two's complement integer `a' 2480 * to the floating-point format. The conversion is performed according 2481 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2482 */ 2483 2484 static FloatParts int_to_float(int64_t a, int scale, float_status *status) 2485 { 2486 FloatParts r = { .sign = false }; 2487 2488 if (a == 0) { 2489 r.cls = float_class_zero; 2490 } else { 2491 uint64_t f = a; 2492 int shift; 2493 2494 r.cls = float_class_normal; 2495 if (a < 0) { 2496 f = -f; 2497 r.sign = true; 2498 } 2499 shift = clz64(f) - 1; 2500 scale = MIN(MAX(scale, -0x10000), 0x10000); 2501 2502 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2503 r.frac = (shift < 0 ? 
DECOMPOSED_IMPLICIT_BIT : f << shift); 2504 } 2505 2506 return r; 2507 } 2508 2509 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status) 2510 { 2511 FloatParts pa = int_to_float(a, scale, status); 2512 return float16_round_pack_canonical(pa, status); 2513 } 2514 2515 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status) 2516 { 2517 return int64_to_float16_scalbn(a, scale, status); 2518 } 2519 2520 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status) 2521 { 2522 return int64_to_float16_scalbn(a, scale, status); 2523 } 2524 2525 float16 int64_to_float16(int64_t a, float_status *status) 2526 { 2527 return int64_to_float16_scalbn(a, 0, status); 2528 } 2529 2530 float16 int32_to_float16(int32_t a, float_status *status) 2531 { 2532 return int64_to_float16_scalbn(a, 0, status); 2533 } 2534 2535 float16 int16_to_float16(int16_t a, float_status *status) 2536 { 2537 return int64_to_float16_scalbn(a, 0, status); 2538 } 2539 2540 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status) 2541 { 2542 FloatParts pa = int_to_float(a, scale, status); 2543 return float32_round_pack_canonical(pa, status); 2544 } 2545 2546 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status) 2547 { 2548 return int64_to_float32_scalbn(a, scale, status); 2549 } 2550 2551 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status) 2552 { 2553 return int64_to_float32_scalbn(a, scale, status); 2554 } 2555 2556 float32 int64_to_float32(int64_t a, float_status *status) 2557 { 2558 return int64_to_float32_scalbn(a, 0, status); 2559 } 2560 2561 float32 int32_to_float32(int32_t a, float_status *status) 2562 { 2563 return int64_to_float32_scalbn(a, 0, status); 2564 } 2565 2566 float32 int16_to_float32(int16_t a, float_status *status) 2567 { 2568 return int64_to_float32_scalbn(a, 0, status); 2569 } 2570 2571 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status) 2572 { 
2573 FloatParts pa = int_to_float(a, scale, status); 2574 return float64_round_pack_canonical(pa, status); 2575 } 2576 2577 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status) 2578 { 2579 return int64_to_float64_scalbn(a, scale, status); 2580 } 2581 2582 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status) 2583 { 2584 return int64_to_float64_scalbn(a, scale, status); 2585 } 2586 2587 float64 int64_to_float64(int64_t a, float_status *status) 2588 { 2589 return int64_to_float64_scalbn(a, 0, status); 2590 } 2591 2592 float64 int32_to_float64(int32_t a, float_status *status) 2593 { 2594 return int64_to_float64_scalbn(a, 0, status); 2595 } 2596 2597 float64 int16_to_float64(int16_t a, float_status *status) 2598 { 2599 return int64_to_float64_scalbn(a, 0, status); 2600 } 2601 2602 2603 /* 2604 * Unsigned Integer to float conversions 2605 * 2606 * Returns the result of converting the unsigned integer `a' to the 2607 * floating-point format. The conversion is performed according to the 2608 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2609 */ 2610 2611 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status) 2612 { 2613 FloatParts r = { .sign = false }; 2614 2615 if (a == 0) { 2616 r.cls = float_class_zero; 2617 } else { 2618 scale = MIN(MAX(scale, -0x10000), 0x10000); 2619 r.cls = float_class_normal; 2620 if ((int64_t)a < 0) { 2621 r.exp = DECOMPOSED_BINARY_POINT + 1 + scale; 2622 shift64RightJamming(a, 1, &a); 2623 r.frac = a; 2624 } else { 2625 int shift = clz64(a) - 1; 2626 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2627 r.frac = a << shift; 2628 } 2629 } 2630 2631 return r; 2632 } 2633 2634 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status) 2635 { 2636 FloatParts pa = uint_to_float(a, scale, status); 2637 return float16_round_pack_canonical(pa, status); 2638 } 2639 2640 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status) 2641 { 2642 return uint64_to_float16_scalbn(a, scale, status); 2643 } 2644 2645 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status) 2646 { 2647 return uint64_to_float16_scalbn(a, scale, status); 2648 } 2649 2650 float16 uint64_to_float16(uint64_t a, float_status *status) 2651 { 2652 return uint64_to_float16_scalbn(a, 0, status); 2653 } 2654 2655 float16 uint32_to_float16(uint32_t a, float_status *status) 2656 { 2657 return uint64_to_float16_scalbn(a, 0, status); 2658 } 2659 2660 float16 uint16_to_float16(uint16_t a, float_status *status) 2661 { 2662 return uint64_to_float16_scalbn(a, 0, status); 2663 } 2664 2665 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status) 2666 { 2667 FloatParts pa = uint_to_float(a, scale, status); 2668 return float32_round_pack_canonical(pa, status); 2669 } 2670 2671 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status) 2672 { 2673 return uint64_to_float32_scalbn(a, scale, status); 2674 } 2675 2676 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status) 2677 { 2678 return 
uint64_to_float32_scalbn(a, scale, status); 2679 } 2680 2681 float32 uint64_to_float32(uint64_t a, float_status *status) 2682 { 2683 return uint64_to_float32_scalbn(a, 0, status); 2684 } 2685 2686 float32 uint32_to_float32(uint32_t a, float_status *status) 2687 { 2688 return uint64_to_float32_scalbn(a, 0, status); 2689 } 2690 2691 float32 uint16_to_float32(uint16_t a, float_status *status) 2692 { 2693 return uint64_to_float32_scalbn(a, 0, status); 2694 } 2695 2696 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status) 2697 { 2698 FloatParts pa = uint_to_float(a, scale, status); 2699 return float64_round_pack_canonical(pa, status); 2700 } 2701 2702 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status) 2703 { 2704 return uint64_to_float64_scalbn(a, scale, status); 2705 } 2706 2707 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status) 2708 { 2709 return uint64_to_float64_scalbn(a, scale, status); 2710 } 2711 2712 float64 uint64_to_float64(uint64_t a, float_status *status) 2713 { 2714 return uint64_to_float64_scalbn(a, 0, status); 2715 } 2716 2717 float64 uint32_to_float64(uint32_t a, float_status *status) 2718 { 2719 return uint64_to_float64_scalbn(a, 0, status); 2720 } 2721 2722 float64 uint16_to_float64(uint16_t a, float_status *status) 2723 { 2724 return uint64_to_float64_scalbn(a, 0, status); 2725 } 2726 2727 /* Float Min/Max */ 2728 /* min() and max() functions. These can't be implemented as 2729 * 'compare and pick one input' because that would mishandle 2730 * NaNs and +0 vs -0. 2731 * 2732 * minnum() and maxnum() functions. These are similar to the min() 2733 * and max() functions but if one of the arguments is a QNaN and 2734 * the other is numerical then the numerical argument is returned. 2735 * SNaNs will get quietened before being returned. 2736 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 2737 * and maxNum() operations. 
min() and max() are the typical min/max 2738 * semantics provided by many CPUs which predate that specification. 2739 * 2740 * minnummag() and maxnummag() functions correspond to minNumMag() 2741 * and minNumMag() from the IEEE-754 2008. 2742 */ 2743 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin, 2744 bool ieee, bool ismag, float_status *s) 2745 { 2746 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 2747 if (ieee) { 2748 /* Takes two floating-point values `a' and `b', one of 2749 * which is a NaN, and returns the appropriate NaN 2750 * result. If either `a' or `b' is a signaling NaN, 2751 * the invalid exception is raised. 2752 */ 2753 if (is_snan(a.cls) || is_snan(b.cls)) { 2754 return pick_nan(a, b, s); 2755 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 2756 return b; 2757 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 2758 return a; 2759 } 2760 } 2761 return pick_nan(a, b, s); 2762 } else { 2763 int a_exp, b_exp; 2764 2765 switch (a.cls) { 2766 case float_class_normal: 2767 a_exp = a.exp; 2768 break; 2769 case float_class_inf: 2770 a_exp = INT_MAX; 2771 break; 2772 case float_class_zero: 2773 a_exp = INT_MIN; 2774 break; 2775 default: 2776 g_assert_not_reached(); 2777 break; 2778 } 2779 switch (b.cls) { 2780 case float_class_normal: 2781 b_exp = b.exp; 2782 break; 2783 case float_class_inf: 2784 b_exp = INT_MAX; 2785 break; 2786 case float_class_zero: 2787 b_exp = INT_MIN; 2788 break; 2789 default: 2790 g_assert_not_reached(); 2791 break; 2792 } 2793 2794 if (ismag && (a_exp != b_exp || a.frac != b.frac)) { 2795 bool a_less = a_exp < b_exp; 2796 if (a_exp == b_exp) { 2797 a_less = a.frac < b.frac; 2798 } 2799 return a_less ^ ismin ? b : a; 2800 } 2801 2802 if (a.sign == b.sign) { 2803 bool a_less = a_exp < b_exp; 2804 if (a_exp == b_exp) { 2805 a_less = a.frac < b.frac; 2806 } 2807 return a.sign ^ a_less ^ ismin ? b : a; 2808 } else { 2809 return a.sign ^ ismin ? 
b : a; 2810 } 2811 } 2812 } 2813 2814 #define MINMAX(sz, name, ismin, isiee, ismag) \ 2815 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 2816 float_status *s) \ 2817 { \ 2818 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2819 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2820 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 2821 \ 2822 return float ## sz ## _round_pack_canonical(pr, s); \ 2823 } 2824 2825 MINMAX(16, min, true, false, false) 2826 MINMAX(16, minnum, true, true, false) 2827 MINMAX(16, minnummag, true, true, true) 2828 MINMAX(16, max, false, false, false) 2829 MINMAX(16, maxnum, false, true, false) 2830 MINMAX(16, maxnummag, false, true, true) 2831 2832 MINMAX(32, min, true, false, false) 2833 MINMAX(32, minnum, true, true, false) 2834 MINMAX(32, minnummag, true, true, true) 2835 MINMAX(32, max, false, false, false) 2836 MINMAX(32, maxnum, false, true, false) 2837 MINMAX(32, maxnummag, false, true, true) 2838 2839 MINMAX(64, min, true, false, false) 2840 MINMAX(64, minnum, true, true, false) 2841 MINMAX(64, minnummag, true, true, true) 2842 MINMAX(64, max, false, false, false) 2843 MINMAX(64, maxnum, false, true, false) 2844 MINMAX(64, maxnummag, false, true, true) 2845 2846 #undef MINMAX 2847 2848 /* Floating point compare */ 2849 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet, 2850 float_status *s) 2851 { 2852 if (is_nan(a.cls) || is_nan(b.cls)) { 2853 if (!is_quiet || 2854 a.cls == float_class_snan || 2855 b.cls == float_class_snan) { 2856 s->float_exception_flags |= float_flag_invalid; 2857 } 2858 return float_relation_unordered; 2859 } 2860 2861 if (a.cls == float_class_zero) { 2862 if (b.cls == float_class_zero) { 2863 return float_relation_equal; 2864 } 2865 return b.sign ? float_relation_greater : float_relation_less; 2866 } else if (b.cls == float_class_zero) { 2867 return a.sign ? 
float_relation_less : float_relation_greater; 2868 } 2869 2870 /* The only really important thing about infinity is its sign. If 2871 * both are infinities the sign marks the smallest of the two. 2872 */ 2873 if (a.cls == float_class_inf) { 2874 if ((b.cls == float_class_inf) && (a.sign == b.sign)) { 2875 return float_relation_equal; 2876 } 2877 return a.sign ? float_relation_less : float_relation_greater; 2878 } else if (b.cls == float_class_inf) { 2879 return b.sign ? float_relation_greater : float_relation_less; 2880 } 2881 2882 if (a.sign != b.sign) { 2883 return a.sign ? float_relation_less : float_relation_greater; 2884 } 2885 2886 if (a.exp == b.exp) { 2887 if (a.frac == b.frac) { 2888 return float_relation_equal; 2889 } 2890 if (a.sign) { 2891 return a.frac > b.frac ? 2892 float_relation_less : float_relation_greater; 2893 } else { 2894 return a.frac > b.frac ? 2895 float_relation_greater : float_relation_less; 2896 } 2897 } else { 2898 if (a.sign) { 2899 return a.exp > b.exp ? float_relation_less : float_relation_greater; 2900 } else { 2901 return a.exp > b.exp ? float_relation_greater : float_relation_less; 2902 } 2903 } 2904 } 2905 2906 #define COMPARE(sz) \ 2907 int float ## sz ## _compare(float ## sz a, float ## sz b, \ 2908 float_status *s) \ 2909 { \ 2910 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2911 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2912 return compare_floats(pa, pb, false, s); \ 2913 } \ 2914 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \ 2915 float_status *s) \ 2916 { \ 2917 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2918 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2919 return compare_floats(pa, pb, true, s); \ 2920 } 2921 2922 COMPARE(16) 2923 COMPARE(32) 2924 COMPARE(64) 2925 2926 #undef COMPARE 2927 2928 /* Multiply A by 2 raised to the power N. 
*/ 2929 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s) 2930 { 2931 if (unlikely(is_nan(a.cls))) { 2932 return return_nan(a, s); 2933 } 2934 if (a.cls == float_class_normal) { 2935 /* The largest float type (even though not supported by FloatParts) 2936 * is float128, which has a 15 bit exponent. Bounding N to 16 bits 2937 * still allows rounding to infinity, without allowing overflow 2938 * within the int32_t that backs FloatParts.exp. 2939 */ 2940 n = MIN(MAX(n, -0x10000), 0x10000); 2941 a.exp += n; 2942 } 2943 return a; 2944 } 2945 2946 float16 float16_scalbn(float16 a, int n, float_status *status) 2947 { 2948 FloatParts pa = float16_unpack_canonical(a, status); 2949 FloatParts pr = scalbn_decomposed(pa, n, status); 2950 return float16_round_pack_canonical(pr, status); 2951 } 2952 2953 float32 float32_scalbn(float32 a, int n, float_status *status) 2954 { 2955 FloatParts pa = float32_unpack_canonical(a, status); 2956 FloatParts pr = scalbn_decomposed(pa, n, status); 2957 return float32_round_pack_canonical(pr, status); 2958 } 2959 2960 float64 float64_scalbn(float64 a, int n, float_status *status) 2961 { 2962 FloatParts pa = float64_unpack_canonical(a, status); 2963 FloatParts pr = scalbn_decomposed(pa, n, status); 2964 return float64_round_pack_canonical(pr, status); 2965 } 2966 2967 /* 2968 * Square Root 2969 * 2970 * The old softfloat code did an approximation step before zeroing in 2971 * on the final result. However for simpleness we just compute the 2972 * square root by iterating down from the implicit bit to enough extra 2973 * bits to ensure we get a correctly rounded result. 2974 * 2975 * This does mean however the calculation is slower than before, 2976 * especially for 64 bit floats. 
2977 */ 2978 2979 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p) 2980 { 2981 uint64_t a_frac, r_frac, s_frac; 2982 int bit, last_bit; 2983 2984 if (is_nan(a.cls)) { 2985 return return_nan(a, s); 2986 } 2987 if (a.cls == float_class_zero) { 2988 return a; /* sqrt(+-0) = +-0 */ 2989 } 2990 if (a.sign) { 2991 s->float_exception_flags |= float_flag_invalid; 2992 return parts_default_nan(s); 2993 } 2994 if (a.cls == float_class_inf) { 2995 return a; /* sqrt(+inf) = +inf */ 2996 } 2997 2998 assert(a.cls == float_class_normal); 2999 3000 /* We need two overflow bits at the top. Adding room for that is a 3001 * right shift. If the exponent is odd, we can discard the low bit 3002 * by multiplying the fraction by 2; that's a left shift. Combine 3003 * those and we shift right if the exponent is even. 3004 */ 3005 a_frac = a.frac; 3006 if (!(a.exp & 1)) { 3007 a_frac >>= 1; 3008 } 3009 a.exp >>= 1; 3010 3011 /* Bit-by-bit computation of sqrt. */ 3012 r_frac = 0; 3013 s_frac = 0; 3014 3015 /* Iterate from implicit bit down to the 3 extra bits to compute a 3016 * properly rounded result. Remember we've inserted one more bit 3017 * at the top, so these positions are one less. 3018 */ 3019 bit = DECOMPOSED_BINARY_POINT - 1; 3020 last_bit = MAX(p->frac_shift - 4, 0); 3021 do { 3022 uint64_t q = 1ULL << bit; 3023 uint64_t t_frac = s_frac + q; 3024 if (t_frac <= a_frac) { 3025 s_frac = t_frac + q; 3026 a_frac -= t_frac; 3027 r_frac += q; 3028 } 3029 a_frac <<= 1; 3030 } while (--bit >= last_bit); 3031 3032 /* Undo the right shift done above. If there is any remaining 3033 * fraction, the result is inexact. Set the sticky bit. 
3034 */ 3035 a.frac = (r_frac << 1) + (a_frac != 0); 3036 3037 return a; 3038 } 3039 3040 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status) 3041 { 3042 FloatParts pa = float16_unpack_canonical(a, status); 3043 FloatParts pr = sqrt_float(pa, status, &float16_params); 3044 return float16_round_pack_canonical(pr, status); 3045 } 3046 3047 static float32 QEMU_SOFTFLOAT_ATTR 3048 soft_f32_sqrt(float32 a, float_status *status) 3049 { 3050 FloatParts pa = float32_unpack_canonical(a, status); 3051 FloatParts pr = sqrt_float(pa, status, &float32_params); 3052 return float32_round_pack_canonical(pr, status); 3053 } 3054 3055 static float64 QEMU_SOFTFLOAT_ATTR 3056 soft_f64_sqrt(float64 a, float_status *status) 3057 { 3058 FloatParts pa = float64_unpack_canonical(a, status); 3059 FloatParts pr = sqrt_float(pa, status, &float64_params); 3060 return float64_round_pack_canonical(pr, status); 3061 } 3062 3063 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s) 3064 { 3065 union_float32 ua, ur; 3066 3067 ua.s = xa; 3068 if (unlikely(!can_use_fpu(s))) { 3069 goto soft; 3070 } 3071 3072 float32_input_flush1(&ua.s, s); 3073 if (QEMU_HARDFLOAT_1F32_USE_FP) { 3074 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3075 fpclassify(ua.h) == FP_ZERO) || 3076 signbit(ua.h))) { 3077 goto soft; 3078 } 3079 } else if (unlikely(!float32_is_zero_or_normal(ua.s) || 3080 float32_is_neg(ua.s))) { 3081 goto soft; 3082 } 3083 ur.h = sqrtf(ua.h); 3084 return ur.s; 3085 3086 soft: 3087 return soft_f32_sqrt(ua.s, s); 3088 } 3089 3090 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s) 3091 { 3092 union_float64 ua, ur; 3093 3094 ua.s = xa; 3095 if (unlikely(!can_use_fpu(s))) { 3096 goto soft; 3097 } 3098 3099 float64_input_flush1(&ua.s, s); 3100 if (QEMU_HARDFLOAT_1F64_USE_FP) { 3101 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3102 fpclassify(ua.h) == FP_ZERO) || 3103 signbit(ua.h))) { 3104 goto soft; 3105 } 3106 } else if 
(unlikely(!float64_is_zero_or_normal(ua.s) || 3107 float64_is_neg(ua.s))) { 3108 goto soft; 3109 } 3110 ur.h = sqrt(ua.h); 3111 return ur.s; 3112 3113 soft: 3114 return soft_f64_sqrt(ua.s, s); 3115 } 3116 3117 /*---------------------------------------------------------------------------- 3118 | The pattern for a default generated NaN. 3119 *----------------------------------------------------------------------------*/ 3120 3121 float16 float16_default_nan(float_status *status) 3122 { 3123 FloatParts p = parts_default_nan(status); 3124 p.frac >>= float16_params.frac_shift; 3125 return float16_pack_raw(p); 3126 } 3127 3128 float32 float32_default_nan(float_status *status) 3129 { 3130 FloatParts p = parts_default_nan(status); 3131 p.frac >>= float32_params.frac_shift; 3132 return float32_pack_raw(p); 3133 } 3134 3135 float64 float64_default_nan(float_status *status) 3136 { 3137 FloatParts p = parts_default_nan(status); 3138 p.frac >>= float64_params.frac_shift; 3139 return float64_pack_raw(p); 3140 } 3141 3142 float128 float128_default_nan(float_status *status) 3143 { 3144 FloatParts p = parts_default_nan(status); 3145 float128 r; 3146 3147 /* Extrapolate from the choices made by parts_default_nan to fill 3148 * in the quad-floating format. If the low bit is set, assume we 3149 * want to set all non-snan bits. 3150 */ 3151 r.low = -(p.frac & 1); 3152 r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48); 3153 r.high |= LIT64(0x7FFF000000000000); 3154 r.high |= (uint64_t)p.sign << 63; 3155 3156 return r; 3157 } 3158 3159 /*---------------------------------------------------------------------------- 3160 | Returns a quiet NaN from a signalling NaN for the floating point value `a'. 
3161 *----------------------------------------------------------------------------*/ 3162 3163 float16 float16_silence_nan(float16 a, float_status *status) 3164 { 3165 FloatParts p = float16_unpack_raw(a); 3166 p.frac <<= float16_params.frac_shift; 3167 p = parts_silence_nan(p, status); 3168 p.frac >>= float16_params.frac_shift; 3169 return float16_pack_raw(p); 3170 } 3171 3172 float32 float32_silence_nan(float32 a, float_status *status) 3173 { 3174 FloatParts p = float32_unpack_raw(a); 3175 p.frac <<= float32_params.frac_shift; 3176 p = parts_silence_nan(p, status); 3177 p.frac >>= float32_params.frac_shift; 3178 return float32_pack_raw(p); 3179 } 3180 3181 float64 float64_silence_nan(float64 a, float_status *status) 3182 { 3183 FloatParts p = float64_unpack_raw(a); 3184 p.frac <<= float64_params.frac_shift; 3185 p = parts_silence_nan(p, status); 3186 p.frac >>= float64_params.frac_shift; 3187 return float64_pack_raw(p); 3188 } 3189 3190 /*---------------------------------------------------------------------------- 3191 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 3192 | and 7, and returns the properly rounded 32-bit integer corresponding to the 3193 | input. If `zSign' is 1, the input is negated before being converted to an 3194 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 3195 | is simply rounded to an integer, with the inexact exception raised if the 3196 | input cannot be represented exactly as an integer. However, if the fixed- 3197 | point input is too large, the invalid exception is raised and the largest 3198 | positive or negative integer is returned. 
3199 *----------------------------------------------------------------------------*/ 3200 3201 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status) 3202 { 3203 int8_t roundingMode; 3204 flag roundNearestEven; 3205 int8_t roundIncrement, roundBits; 3206 int32_t z; 3207 3208 roundingMode = status->float_rounding_mode; 3209 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3210 switch (roundingMode) { 3211 case float_round_nearest_even: 3212 case float_round_ties_away: 3213 roundIncrement = 0x40; 3214 break; 3215 case float_round_to_zero: 3216 roundIncrement = 0; 3217 break; 3218 case float_round_up: 3219 roundIncrement = zSign ? 0 : 0x7f; 3220 break; 3221 case float_round_down: 3222 roundIncrement = zSign ? 0x7f : 0; 3223 break; 3224 default: 3225 abort(); 3226 } 3227 roundBits = absZ & 0x7F; 3228 absZ = ( absZ + roundIncrement )>>7; 3229 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 3230 z = absZ; 3231 if ( zSign ) z = - z; 3232 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 3233 float_raise(float_flag_invalid, status); 3234 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 3235 } 3236 if (roundBits) { 3237 status->float_exception_flags |= float_flag_inexact; 3238 } 3239 return z; 3240 3241 } 3242 3243 /*---------------------------------------------------------------------------- 3244 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 3245 | `absZ1', with binary point between bits 63 and 64 (between the input words), 3246 | and returns the properly rounded 64-bit integer corresponding to the input. 3247 | If `zSign' is 1, the input is negated before being converted to an integer. 3248 | Ordinarily, the fixed-point input is simply rounded to an integer, with 3249 | the inexact exception raised if the input cannot be represented exactly as 3250 | an integer. 
However, if the fixed-point input is too large, the invalid
| exception is raised and the largest positive or negative integer is
| returned.
*----------------------------------------------------------------------------*/

static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
                               float_status *status)
{
    int8_t mode = status->float_rounding_mode;
    flag nearestEven = (mode == float_round_nearest_even);
    flag roundUp;
    int64_t result;

    /* absZ1 holds the fraction; its top bit is the half-way bit. */
    switch (mode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundUp = ((int64_t) absZ1 < 0);
        break;
    case float_round_to_zero:
        roundUp = 0;
        break;
    case float_round_up:
        roundUp = !zSign && absZ1;
        break;
    case float_round_down:
        roundUp = zSign && absZ1;
        break;
    default:
        abort();
    }

    if (roundUp) {
        ++absZ0;
        if (absZ0 == 0) {
            /* The increment wrapped the integer part. */
            goto overflow;
        }
        /* Exact tie in nearest-even mode: force the result even. */
        absZ0 &= ~(((uint64_t) (absZ1 << 1) == 0) & nearestEven);
    }

    result = absZ0;
    if (zSign) {
        result = -result;
    }
    /* A non-zero result whose sign disagrees with zSign did not fit. */
    if (result && ((result < 0) ^ zSign)) {
        goto overflow;
    }
    if (absZ1) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return result;

 overflow:
    float_raise(float_flag_invalid, status);
    return zSign ? (int64_t) LIT64(0x8000000000000000)
                 : LIT64(0x7FFFFFFFFFFFFFFF);
}

/*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit unsigned integer corresponding to the
| input. Ordinarily, the fixed-point input is simply rounded to an integer,
| with the inexact exception raised if the input cannot be represented exactly
| as an integer.
However, if the fixed-point input is too large, the invalid
| exception is raised and the largest unsigned integer is returned.
*----------------------------------------------------------------------------*/

/*
 * NOTE(review): the return type is the signed int64_t although the value
 * produced is an unsigned 64-bit quantity (including 0xFFFFFFFFFFFFFFFF on
 * overflow) -- presumably callers convert it back to uint64_t; confirm
 * before changing the signature.
 */
static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
                                uint64_t absZ1, float_status *status)
{
    int8_t mode = status->float_rounding_mode;
    flag nearestEven = (mode == float_round_nearest_even);
    flag roundUp;

    /* absZ1 holds the fraction; its top bit is the half-way bit. */
    if (mode == float_round_nearest_even || mode == float_round_ties_away) {
        roundUp = ((int64_t)absZ1 < 0);
    } else if (mode == float_round_to_zero) {
        roundUp = 0;
    } else if (mode == float_round_up) {
        roundUp = !zSign && absZ1;
    } else if (mode == float_round_down) {
        roundUp = zSign && absZ1;
    } else {
        abort();
    }

    if (roundUp) {
        ++absZ0;
        if (absZ0 == 0) {
            /* The increment wrapped: too large for 64 unsigned bits. */
            float_raise(float_flag_invalid, status);
            return LIT64(0xFFFFFFFFFFFFFFFF);
        }
        /* Exact tie in nearest-even mode: force the result even. */
        absZ0 &= ~(((uint64_t)(absZ1 << 1) == 0) & nearestEven);
    }

    /* A negative input that does not round to zero is invalid. */
    if (zSign && absZ0) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    if (absZ1) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return absZ0;
}

/*----------------------------------------------------------------------------
| If `a' is denormal and we are in flush-to-zero mode then set the
| input-denormal exception and return zero. Otherwise just return the value.
3360 *----------------------------------------------------------------------------*/ 3361 float32 float32_squash_input_denormal(float32 a, float_status *status) 3362 { 3363 if (status->flush_inputs_to_zero) { 3364 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 3365 float_raise(float_flag_input_denormal, status); 3366 return make_float32(float32_val(a) & 0x80000000); 3367 } 3368 } 3369 return a; 3370 } 3371 3372 /*---------------------------------------------------------------------------- 3373 | Normalizes the subnormal single-precision floating-point value represented 3374 | by the denormalized significand `aSig'. The normalized exponent and 3375 | significand are stored at the locations pointed to by `zExpPtr' and 3376 | `zSigPtr', respectively. 3377 *----------------------------------------------------------------------------*/ 3378 3379 static void 3380 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 3381 { 3382 int8_t shiftCount; 3383 3384 shiftCount = clz32(aSig) - 8; 3385 *zSigPtr = aSig<<shiftCount; 3386 *zExpPtr = 1 - shiftCount; 3387 3388 } 3389 3390 /*---------------------------------------------------------------------------- 3391 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3392 | and significand `zSig', and returns the proper single-precision floating- 3393 | point value corresponding to the abstract input. Ordinarily, the abstract 3394 | value is simply rounded and packed into the single-precision format, with 3395 | the inexact exception raised if the abstract input cannot be represented 3396 | exactly. However, if the abstract value is too large, the overflow and 3397 | inexact exceptions are raised and an infinity or maximal finite value is 3398 | returned. 
If the abstract value is too small, the input value is rounded to 3399 | a subnormal number, and the underflow and inexact exceptions are raised if 3400 | the abstract input cannot be represented exactly as a subnormal single- 3401 | precision floating-point number. 3402 | The input significand `zSig' has its binary point between bits 30 3403 | and 29, which is 7 bits to the left of the usual location. This shifted 3404 | significand must be normalized or smaller. If `zSig' is not normalized, 3405 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3406 | and it must not require rounding. In the usual case that `zSig' is 3407 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3408 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3409 | Binary Floating-Point Arithmetic. 3410 *----------------------------------------------------------------------------*/ 3411 3412 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 3413 float_status *status) 3414 { 3415 int8_t roundingMode; 3416 flag roundNearestEven; 3417 int8_t roundIncrement, roundBits; 3418 flag isTiny; 3419 3420 roundingMode = status->float_rounding_mode; 3421 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3422 switch (roundingMode) { 3423 case float_round_nearest_even: 3424 case float_round_ties_away: 3425 roundIncrement = 0x40; 3426 break; 3427 case float_round_to_zero: 3428 roundIncrement = 0; 3429 break; 3430 case float_round_up: 3431 roundIncrement = zSign ? 0 : 0x7f; 3432 break; 3433 case float_round_down: 3434 roundIncrement = zSign ? 
0x7f : 0; 3435 break; 3436 default: 3437 abort(); 3438 break; 3439 } 3440 roundBits = zSig & 0x7F; 3441 if ( 0xFD <= (uint16_t) zExp ) { 3442 if ( ( 0xFD < zExp ) 3443 || ( ( zExp == 0xFD ) 3444 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 3445 ) { 3446 float_raise(float_flag_overflow | float_flag_inexact, status); 3447 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 3448 } 3449 if ( zExp < 0 ) { 3450 if (status->flush_to_zero) { 3451 float_raise(float_flag_output_denormal, status); 3452 return packFloat32(zSign, 0, 0); 3453 } 3454 isTiny = 3455 (status->float_detect_tininess 3456 == float_tininess_before_rounding) 3457 || ( zExp < -1 ) 3458 || ( zSig + roundIncrement < 0x80000000 ); 3459 shift32RightJamming( zSig, - zExp, &zSig ); 3460 zExp = 0; 3461 roundBits = zSig & 0x7F; 3462 if (isTiny && roundBits) { 3463 float_raise(float_flag_underflow, status); 3464 } 3465 } 3466 } 3467 if (roundBits) { 3468 status->float_exception_flags |= float_flag_inexact; 3469 } 3470 zSig = ( zSig + roundIncrement )>>7; 3471 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 3472 if ( zSig == 0 ) zExp = 0; 3473 return packFloat32( zSign, zExp, zSig ); 3474 3475 } 3476 3477 /*---------------------------------------------------------------------------- 3478 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3479 | and significand `zSig', and returns the proper single-precision floating- 3480 | point value corresponding to the abstract input. This routine is just like 3481 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 3482 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 3483 | floating-point exponent. 
3484 *----------------------------------------------------------------------------*/ 3485 3486 static float32 3487 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 3488 float_status *status) 3489 { 3490 int8_t shiftCount; 3491 3492 shiftCount = clz32(zSig) - 1; 3493 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 3494 status); 3495 3496 } 3497 3498 /*---------------------------------------------------------------------------- 3499 | If `a' is denormal and we are in flush-to-zero mode then set the 3500 | input-denormal exception and return zero. Otherwise just return the value. 3501 *----------------------------------------------------------------------------*/ 3502 float64 float64_squash_input_denormal(float64 a, float_status *status) 3503 { 3504 if (status->flush_inputs_to_zero) { 3505 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 3506 float_raise(float_flag_input_denormal, status); 3507 return make_float64(float64_val(a) & (1ULL << 63)); 3508 } 3509 } 3510 return a; 3511 } 3512 3513 /*---------------------------------------------------------------------------- 3514 | Normalizes the subnormal double-precision floating-point value represented 3515 | by the denormalized significand `aSig'. The normalized exponent and 3516 | significand are stored at the locations pointed to by `zExpPtr' and 3517 | `zSigPtr', respectively. 3518 *----------------------------------------------------------------------------*/ 3519 3520 static void 3521 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 3522 { 3523 int8_t shiftCount; 3524 3525 shiftCount = clz64(aSig) - 11; 3526 *zSigPtr = aSig<<shiftCount; 3527 *zExpPtr = 1 - shiftCount; 3528 3529 } 3530 3531 /*---------------------------------------------------------------------------- 3532 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3533 | double-precision floating-point value, returning the result. 
After being 3534 | shifted into the proper positions, the three fields are simply added 3535 | together to form the result. This means that any integer portion of `zSig' 3536 | will be added into the exponent. Since a properly normalized significand 3537 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3538 | than the desired result exponent whenever `zSig' is a complete, normalized 3539 | significand. 3540 *----------------------------------------------------------------------------*/ 3541 3542 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 3543 { 3544 3545 return make_float64( 3546 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 3547 3548 } 3549 3550 /*---------------------------------------------------------------------------- 3551 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3552 | and significand `zSig', and returns the proper double-precision floating- 3553 | point value corresponding to the abstract input. Ordinarily, the abstract 3554 | value is simply rounded and packed into the double-precision format, with 3555 | the inexact exception raised if the abstract input cannot be represented 3556 | exactly. However, if the abstract value is too large, the overflow and 3557 | inexact exceptions are raised and an infinity or maximal finite value is 3558 | returned. If the abstract value is too small, the input value is rounded to 3559 | a subnormal number, and the underflow and inexact exceptions are raised if 3560 | the abstract input cannot be represented exactly as a subnormal double- 3561 | precision floating-point number. 3562 | The input significand `zSig' has its binary point between bits 62 3563 | and 61, which is 10 bits to the left of the usual location. This shifted 3564 | significand must be normalized or smaller. 
If `zSig' is not normalized, 3565 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3566 | and it must not require rounding. In the usual case that `zSig' is 3567 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3568 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3569 | Binary Floating-Point Arithmetic. 3570 *----------------------------------------------------------------------------*/ 3571 3572 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 3573 float_status *status) 3574 { 3575 int8_t roundingMode; 3576 flag roundNearestEven; 3577 int roundIncrement, roundBits; 3578 flag isTiny; 3579 3580 roundingMode = status->float_rounding_mode; 3581 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3582 switch (roundingMode) { 3583 case float_round_nearest_even: 3584 case float_round_ties_away: 3585 roundIncrement = 0x200; 3586 break; 3587 case float_round_to_zero: 3588 roundIncrement = 0; 3589 break; 3590 case float_round_up: 3591 roundIncrement = zSign ? 0 : 0x3ff; 3592 break; 3593 case float_round_down: 3594 roundIncrement = zSign ? 0x3ff : 0; 3595 break; 3596 case float_round_to_odd: 3597 roundIncrement = (zSig & 0x400) ? 
0 : 0x3ff; 3598 break; 3599 default: 3600 abort(); 3601 } 3602 roundBits = zSig & 0x3FF; 3603 if ( 0x7FD <= (uint16_t) zExp ) { 3604 if ( ( 0x7FD < zExp ) 3605 || ( ( zExp == 0x7FD ) 3606 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 3607 ) { 3608 bool overflow_to_inf = roundingMode != float_round_to_odd && 3609 roundIncrement != 0; 3610 float_raise(float_flag_overflow | float_flag_inexact, status); 3611 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 3612 } 3613 if ( zExp < 0 ) { 3614 if (status->flush_to_zero) { 3615 float_raise(float_flag_output_denormal, status); 3616 return packFloat64(zSign, 0, 0); 3617 } 3618 isTiny = 3619 (status->float_detect_tininess 3620 == float_tininess_before_rounding) 3621 || ( zExp < -1 ) 3622 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 3623 shift64RightJamming( zSig, - zExp, &zSig ); 3624 zExp = 0; 3625 roundBits = zSig & 0x3FF; 3626 if (isTiny && roundBits) { 3627 float_raise(float_flag_underflow, status); 3628 } 3629 if (roundingMode == float_round_to_odd) { 3630 /* 3631 * For round-to-odd case, the roundIncrement depends on 3632 * zSig which just changed. 3633 */ 3634 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 3635 } 3636 } 3637 } 3638 if (roundBits) { 3639 status->float_exception_flags |= float_flag_inexact; 3640 } 3641 zSig = ( zSig + roundIncrement )>>10; 3642 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 3643 if ( zSig == 0 ) zExp = 0; 3644 return packFloat64( zSign, zExp, zSig ); 3645 3646 } 3647 3648 /*---------------------------------------------------------------------------- 3649 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3650 | and significand `zSig', and returns the proper double-precision floating- 3651 | point value corresponding to the abstract input. This routine is just like 3652 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 
3653 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 3654 | floating-point exponent. 3655 *----------------------------------------------------------------------------*/ 3656 3657 static float64 3658 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 3659 float_status *status) 3660 { 3661 int8_t shiftCount; 3662 3663 shiftCount = clz64(zSig) - 1; 3664 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 3665 status); 3666 3667 } 3668 3669 /*---------------------------------------------------------------------------- 3670 | Normalizes the subnormal extended double-precision floating-point value 3671 | represented by the denormalized significand `aSig'. The normalized exponent 3672 | and significand are stored at the locations pointed to by `zExpPtr' and 3673 | `zSigPtr', respectively. 3674 *----------------------------------------------------------------------------*/ 3675 3676 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 3677 uint64_t *zSigPtr) 3678 { 3679 int8_t shiftCount; 3680 3681 shiftCount = clz64(aSig); 3682 *zSigPtr = aSig<<shiftCount; 3683 *zExpPtr = 1 - shiftCount; 3684 } 3685 3686 /*---------------------------------------------------------------------------- 3687 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3688 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 3689 | and returns the proper extended double-precision floating-point value 3690 | corresponding to the abstract input. Ordinarily, the abstract value is 3691 | rounded and packed into the extended double-precision format, with the 3692 | inexact exception raised if the abstract input cannot be represented 3693 | exactly. However, if the abstract value is too large, the overflow and 3694 | inexact exceptions are raised and an infinity or maximal finite value is 3695 | returned. 
If the abstract value is too small, the input value is rounded to 3696 | a subnormal number, and the underflow and inexact exceptions are raised if 3697 | the abstract input cannot be represented exactly as a subnormal extended 3698 | double-precision floating-point number. 3699 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 3700 | number of bits as single or double precision, respectively. Otherwise, the 3701 | result is rounded to the full precision of the extended double-precision 3702 | format. 3703 | The input significand must be normalized or smaller. If the input 3704 | significand is not normalized, `zExp' must be 0; in that case, the result 3705 | returned is a subnormal number, and it must not require rounding. The 3706 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 3707 | Floating-Point Arithmetic. 3708 *----------------------------------------------------------------------------*/ 3709 3710 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign, 3711 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 3712 float_status *status) 3713 { 3714 int8_t roundingMode; 3715 flag roundNearestEven, increment, isTiny; 3716 int64_t roundIncrement, roundMask, roundBits; 3717 3718 roundingMode = status->float_rounding_mode; 3719 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3720 if ( roundingPrecision == 80 ) goto precision80; 3721 if ( roundingPrecision == 64 ) { 3722 roundIncrement = LIT64( 0x0000000000000400 ); 3723 roundMask = LIT64( 0x00000000000007FF ); 3724 } 3725 else if ( roundingPrecision == 32 ) { 3726 roundIncrement = LIT64( 0x0000008000000000 ); 3727 roundMask = LIT64( 0x000000FFFFFFFFFF ); 3728 } 3729 else { 3730 goto precision80; 3731 } 3732 zSig0 |= ( zSig1 != 0 ); 3733 switch (roundingMode) { 3734 case float_round_nearest_even: 3735 case float_round_ties_away: 3736 break; 3737 case float_round_to_zero: 3738 roundIncrement = 0; 3739 break; 3740 case float_round_up: 3741 
roundIncrement = zSign ? 0 : roundMask; 3742 break; 3743 case float_round_down: 3744 roundIncrement = zSign ? roundMask : 0; 3745 break; 3746 default: 3747 abort(); 3748 } 3749 roundBits = zSig0 & roundMask; 3750 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 3751 if ( ( 0x7FFE < zExp ) 3752 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 3753 ) { 3754 goto overflow; 3755 } 3756 if ( zExp <= 0 ) { 3757 if (status->flush_to_zero) { 3758 float_raise(float_flag_output_denormal, status); 3759 return packFloatx80(zSign, 0, 0); 3760 } 3761 isTiny = 3762 (status->float_detect_tininess 3763 == float_tininess_before_rounding) 3764 || ( zExp < 0 ) 3765 || ( zSig0 <= zSig0 + roundIncrement ); 3766 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 3767 zExp = 0; 3768 roundBits = zSig0 & roundMask; 3769 if (isTiny && roundBits) { 3770 float_raise(float_flag_underflow, status); 3771 } 3772 if (roundBits) { 3773 status->float_exception_flags |= float_flag_inexact; 3774 } 3775 zSig0 += roundIncrement; 3776 if ( (int64_t) zSig0 < 0 ) zExp = 1; 3777 roundIncrement = roundMask + 1; 3778 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 3779 roundMask |= roundIncrement; 3780 } 3781 zSig0 &= ~ roundMask; 3782 return packFloatx80( zSign, zExp, zSig0 ); 3783 } 3784 } 3785 if (roundBits) { 3786 status->float_exception_flags |= float_flag_inexact; 3787 } 3788 zSig0 += roundIncrement; 3789 if ( zSig0 < roundIncrement ) { 3790 ++zExp; 3791 zSig0 = LIT64( 0x8000000000000000 ); 3792 } 3793 roundIncrement = roundMask + 1; 3794 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 3795 roundMask |= roundIncrement; 3796 } 3797 zSig0 &= ~ roundMask; 3798 if ( zSig0 == 0 ) zExp = 0; 3799 return packFloatx80( zSign, zExp, zSig0 ); 3800 precision80: 3801 switch (roundingMode) { 3802 case float_round_nearest_even: 3803 case float_round_ties_away: 3804 increment = ((int64_t)zSig1 < 0); 3805 break; 3806 case float_round_to_zero: 3807 increment = 0; 3808 break; 3809 case 
float_round_up: 3810 increment = !zSign && zSig1; 3811 break; 3812 case float_round_down: 3813 increment = zSign && zSig1; 3814 break; 3815 default: 3816 abort(); 3817 } 3818 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 3819 if ( ( 0x7FFE < zExp ) 3820 || ( ( zExp == 0x7FFE ) 3821 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 3822 && increment 3823 ) 3824 ) { 3825 roundMask = 0; 3826 overflow: 3827 float_raise(float_flag_overflow | float_flag_inexact, status); 3828 if ( ( roundingMode == float_round_to_zero ) 3829 || ( zSign && ( roundingMode == float_round_up ) ) 3830 || ( ! zSign && ( roundingMode == float_round_down ) ) 3831 ) { 3832 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 3833 } 3834 return packFloatx80(zSign, 3835 floatx80_infinity_high, 3836 floatx80_infinity_low); 3837 } 3838 if ( zExp <= 0 ) { 3839 isTiny = 3840 (status->float_detect_tininess 3841 == float_tininess_before_rounding) 3842 || ( zExp < 0 ) 3843 || ! increment 3844 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 3845 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 3846 zExp = 0; 3847 if (isTiny && zSig1) { 3848 float_raise(float_flag_underflow, status); 3849 } 3850 if (zSig1) { 3851 status->float_exception_flags |= float_flag_inexact; 3852 } 3853 switch (roundingMode) { 3854 case float_round_nearest_even: 3855 case float_round_ties_away: 3856 increment = ((int64_t)zSig1 < 0); 3857 break; 3858 case float_round_to_zero: 3859 increment = 0; 3860 break; 3861 case float_round_up: 3862 increment = !zSign && zSig1; 3863 break; 3864 case float_round_down: 3865 increment = zSign && zSig1; 3866 break; 3867 default: 3868 abort(); 3869 } 3870 if ( increment ) { 3871 ++zSig0; 3872 zSig0 &= 3873 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 3874 if ( (int64_t) zSig0 < 0 ) zExp = 1; 3875 } 3876 return packFloatx80( zSign, zExp, zSig0 ); 3877 } 3878 } 3879 if (zSig1) { 3880 status->float_exception_flags |= float_flag_inexact; 3881 } 3882 if ( increment ) { 3883 ++zSig0; 3884 
if ( zSig0 == 0 ) { 3885 ++zExp; 3886 zSig0 = LIT64( 0x8000000000000000 ); 3887 } 3888 else { 3889 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 3890 } 3891 } 3892 else { 3893 if ( zSig0 == 0 ) zExp = 0; 3894 } 3895 return packFloatx80( zSign, zExp, zSig0 ); 3896 3897 } 3898 3899 /*---------------------------------------------------------------------------- 3900 | Takes an abstract floating-point value having sign `zSign', exponent 3901 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 3902 | and returns the proper extended double-precision floating-point value 3903 | corresponding to the abstract input. This routine is just like 3904 | `roundAndPackFloatx80' except that the input significand does not have to be 3905 | normalized. 3906 *----------------------------------------------------------------------------*/ 3907 3908 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 3909 flag zSign, int32_t zExp, 3910 uint64_t zSig0, uint64_t zSig1, 3911 float_status *status) 3912 { 3913 int8_t shiftCount; 3914 3915 if ( zSig0 == 0 ) { 3916 zSig0 = zSig1; 3917 zSig1 = 0; 3918 zExp -= 64; 3919 } 3920 shiftCount = clz64(zSig0); 3921 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 3922 zExp -= shiftCount; 3923 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 3924 zSig0, zSig1, status); 3925 3926 } 3927 3928 /*---------------------------------------------------------------------------- 3929 | Returns the least-significant 64 fraction bits of the quadruple-precision 3930 | floating-point value `a'. 3931 *----------------------------------------------------------------------------*/ 3932 3933 static inline uint64_t extractFloat128Frac1( float128 a ) 3934 { 3935 3936 return a.low; 3937 3938 } 3939 3940 /*---------------------------------------------------------------------------- 3941 | Returns the most-significant 48 fraction bits of the quadruple-precision 3942 | floating-point value `a'. 
3943 *----------------------------------------------------------------------------*/ 3944 3945 static inline uint64_t extractFloat128Frac0( float128 a ) 3946 { 3947 3948 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 3949 3950 } 3951 3952 /*---------------------------------------------------------------------------- 3953 | Returns the exponent bits of the quadruple-precision floating-point value 3954 | `a'. 3955 *----------------------------------------------------------------------------*/ 3956 3957 static inline int32_t extractFloat128Exp( float128 a ) 3958 { 3959 3960 return ( a.high>>48 ) & 0x7FFF; 3961 3962 } 3963 3964 /*---------------------------------------------------------------------------- 3965 | Returns the sign bit of the quadruple-precision floating-point value `a'. 3966 *----------------------------------------------------------------------------*/ 3967 3968 static inline flag extractFloat128Sign( float128 a ) 3969 { 3970 3971 return a.high>>63; 3972 3973 } 3974 3975 /*---------------------------------------------------------------------------- 3976 | Normalizes the subnormal quadruple-precision floating-point value 3977 | represented by the denormalized significand formed by the concatenation of 3978 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 3979 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 3980 | significand are stored at the location pointed to by `zSig0Ptr', and the 3981 | least significant 64 bits of the normalized significand are stored at the 3982 | location pointed to by `zSig1Ptr'. 
3983 *----------------------------------------------------------------------------*/ 3984 3985 static void 3986 normalizeFloat128Subnormal( 3987 uint64_t aSig0, 3988 uint64_t aSig1, 3989 int32_t *zExpPtr, 3990 uint64_t *zSig0Ptr, 3991 uint64_t *zSig1Ptr 3992 ) 3993 { 3994 int8_t shiftCount; 3995 3996 if ( aSig0 == 0 ) { 3997 shiftCount = clz64(aSig1) - 15; 3998 if ( shiftCount < 0 ) { 3999 *zSig0Ptr = aSig1>>( - shiftCount ); 4000 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 4001 } 4002 else { 4003 *zSig0Ptr = aSig1<<shiftCount; 4004 *zSig1Ptr = 0; 4005 } 4006 *zExpPtr = - shiftCount - 63; 4007 } 4008 else { 4009 shiftCount = clz64(aSig0) - 15; 4010 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 4011 *zExpPtr = 1 - shiftCount; 4012 } 4013 4014 } 4015 4016 /*---------------------------------------------------------------------------- 4017 | Packs the sign `zSign', the exponent `zExp', and the significand formed 4018 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 4019 | floating-point value, returning the result. After being shifted into the 4020 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 4021 | added together to form the most significant 32 bits of the result. This 4022 | means that any integer portion of `zSig0' will be added into the exponent. 4023 | Since a properly normalized significand will have an integer portion equal 4024 | to 1, the `zExp' input should be 1 less than the desired result exponent 4025 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 4026 | significand. 
4027 *----------------------------------------------------------------------------*/ 4028 4029 static inline float128 4030 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 4031 { 4032 float128 z; 4033 4034 z.low = zSig1; 4035 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 4036 return z; 4037 4038 } 4039 4040 /*---------------------------------------------------------------------------- 4041 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4042 | and extended significand formed by the concatenation of `zSig0', `zSig1', 4043 | and `zSig2', and returns the proper quadruple-precision floating-point value 4044 | corresponding to the abstract input. Ordinarily, the abstract value is 4045 | simply rounded and packed into the quadruple-precision format, with the 4046 | inexact exception raised if the abstract input cannot be represented 4047 | exactly. However, if the abstract value is too large, the overflow and 4048 | inexact exceptions are raised and an infinity or maximal finite value is 4049 | returned. If the abstract value is too small, the input value is rounded to 4050 | a subnormal number, and the underflow and inexact exceptions are raised if 4051 | the abstract input cannot be represented exactly as a subnormal quadruple- 4052 | precision floating-point number. 4053 | The input significand must be normalized or smaller. If the input 4054 | significand is not normalized, `zExp' must be 0; in that case, the result 4055 | returned is a subnormal number, and it must not require rounding. In the 4056 | usual case that the input significand is normalized, `zExp' must be 1 less 4057 | than the ``true'' floating-point exponent. The handling of underflow and 4058 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4059 *----------------------------------------------------------------------------*/ 4060 4061 static float128 roundAndPackFloat128(flag zSign, int32_t zExp, 4062 uint64_t zSig0, uint64_t zSig1, 4063 uint64_t zSig2, float_status *status) 4064 { 4065 int8_t roundingMode; 4066 flag roundNearestEven, increment, isTiny; 4067 4068 roundingMode = status->float_rounding_mode; 4069 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4070 switch (roundingMode) { 4071 case float_round_nearest_even: 4072 case float_round_ties_away: 4073 increment = ((int64_t)zSig2 < 0); 4074 break; 4075 case float_round_to_zero: 4076 increment = 0; 4077 break; 4078 case float_round_up: 4079 increment = !zSign && zSig2; 4080 break; 4081 case float_round_down: 4082 increment = zSign && zSig2; 4083 break; 4084 case float_round_to_odd: 4085 increment = !(zSig1 & 0x1) && zSig2; 4086 break; 4087 default: 4088 abort(); 4089 } 4090 if ( 0x7FFD <= (uint32_t) zExp ) { 4091 if ( ( 0x7FFD < zExp ) 4092 || ( ( zExp == 0x7FFD ) 4093 && eq128( 4094 LIT64( 0x0001FFFFFFFFFFFF ), 4095 LIT64( 0xFFFFFFFFFFFFFFFF ), 4096 zSig0, 4097 zSig1 4098 ) 4099 && increment 4100 ) 4101 ) { 4102 float_raise(float_flag_overflow | float_flag_inexact, status); 4103 if ( ( roundingMode == float_round_to_zero ) 4104 || ( zSign && ( roundingMode == float_round_up ) ) 4105 || ( ! zSign && ( roundingMode == float_round_down ) ) 4106 || (roundingMode == float_round_to_odd) 4107 ) { 4108 return 4109 packFloat128( 4110 zSign, 4111 0x7FFE, 4112 LIT64( 0x0000FFFFFFFFFFFF ), 4113 LIT64( 0xFFFFFFFFFFFFFFFF ) 4114 ); 4115 } 4116 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4117 } 4118 if ( zExp < 0 ) { 4119 if (status->flush_to_zero) { 4120 float_raise(float_flag_output_denormal, status); 4121 return packFloat128(zSign, 0, 0, 0); 4122 } 4123 isTiny = 4124 (status->float_detect_tininess 4125 == float_tininess_before_rounding) 4126 || ( zExp < -1 ) 4127 || ! 
increment 4128 || lt128( 4129 zSig0, 4130 zSig1, 4131 LIT64( 0x0001FFFFFFFFFFFF ), 4132 LIT64( 0xFFFFFFFFFFFFFFFF ) 4133 ); 4134 shift128ExtraRightJamming( 4135 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 4136 zExp = 0; 4137 if (isTiny && zSig2) { 4138 float_raise(float_flag_underflow, status); 4139 } 4140 switch (roundingMode) { 4141 case float_round_nearest_even: 4142 case float_round_ties_away: 4143 increment = ((int64_t)zSig2 < 0); 4144 break; 4145 case float_round_to_zero: 4146 increment = 0; 4147 break; 4148 case float_round_up: 4149 increment = !zSign && zSig2; 4150 break; 4151 case float_round_down: 4152 increment = zSign && zSig2; 4153 break; 4154 case float_round_to_odd: 4155 increment = !(zSig1 & 0x1) && zSig2; 4156 break; 4157 default: 4158 abort(); 4159 } 4160 } 4161 } 4162 if (zSig2) { 4163 status->float_exception_flags |= float_flag_inexact; 4164 } 4165 if ( increment ) { 4166 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 4167 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 4168 } 4169 else { 4170 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 4171 } 4172 return packFloat128( zSign, zExp, zSig0, zSig1 ); 4173 4174 } 4175 4176 /*---------------------------------------------------------------------------- 4177 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4178 | and significand formed by the concatenation of `zSig0' and `zSig1', and 4179 | returns the proper quadruple-precision floating-point value corresponding 4180 | to the abstract input. This routine is just like `roundAndPackFloat128' 4181 | except that the input significand has fewer bits and does not have to be 4182 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 4183 | point exponent. 
4184 *----------------------------------------------------------------------------*/ 4185 4186 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 4187 uint64_t zSig0, uint64_t zSig1, 4188 float_status *status) 4189 { 4190 int8_t shiftCount; 4191 uint64_t zSig2; 4192 4193 if ( zSig0 == 0 ) { 4194 zSig0 = zSig1; 4195 zSig1 = 0; 4196 zExp -= 64; 4197 } 4198 shiftCount = clz64(zSig0) - 15; 4199 if ( 0 <= shiftCount ) { 4200 zSig2 = 0; 4201 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4202 } 4203 else { 4204 shift128ExtraRightJamming( 4205 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 4206 } 4207 zExp -= shiftCount; 4208 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 4209 4210 } 4211 4212 4213 /*---------------------------------------------------------------------------- 4214 | Returns the result of converting the 32-bit two's complement integer `a' 4215 | to the extended double-precision floating-point format. The conversion 4216 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4217 | Arithmetic. 4218 *----------------------------------------------------------------------------*/ 4219 4220 floatx80 int32_to_floatx80(int32_t a, float_status *status) 4221 { 4222 flag zSign; 4223 uint32_t absA; 4224 int8_t shiftCount; 4225 uint64_t zSig; 4226 4227 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4228 zSign = ( a < 0 ); 4229 absA = zSign ? - a : a; 4230 shiftCount = clz32(absA) + 32; 4231 zSig = absA; 4232 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 4233 4234 } 4235 4236 /*---------------------------------------------------------------------------- 4237 | Returns the result of converting the 32-bit two's complement integer `a' to 4238 | the quadruple-precision floating-point format. The conversion is performed 4239 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4240 *----------------------------------------------------------------------------*/ 4241 4242 float128 int32_to_float128(int32_t a, float_status *status) 4243 { 4244 flag zSign; 4245 uint32_t absA; 4246 int8_t shiftCount; 4247 uint64_t zSig0; 4248 4249 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4250 zSign = ( a < 0 ); 4251 absA = zSign ? - a : a; 4252 shiftCount = clz32(absA) + 17; 4253 zSig0 = absA; 4254 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 4255 4256 } 4257 4258 /*---------------------------------------------------------------------------- 4259 | Returns the result of converting the 64-bit two's complement integer `a' 4260 | to the extended double-precision floating-point format. The conversion 4261 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4262 | Arithmetic. 4263 *----------------------------------------------------------------------------*/ 4264 4265 floatx80 int64_to_floatx80(int64_t a, float_status *status) 4266 { 4267 flag zSign; 4268 uint64_t absA; 4269 int8_t shiftCount; 4270 4271 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4272 zSign = ( a < 0 ); 4273 absA = zSign ? - a : a; 4274 shiftCount = clz64(absA); 4275 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 4276 4277 } 4278 4279 /*---------------------------------------------------------------------------- 4280 | Returns the result of converting the 64-bit two's complement integer `a' to 4281 | the quadruple-precision floating-point format. The conversion is performed 4282 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4283 *----------------------------------------------------------------------------*/ 4284 4285 float128 int64_to_float128(int64_t a, float_status *status) 4286 { 4287 flag zSign; 4288 uint64_t absA; 4289 int8_t shiftCount; 4290 int32_t zExp; 4291 uint64_t zSig0, zSig1; 4292 4293 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4294 zSign = ( a < 0 ); 4295 absA = zSign ? - a : a; 4296 shiftCount = clz64(absA) + 49; 4297 zExp = 0x406E - shiftCount; 4298 if ( 64 <= shiftCount ) { 4299 zSig1 = 0; 4300 zSig0 = absA; 4301 shiftCount -= 64; 4302 } 4303 else { 4304 zSig1 = absA; 4305 zSig0 = 0; 4306 } 4307 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4308 return packFloat128( zSign, zExp, zSig0, zSig1 ); 4309 4310 } 4311 4312 /*---------------------------------------------------------------------------- 4313 | Returns the result of converting the 64-bit unsigned integer `a' 4314 | to the quadruple-precision floating-point format. The conversion is performed 4315 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4316 *----------------------------------------------------------------------------*/ 4317 4318 float128 uint64_to_float128(uint64_t a, float_status *status) 4319 { 4320 if (a == 0) { 4321 return float128_zero; 4322 } 4323 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status); 4324 } 4325 4326 /*---------------------------------------------------------------------------- 4327 | Returns the result of converting the single-precision floating-point value 4328 | `a' to the extended double-precision floating-point format. The conversion 4329 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4330 | Arithmetic. 
4331 *----------------------------------------------------------------------------*/ 4332 4333 floatx80 float32_to_floatx80(float32 a, float_status *status) 4334 { 4335 flag aSign; 4336 int aExp; 4337 uint32_t aSig; 4338 4339 a = float32_squash_input_denormal(a, status); 4340 aSig = extractFloat32Frac( a ); 4341 aExp = extractFloat32Exp( a ); 4342 aSign = extractFloat32Sign( a ); 4343 if ( aExp == 0xFF ) { 4344 if (aSig) { 4345 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 4346 } 4347 return packFloatx80(aSign, 4348 floatx80_infinity_high, 4349 floatx80_infinity_low); 4350 } 4351 if ( aExp == 0 ) { 4352 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4353 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4354 } 4355 aSig |= 0x00800000; 4356 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 4357 4358 } 4359 4360 /*---------------------------------------------------------------------------- 4361 | Returns the result of converting the single-precision floating-point value 4362 | `a' to the double-precision floating-point format. The conversion is 4363 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4364 | Arithmetic. 
4365 *----------------------------------------------------------------------------*/ 4366 4367 float128 float32_to_float128(float32 a, float_status *status) 4368 { 4369 flag aSign; 4370 int aExp; 4371 uint32_t aSig; 4372 4373 a = float32_squash_input_denormal(a, status); 4374 aSig = extractFloat32Frac( a ); 4375 aExp = extractFloat32Exp( a ); 4376 aSign = extractFloat32Sign( a ); 4377 if ( aExp == 0xFF ) { 4378 if (aSig) { 4379 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 4380 } 4381 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4382 } 4383 if ( aExp == 0 ) { 4384 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4385 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4386 --aExp; 4387 } 4388 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 4389 4390 } 4391 4392 /*---------------------------------------------------------------------------- 4393 | Returns the remainder of the single-precision floating-point value `a' 4394 | with respect to the corresponding value `b'. The operation is performed 4395 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4396 *----------------------------------------------------------------------------*/ 4397 4398 float32 float32_rem(float32 a, float32 b, float_status *status) 4399 { 4400 flag aSign, zSign; 4401 int aExp, bExp, expDiff; 4402 uint32_t aSig, bSig; 4403 uint32_t q; 4404 uint64_t aSig64, bSig64, q64; 4405 uint32_t alternateASig; 4406 int32_t sigMean; 4407 a = float32_squash_input_denormal(a, status); 4408 b = float32_squash_input_denormal(b, status); 4409 4410 aSig = extractFloat32Frac( a ); 4411 aExp = extractFloat32Exp( a ); 4412 aSign = extractFloat32Sign( a ); 4413 bSig = extractFloat32Frac( b ); 4414 bExp = extractFloat32Exp( b ); 4415 if ( aExp == 0xFF ) { 4416 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 4417 return propagateFloat32NaN(a, b, status); 4418 } 4419 float_raise(float_flag_invalid, status); 4420 return float32_default_nan(status); 4421 } 4422 if ( bExp == 0xFF ) { 4423 if (bSig) { 4424 return propagateFloat32NaN(a, b, status); 4425 } 4426 return a; 4427 } 4428 if ( bExp == 0 ) { 4429 if ( bSig == 0 ) { 4430 float_raise(float_flag_invalid, status); 4431 return float32_default_nan(status); 4432 } 4433 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 4434 } 4435 if ( aExp == 0 ) { 4436 if ( aSig == 0 ) return a; 4437 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4438 } 4439 expDiff = aExp - bExp; 4440 aSig |= 0x00800000; 4441 bSig |= 0x00800000; 4442 if ( expDiff < 32 ) { 4443 aSig <<= 8; 4444 bSig <<= 8; 4445 if ( expDiff < 0 ) { 4446 if ( expDiff < -1 ) return a; 4447 aSig >>= 1; 4448 } 4449 q = ( bSig <= aSig ); 4450 if ( q ) aSig -= bSig; 4451 if ( 0 < expDiff ) { 4452 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 4453 q >>= 32 - expDiff; 4454 bSig >>= 2; 4455 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4456 } 4457 else { 4458 aSig >>= 2; 4459 bSig >>= 2; 4460 } 4461 } 4462 else { 4463 if ( bSig <= aSig ) aSig -= bSig; 4464 aSig64 = ( (uint64_t) aSig )<<40; 4465 bSig64 = ( (uint64_t) bSig )<<40; 4466 expDiff -= 64; 4467 while ( 0 < expDiff ) { 
4468 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 4469 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 4470 aSig64 = - ( ( bSig * q64 )<<38 ); 4471 expDiff -= 62; 4472 } 4473 expDiff += 64; 4474 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 4475 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 4476 q = q64>>( 64 - expDiff ); 4477 bSig <<= 6; 4478 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 4479 } 4480 do { 4481 alternateASig = aSig; 4482 ++q; 4483 aSig -= bSig; 4484 } while ( 0 <= (int32_t) aSig ); 4485 sigMean = aSig + alternateASig; 4486 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4487 aSig = alternateASig; 4488 } 4489 zSign = ( (int32_t) aSig < 0 ); 4490 if ( zSign ) aSig = - aSig; 4491 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 4492 } 4493 4494 4495 4496 /*---------------------------------------------------------------------------- 4497 | Returns the binary exponential of the single-precision floating-point value 4498 | `a'. The operation is performed according to the IEC/IEEE Standard for 4499 | Binary Floating-Point Arithmetic. 4500 | 4501 | Uses the following identities: 4502 | 4503 | 1. ------------------------------------------------------------------------- 4504 | x x*ln(2) 4505 | 2 = e 4506 | 4507 | 2. ------------------------------------------------------------------------- 4508 | 2 3 4 5 n 4509 | x x x x x x x 4510 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 4511 | 1! 2! 3! 4! 5! n! 
4512 *----------------------------------------------------------------------------*/ 4513 4514 static const float64 float32_exp2_coefficients[15] = 4515 { 4516 const_float64( 0x3ff0000000000000ll ), /* 1 */ 4517 const_float64( 0x3fe0000000000000ll ), /* 2 */ 4518 const_float64( 0x3fc5555555555555ll ), /* 3 */ 4519 const_float64( 0x3fa5555555555555ll ), /* 4 */ 4520 const_float64( 0x3f81111111111111ll ), /* 5 */ 4521 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 4522 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 4523 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 4524 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 4525 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 4526 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 4527 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 4528 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 4529 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 4530 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 4531 }; 4532 4533 float32 float32_exp2(float32 a, float_status *status) 4534 { 4535 flag aSign; 4536 int aExp; 4537 uint32_t aSig; 4538 float64 r, x, xn; 4539 int i; 4540 a = float32_squash_input_denormal(a, status); 4541 4542 aSig = extractFloat32Frac( a ); 4543 aExp = extractFloat32Exp( a ); 4544 aSign = extractFloat32Sign( a ); 4545 4546 if ( aExp == 0xFF) { 4547 if (aSig) { 4548 return propagateFloat32NaN(a, float32_zero, status); 4549 } 4550 return (aSign) ? 
float32_zero : a; 4551 } 4552 if (aExp == 0) { 4553 if (aSig == 0) return float32_one; 4554 } 4555 4556 float_raise(float_flag_inexact, status); 4557 4558 /* ******************************* */ 4559 /* using float64 for approximation */ 4560 /* ******************************* */ 4561 x = float32_to_float64(a, status); 4562 x = float64_mul(x, float64_ln2, status); 4563 4564 xn = x; 4565 r = float64_one; 4566 for (i = 0 ; i < 15 ; i++) { 4567 float64 f; 4568 4569 f = float64_mul(xn, float32_exp2_coefficients[i], status); 4570 r = float64_add(r, f, status); 4571 4572 xn = float64_mul(xn, x, status); 4573 } 4574 4575 return float64_to_float32(r, status); 4576 } 4577 4578 /*---------------------------------------------------------------------------- 4579 | Returns the binary log of the single-precision floating-point value `a'. 4580 | The operation is performed according to the IEC/IEEE Standard for Binary 4581 | Floating-Point Arithmetic. 4582 *----------------------------------------------------------------------------*/ 4583 float32 float32_log2(float32 a, float_status *status) 4584 { 4585 flag aSign, zSign; 4586 int aExp; 4587 uint32_t aSig, zSig, i; 4588 4589 a = float32_squash_input_denormal(a, status); 4590 aSig = extractFloat32Frac( a ); 4591 aExp = extractFloat32Exp( a ); 4592 aSign = extractFloat32Sign( a ); 4593 4594 if ( aExp == 0 ) { 4595 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 4596 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4597 } 4598 if ( aSign ) { 4599 float_raise(float_flag_invalid, status); 4600 return float32_default_nan(status); 4601 } 4602 if ( aExp == 0xFF ) { 4603 if (aSig) { 4604 return propagateFloat32NaN(a, float32_zero, status); 4605 } 4606 return a; 4607 } 4608 4609 aExp -= 0x7F; 4610 aSig |= 0x00800000; 4611 zSign = aExp < 0; 4612 zSig = aExp << 23; 4613 4614 for (i = 1 << 22; i > 0; i >>= 1) { 4615 aSig = ( (uint64_t)aSig * aSig ) >> 23; 4616 if ( aSig & 0x01000000 ) { 4617 aSig >>= 1; 4618 zSig |= i; 4619 } 4620 } 4621 
4622 if ( zSign ) 4623 zSig = -zSig; 4624 4625 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 4626 } 4627 4628 /*---------------------------------------------------------------------------- 4629 | Returns 1 if the single-precision floating-point value `a' is equal to 4630 | the corresponding value `b', and 0 otherwise. The invalid exception is 4631 | raised if either operand is a NaN. Otherwise, the comparison is performed 4632 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4633 *----------------------------------------------------------------------------*/ 4634 4635 int float32_eq(float32 a, float32 b, float_status *status) 4636 { 4637 uint32_t av, bv; 4638 a = float32_squash_input_denormal(a, status); 4639 b = float32_squash_input_denormal(b, status); 4640 4641 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4642 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4643 ) { 4644 float_raise(float_flag_invalid, status); 4645 return 0; 4646 } 4647 av = float32_val(a); 4648 bv = float32_val(b); 4649 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4650 } 4651 4652 /*---------------------------------------------------------------------------- 4653 | Returns 1 if the single-precision floating-point value `a' is less than 4654 | or equal to the corresponding value `b', and 0 otherwise. The invalid 4655 | exception is raised if either operand is a NaN. The comparison is performed 4656 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4657 *----------------------------------------------------------------------------*/ 4658 4659 int float32_le(float32 a, float32 b, float_status *status) 4660 { 4661 flag aSign, bSign; 4662 uint32_t av, bv; 4663 a = float32_squash_input_denormal(a, status); 4664 b = float32_squash_input_denormal(b, status); 4665 4666 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4667 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4668 ) { 4669 float_raise(float_flag_invalid, status); 4670 return 0; 4671 } 4672 aSign = extractFloat32Sign( a ); 4673 bSign = extractFloat32Sign( b ); 4674 av = float32_val(a); 4675 bv = float32_val(b); 4676 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4677 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4678 4679 } 4680 4681 /*---------------------------------------------------------------------------- 4682 | Returns 1 if the single-precision floating-point value `a' is less than 4683 | the corresponding value `b', and 0 otherwise. The invalid exception is 4684 | raised if either operand is a NaN. The comparison is performed according 4685 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4686 *----------------------------------------------------------------------------*/ 4687 4688 int float32_lt(float32 a, float32 b, float_status *status) 4689 { 4690 flag aSign, bSign; 4691 uint32_t av, bv; 4692 a = float32_squash_input_denormal(a, status); 4693 b = float32_squash_input_denormal(b, status); 4694 4695 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4696 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4697 ) { 4698 float_raise(float_flag_invalid, status); 4699 return 0; 4700 } 4701 aSign = extractFloat32Sign( a ); 4702 bSign = extractFloat32Sign( b ); 4703 av = float32_val(a); 4704 bv = float32_val(b); 4705 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 4706 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4707 4708 } 4709 4710 /*---------------------------------------------------------------------------- 4711 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 4712 | be compared, and 0 otherwise. The invalid exception is raised if either 4713 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4714 | Standard for Binary Floating-Point Arithmetic. 4715 *----------------------------------------------------------------------------*/ 4716 4717 int float32_unordered(float32 a, float32 b, float_status *status) 4718 { 4719 a = float32_squash_input_denormal(a, status); 4720 b = float32_squash_input_denormal(b, status); 4721 4722 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4723 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4724 ) { 4725 float_raise(float_flag_invalid, status); 4726 return 1; 4727 } 4728 return 0; 4729 } 4730 4731 /*---------------------------------------------------------------------------- 4732 | Returns 1 if the single-precision floating-point value `a' is equal to 4733 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4734 | exception. 
The comparison is performed according to the IEC/IEEE Standard 4735 | for Binary Floating-Point Arithmetic. 4736 *----------------------------------------------------------------------------*/ 4737 4738 int float32_eq_quiet(float32 a, float32 b, float_status *status) 4739 { 4740 a = float32_squash_input_denormal(a, status); 4741 b = float32_squash_input_denormal(b, status); 4742 4743 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4744 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4745 ) { 4746 if (float32_is_signaling_nan(a, status) 4747 || float32_is_signaling_nan(b, status)) { 4748 float_raise(float_flag_invalid, status); 4749 } 4750 return 0; 4751 } 4752 return ( float32_val(a) == float32_val(b) ) || 4753 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 4754 } 4755 4756 /*---------------------------------------------------------------------------- 4757 | Returns 1 if the single-precision floating-point value `a' is less than or 4758 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4759 | cause an exception. Otherwise, the comparison is performed according to the 4760 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4761 *----------------------------------------------------------------------------*/ 4762 4763 int float32_le_quiet(float32 a, float32 b, float_status *status) 4764 { 4765 flag aSign, bSign; 4766 uint32_t av, bv; 4767 a = float32_squash_input_denormal(a, status); 4768 b = float32_squash_input_denormal(b, status); 4769 4770 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4771 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4772 ) { 4773 if (float32_is_signaling_nan(a, status) 4774 || float32_is_signaling_nan(b, status)) { 4775 float_raise(float_flag_invalid, status); 4776 } 4777 return 0; 4778 } 4779 aSign = extractFloat32Sign( a ); 4780 bSign = extractFloat32Sign( b ); 4781 av = float32_val(a); 4782 bv = float32_val(b); 4783 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4784 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4785 4786 } 4787 4788 /*---------------------------------------------------------------------------- 4789 | Returns 1 if the single-precision floating-point value `a' is less than 4790 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4791 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4792 | Standard for Binary Floating-Point Arithmetic. 
4793 *----------------------------------------------------------------------------*/ 4794 4795 int float32_lt_quiet(float32 a, float32 b, float_status *status) 4796 { 4797 flag aSign, bSign; 4798 uint32_t av, bv; 4799 a = float32_squash_input_denormal(a, status); 4800 b = float32_squash_input_denormal(b, status); 4801 4802 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4803 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4804 ) { 4805 if (float32_is_signaling_nan(a, status) 4806 || float32_is_signaling_nan(b, status)) { 4807 float_raise(float_flag_invalid, status); 4808 } 4809 return 0; 4810 } 4811 aSign = extractFloat32Sign( a ); 4812 bSign = extractFloat32Sign( b ); 4813 av = float32_val(a); 4814 bv = float32_val(b); 4815 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 4816 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4817 4818 } 4819 4820 /*---------------------------------------------------------------------------- 4821 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 4822 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4823 | comparison is performed according to the IEC/IEEE Standard for Binary 4824 | Floating-Point Arithmetic. 
4825 *----------------------------------------------------------------------------*/ 4826 4827 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 4828 { 4829 a = float32_squash_input_denormal(a, status); 4830 b = float32_squash_input_denormal(b, status); 4831 4832 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4833 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4834 ) { 4835 if (float32_is_signaling_nan(a, status) 4836 || float32_is_signaling_nan(b, status)) { 4837 float_raise(float_flag_invalid, status); 4838 } 4839 return 1; 4840 } 4841 return 0; 4842 } 4843 4844 /*---------------------------------------------------------------------------- 4845 | If `a' is denormal and we are in flush-to-zero mode then set the 4846 | input-denormal exception and return zero. Otherwise just return the value. 4847 *----------------------------------------------------------------------------*/ 4848 float16 float16_squash_input_denormal(float16 a, float_status *status) 4849 { 4850 if (status->flush_inputs_to_zero) { 4851 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) { 4852 float_raise(float_flag_input_denormal, status); 4853 return make_float16(float16_val(a) & 0x8000); 4854 } 4855 } 4856 return a; 4857 } 4858 4859 /*---------------------------------------------------------------------------- 4860 | Returns the result of converting the double-precision floating-point value 4861 | `a' to the extended double-precision floating-point format. The conversion 4862 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4863 | Arithmetic. 
4864 *----------------------------------------------------------------------------*/ 4865 4866 floatx80 float64_to_floatx80(float64 a, float_status *status) 4867 { 4868 flag aSign; 4869 int aExp; 4870 uint64_t aSig; 4871 4872 a = float64_squash_input_denormal(a, status); 4873 aSig = extractFloat64Frac( a ); 4874 aExp = extractFloat64Exp( a ); 4875 aSign = extractFloat64Sign( a ); 4876 if ( aExp == 0x7FF ) { 4877 if (aSig) { 4878 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 4879 } 4880 return packFloatx80(aSign, 4881 floatx80_infinity_high, 4882 floatx80_infinity_low); 4883 } 4884 if ( aExp == 0 ) { 4885 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4886 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4887 } 4888 return 4889 packFloatx80( 4890 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 4891 4892 } 4893 4894 /*---------------------------------------------------------------------------- 4895 | Returns the result of converting the double-precision floating-point value 4896 | `a' to the quadruple-precision floating-point format. The conversion is 4897 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4898 | Arithmetic. 
4899 *----------------------------------------------------------------------------*/ 4900 4901 float128 float64_to_float128(float64 a, float_status *status) 4902 { 4903 flag aSign; 4904 int aExp; 4905 uint64_t aSig, zSig0, zSig1; 4906 4907 a = float64_squash_input_denormal(a, status); 4908 aSig = extractFloat64Frac( a ); 4909 aExp = extractFloat64Exp( a ); 4910 aSign = extractFloat64Sign( a ); 4911 if ( aExp == 0x7FF ) { 4912 if (aSig) { 4913 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 4914 } 4915 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4916 } 4917 if ( aExp == 0 ) { 4918 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4919 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4920 --aExp; 4921 } 4922 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 4923 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 4924 4925 } 4926 4927 4928 /*---------------------------------------------------------------------------- 4929 | Returns the remainder of the double-precision floating-point value `a' 4930 | with respect to the corresponding value `b'. The operation is performed 4931 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4932 *----------------------------------------------------------------------------*/ 4933 4934 float64 float64_rem(float64 a, float64 b, float_status *status) 4935 { 4936 flag aSign, zSign; 4937 int aExp, bExp, expDiff; 4938 uint64_t aSig, bSig; 4939 uint64_t q, alternateASig; 4940 int64_t sigMean; 4941 4942 a = float64_squash_input_denormal(a, status); 4943 b = float64_squash_input_denormal(b, status); 4944 aSig = extractFloat64Frac( a ); 4945 aExp = extractFloat64Exp( a ); 4946 aSign = extractFloat64Sign( a ); 4947 bSig = extractFloat64Frac( b ); 4948 bExp = extractFloat64Exp( b ); 4949 if ( aExp == 0x7FF ) { 4950 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4951 return propagateFloat64NaN(a, b, status); 4952 } 4953 float_raise(float_flag_invalid, status); 4954 return float64_default_nan(status); 4955 } 4956 if ( bExp == 0x7FF ) { 4957 if (bSig) { 4958 return propagateFloat64NaN(a, b, status); 4959 } 4960 return a; 4961 } 4962 if ( bExp == 0 ) { 4963 if ( bSig == 0 ) { 4964 float_raise(float_flag_invalid, status); 4965 return float64_default_nan(status); 4966 } 4967 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4968 } 4969 if ( aExp == 0 ) { 4970 if ( aSig == 0 ) return a; 4971 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4972 } 4973 expDiff = aExp - bExp; 4974 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 4975 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4976 if ( expDiff < 0 ) { 4977 if ( expDiff < -1 ) return a; 4978 aSig >>= 1; 4979 } 4980 q = ( bSig <= aSig ); 4981 if ( q ) aSig -= bSig; 4982 expDiff -= 64; 4983 while ( 0 < expDiff ) { 4984 q = estimateDiv128To64( aSig, 0, bSig ); 4985 q = ( 2 < q ) ? q - 2 : 0; 4986 aSig = - ( ( bSig>>2 ) * q ); 4987 expDiff -= 62; 4988 } 4989 expDiff += 64; 4990 if ( 0 < expDiff ) { 4991 q = estimateDiv128To64( aSig, 0, bSig ); 4992 q = ( 2 < q ) ? 
q - 2 : 0; 4993 q >>= 64 - expDiff; 4994 bSig >>= 2; 4995 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4996 } 4997 else { 4998 aSig >>= 2; 4999 bSig >>= 2; 5000 } 5001 do { 5002 alternateASig = aSig; 5003 ++q; 5004 aSig -= bSig; 5005 } while ( 0 <= (int64_t) aSig ); 5006 sigMean = aSig + alternateASig; 5007 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 5008 aSig = alternateASig; 5009 } 5010 zSign = ( (int64_t) aSig < 0 ); 5011 if ( zSign ) aSig = - aSig; 5012 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 5013 5014 } 5015 5016 /*---------------------------------------------------------------------------- 5017 | Returns the binary log of the double-precision floating-point value `a'. 5018 | The operation is performed according to the IEC/IEEE Standard for Binary 5019 | Floating-Point Arithmetic. 5020 *----------------------------------------------------------------------------*/ 5021 float64 float64_log2(float64 a, float_status *status) 5022 { 5023 flag aSign, zSign; 5024 int aExp; 5025 uint64_t aSig, aSig0, aSig1, zSig, i; 5026 a = float64_squash_input_denormal(a, status); 5027 5028 aSig = extractFloat64Frac( a ); 5029 aExp = extractFloat64Exp( a ); 5030 aSign = extractFloat64Sign( a ); 5031 5032 if ( aExp == 0 ) { 5033 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 5034 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5035 } 5036 if ( aSign ) { 5037 float_raise(float_flag_invalid, status); 5038 return float64_default_nan(status); 5039 } 5040 if ( aExp == 0x7FF ) { 5041 if (aSig) { 5042 return propagateFloat64NaN(a, float64_zero, status); 5043 } 5044 return a; 5045 } 5046 5047 aExp -= 0x3FF; 5048 aSig |= LIT64( 0x0010000000000000 ); 5049 zSign = aExp < 0; 5050 zSig = (uint64_t)aExp << 52; 5051 for (i = 1LL << 51; i > 0; i >>= 1) { 5052 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 5053 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 5054 if ( aSig & LIT64( 0x0020000000000000 ) ) { 5055 aSig >>= 1; 5056 zSig |= i; 5057 } 
5058 } 5059 5060 if ( zSign ) 5061 zSig = -zSig; 5062 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 5063 } 5064 5065 /*---------------------------------------------------------------------------- 5066 | Returns 1 if the double-precision floating-point value `a' is equal to the 5067 | corresponding value `b', and 0 otherwise. The invalid exception is raised 5068 | if either operand is a NaN. Otherwise, the comparison is performed 5069 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5070 *----------------------------------------------------------------------------*/ 5071 5072 int float64_eq(float64 a, float64 b, float_status *status) 5073 { 5074 uint64_t av, bv; 5075 a = float64_squash_input_denormal(a, status); 5076 b = float64_squash_input_denormal(b, status); 5077 5078 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5079 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5080 ) { 5081 float_raise(float_flag_invalid, status); 5082 return 0; 5083 } 5084 av = float64_val(a); 5085 bv = float64_val(b); 5086 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 5087 5088 } 5089 5090 /*---------------------------------------------------------------------------- 5091 | Returns 1 if the double-precision floating-point value `a' is less than or 5092 | equal to the corresponding value `b', and 0 otherwise. The invalid 5093 | exception is raised if either operand is a NaN. The comparison is performed 5094 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5095 *----------------------------------------------------------------------------*/ 5096 5097 int float64_le(float64 a, float64 b, float_status *status) 5098 { 5099 flag aSign, bSign; 5100 uint64_t av, bv; 5101 a = float64_squash_input_denormal(a, status); 5102 b = float64_squash_input_denormal(b, status); 5103 5104 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5105 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5106 ) { 5107 float_raise(float_flag_invalid, status); 5108 return 0; 5109 } 5110 aSign = extractFloat64Sign( a ); 5111 bSign = extractFloat64Sign( b ); 5112 av = float64_val(a); 5113 bv = float64_val(b); 5114 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 5115 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 5116 5117 } 5118 5119 /*---------------------------------------------------------------------------- 5120 | Returns 1 if the double-precision floating-point value `a' is less than 5121 | the corresponding value `b', and 0 otherwise. The invalid exception is 5122 | raised if either operand is a NaN. The comparison is performed according 5123 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5124 *----------------------------------------------------------------------------*/ 5125 5126 int float64_lt(float64 a, float64 b, float_status *status) 5127 { 5128 flag aSign, bSign; 5129 uint64_t av, bv; 5130 5131 a = float64_squash_input_denormal(a, status); 5132 b = float64_squash_input_denormal(b, status); 5133 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5134 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5135 ) { 5136 float_raise(float_flag_invalid, status); 5137 return 0; 5138 } 5139 aSign = extractFloat64Sign( a ); 5140 bSign = extractFloat64Sign( b ); 5141 av = float64_val(a); 5142 bv = float64_val(b); 5143 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 5144 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 5145 5146 } 5147 5148 /*---------------------------------------------------------------------------- 5149 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 5150 | be compared, and 0 otherwise. The invalid exception is raised if either 5151 | operand is a NaN. The comparison is performed according to the IEC/IEEE 5152 | Standard for Binary Floating-Point Arithmetic. 5153 *----------------------------------------------------------------------------*/ 5154 5155 int float64_unordered(float64 a, float64 b, float_status *status) 5156 { 5157 a = float64_squash_input_denormal(a, status); 5158 b = float64_squash_input_denormal(b, status); 5159 5160 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5161 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5162 ) { 5163 float_raise(float_flag_invalid, status); 5164 return 1; 5165 } 5166 return 0; 5167 } 5168 5169 /*---------------------------------------------------------------------------- 5170 | Returns 1 if the double-precision floating-point value `a' is equal to the 5171 | corresponding value `b', and 0 otherwise. 
Quiet NaNs do not cause an 5172 | exception.The comparison is performed according to the IEC/IEEE Standard 5173 | for Binary Floating-Point Arithmetic. 5174 *----------------------------------------------------------------------------*/ 5175 5176 int float64_eq_quiet(float64 a, float64 b, float_status *status) 5177 { 5178 uint64_t av, bv; 5179 a = float64_squash_input_denormal(a, status); 5180 b = float64_squash_input_denormal(b, status); 5181 5182 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5183 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5184 ) { 5185 if (float64_is_signaling_nan(a, status) 5186 || float64_is_signaling_nan(b, status)) { 5187 float_raise(float_flag_invalid, status); 5188 } 5189 return 0; 5190 } 5191 av = float64_val(a); 5192 bv = float64_val(b); 5193 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 5194 5195 } 5196 5197 /*---------------------------------------------------------------------------- 5198 | Returns 1 if the double-precision floating-point value `a' is less than or 5199 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5200 | cause an exception. Otherwise, the comparison is performed according to the 5201 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5202 *----------------------------------------------------------------------------*/ 5203 5204 int float64_le_quiet(float64 a, float64 b, float_status *status) 5205 { 5206 flag aSign, bSign; 5207 uint64_t av, bv; 5208 a = float64_squash_input_denormal(a, status); 5209 b = float64_squash_input_denormal(b, status); 5210 5211 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5212 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5213 ) { 5214 if (float64_is_signaling_nan(a, status) 5215 || float64_is_signaling_nan(b, status)) { 5216 float_raise(float_flag_invalid, status); 5217 } 5218 return 0; 5219 } 5220 aSign = extractFloat64Sign( a ); 5221 bSign = extractFloat64Sign( b ); 5222 av = float64_val(a); 5223 bv = float64_val(b); 5224 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 5225 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 5226 5227 } 5228 5229 /*---------------------------------------------------------------------------- 5230 | Returns 1 if the double-precision floating-point value `a' is less than 5231 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5232 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 5233 | Standard for Binary Floating-Point Arithmetic. 
5234 *----------------------------------------------------------------------------*/ 5235 5236 int float64_lt_quiet(float64 a, float64 b, float_status *status) 5237 { 5238 flag aSign, bSign; 5239 uint64_t av, bv; 5240 a = float64_squash_input_denormal(a, status); 5241 b = float64_squash_input_denormal(b, status); 5242 5243 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5244 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5245 ) { 5246 if (float64_is_signaling_nan(a, status) 5247 || float64_is_signaling_nan(b, status)) { 5248 float_raise(float_flag_invalid, status); 5249 } 5250 return 0; 5251 } 5252 aSign = extractFloat64Sign( a ); 5253 bSign = extractFloat64Sign( b ); 5254 av = float64_val(a); 5255 bv = float64_val(b); 5256 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 5257 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 5258 5259 } 5260 5261 /*---------------------------------------------------------------------------- 5262 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 5263 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 5264 | comparison is performed according to the IEC/IEEE Standard for Binary 5265 | Floating-Point Arithmetic. 
5266 *----------------------------------------------------------------------------*/ 5267 5268 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 5269 { 5270 a = float64_squash_input_denormal(a, status); 5271 b = float64_squash_input_denormal(b, status); 5272 5273 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5274 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5275 ) { 5276 if (float64_is_signaling_nan(a, status) 5277 || float64_is_signaling_nan(b, status)) { 5278 float_raise(float_flag_invalid, status); 5279 } 5280 return 1; 5281 } 5282 return 0; 5283 } 5284 5285 /*---------------------------------------------------------------------------- 5286 | Returns the result of converting the extended double-precision floating- 5287 | point value `a' to the 32-bit two's complement integer format. The 5288 | conversion is performed according to the IEC/IEEE Standard for Binary 5289 | Floating-Point Arithmetic---which means in particular that the conversion 5290 | is rounded according to the current rounding mode. If `a' is a NaN, the 5291 | largest positive integer is returned. Otherwise, if the conversion 5292 | overflows, the largest integer with the same sign as `a' is returned. 
5293 *----------------------------------------------------------------------------*/ 5294 5295 int32_t floatx80_to_int32(floatx80 a, float_status *status) 5296 { 5297 flag aSign; 5298 int32_t aExp, shiftCount; 5299 uint64_t aSig; 5300 5301 if (floatx80_invalid_encoding(a)) { 5302 float_raise(float_flag_invalid, status); 5303 return 1 << 31; 5304 } 5305 aSig = extractFloatx80Frac( a ); 5306 aExp = extractFloatx80Exp( a ); 5307 aSign = extractFloatx80Sign( a ); 5308 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5309 shiftCount = 0x4037 - aExp; 5310 if ( shiftCount <= 0 ) shiftCount = 1; 5311 shift64RightJamming( aSig, shiftCount, &aSig ); 5312 return roundAndPackInt32(aSign, aSig, status); 5313 5314 } 5315 5316 /*---------------------------------------------------------------------------- 5317 | Returns the result of converting the extended double-precision floating- 5318 | point value `a' to the 32-bit two's complement integer format. The 5319 | conversion is performed according to the IEC/IEEE Standard for Binary 5320 | Floating-Point Arithmetic, except that the conversion is always rounded 5321 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5322 | Otherwise, if the conversion overflows, the largest integer with the same 5323 | sign as `a' is returned. 
5324 *----------------------------------------------------------------------------*/ 5325 5326 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 5327 { 5328 flag aSign; 5329 int32_t aExp, shiftCount; 5330 uint64_t aSig, savedASig; 5331 int32_t z; 5332 5333 if (floatx80_invalid_encoding(a)) { 5334 float_raise(float_flag_invalid, status); 5335 return 1 << 31; 5336 } 5337 aSig = extractFloatx80Frac( a ); 5338 aExp = extractFloatx80Exp( a ); 5339 aSign = extractFloatx80Sign( a ); 5340 if ( 0x401E < aExp ) { 5341 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5342 goto invalid; 5343 } 5344 else if ( aExp < 0x3FFF ) { 5345 if (aExp || aSig) { 5346 status->float_exception_flags |= float_flag_inexact; 5347 } 5348 return 0; 5349 } 5350 shiftCount = 0x403E - aExp; 5351 savedASig = aSig; 5352 aSig >>= shiftCount; 5353 z = aSig; 5354 if ( aSign ) z = - z; 5355 if ( ( z < 0 ) ^ aSign ) { 5356 invalid: 5357 float_raise(float_flag_invalid, status); 5358 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5359 } 5360 if ( ( aSig<<shiftCount ) != savedASig ) { 5361 status->float_exception_flags |= float_flag_inexact; 5362 } 5363 return z; 5364 5365 } 5366 5367 /*---------------------------------------------------------------------------- 5368 | Returns the result of converting the extended double-precision floating- 5369 | point value `a' to the 64-bit two's complement integer format. The 5370 | conversion is performed according to the IEC/IEEE Standard for Binary 5371 | Floating-Point Arithmetic---which means in particular that the conversion 5372 | is rounded according to the current rounding mode. If `a' is a NaN, 5373 | the largest positive integer is returned. Otherwise, if the conversion 5374 | overflows, the largest integer with the same sign as `a' is returned. 
5375 *----------------------------------------------------------------------------*/ 5376 5377 int64_t floatx80_to_int64(floatx80 a, float_status *status) 5378 { 5379 flag aSign; 5380 int32_t aExp, shiftCount; 5381 uint64_t aSig, aSigExtra; 5382 5383 if (floatx80_invalid_encoding(a)) { 5384 float_raise(float_flag_invalid, status); 5385 return 1ULL << 63; 5386 } 5387 aSig = extractFloatx80Frac( a ); 5388 aExp = extractFloatx80Exp( a ); 5389 aSign = extractFloatx80Sign( a ); 5390 shiftCount = 0x403E - aExp; 5391 if ( shiftCount <= 0 ) { 5392 if ( shiftCount ) { 5393 float_raise(float_flag_invalid, status); 5394 if (!aSign || floatx80_is_any_nan(a)) { 5395 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5396 } 5397 return (int64_t) LIT64( 0x8000000000000000 ); 5398 } 5399 aSigExtra = 0; 5400 } 5401 else { 5402 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 5403 } 5404 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 5405 5406 } 5407 5408 /*---------------------------------------------------------------------------- 5409 | Returns the result of converting the extended double-precision floating- 5410 | point value `a' to the 64-bit two's complement integer format. The 5411 | conversion is performed according to the IEC/IEEE Standard for Binary 5412 | Floating-Point Arithmetic, except that the conversion is always rounded 5413 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5414 | Otherwise, if the conversion overflows, the largest integer with the same 5415 | sign as `a' is returned. 
5416 *----------------------------------------------------------------------------*/ 5417 5418 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 5419 { 5420 flag aSign; 5421 int32_t aExp, shiftCount; 5422 uint64_t aSig; 5423 int64_t z; 5424 5425 if (floatx80_invalid_encoding(a)) { 5426 float_raise(float_flag_invalid, status); 5427 return 1ULL << 63; 5428 } 5429 aSig = extractFloatx80Frac( a ); 5430 aExp = extractFloatx80Exp( a ); 5431 aSign = extractFloatx80Sign( a ); 5432 shiftCount = aExp - 0x403E; 5433 if ( 0 <= shiftCount ) { 5434 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 5435 if ( ( a.high != 0xC03E ) || aSig ) { 5436 float_raise(float_flag_invalid, status); 5437 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 5438 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5439 } 5440 } 5441 return (int64_t) LIT64( 0x8000000000000000 ); 5442 } 5443 else if ( aExp < 0x3FFF ) { 5444 if (aExp | aSig) { 5445 status->float_exception_flags |= float_flag_inexact; 5446 } 5447 return 0; 5448 } 5449 z = aSig>>( - shiftCount ); 5450 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 5451 status->float_exception_flags |= float_flag_inexact; 5452 } 5453 if ( aSign ) z = - z; 5454 return z; 5455 5456 } 5457 5458 /*---------------------------------------------------------------------------- 5459 | Returns the result of converting the extended double-precision floating- 5460 | point value `a' to the single-precision floating-point format. The 5461 | conversion is performed according to the IEC/IEEE Standard for Binary 5462 | Floating-Point Arithmetic. 
5463 *----------------------------------------------------------------------------*/ 5464 5465 float32 floatx80_to_float32(floatx80 a, float_status *status) 5466 { 5467 flag aSign; 5468 int32_t aExp; 5469 uint64_t aSig; 5470 5471 if (floatx80_invalid_encoding(a)) { 5472 float_raise(float_flag_invalid, status); 5473 return float32_default_nan(status); 5474 } 5475 aSig = extractFloatx80Frac( a ); 5476 aExp = extractFloatx80Exp( a ); 5477 aSign = extractFloatx80Sign( a ); 5478 if ( aExp == 0x7FFF ) { 5479 if ( (uint64_t) ( aSig<<1 ) ) { 5480 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 5481 } 5482 return packFloat32( aSign, 0xFF, 0 ); 5483 } 5484 shift64RightJamming( aSig, 33, &aSig ); 5485 if ( aExp || aSig ) aExp -= 0x3F81; 5486 return roundAndPackFloat32(aSign, aExp, aSig, status); 5487 5488 } 5489 5490 /*---------------------------------------------------------------------------- 5491 | Returns the result of converting the extended double-precision floating- 5492 | point value `a' to the double-precision floating-point format. The 5493 | conversion is performed according to the IEC/IEEE Standard for Binary 5494 | Floating-Point Arithmetic. 
5495 *----------------------------------------------------------------------------*/ 5496 5497 float64 floatx80_to_float64(floatx80 a, float_status *status) 5498 { 5499 flag aSign; 5500 int32_t aExp; 5501 uint64_t aSig, zSig; 5502 5503 if (floatx80_invalid_encoding(a)) { 5504 float_raise(float_flag_invalid, status); 5505 return float64_default_nan(status); 5506 } 5507 aSig = extractFloatx80Frac( a ); 5508 aExp = extractFloatx80Exp( a ); 5509 aSign = extractFloatx80Sign( a ); 5510 if ( aExp == 0x7FFF ) { 5511 if ( (uint64_t) ( aSig<<1 ) ) { 5512 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 5513 } 5514 return packFloat64( aSign, 0x7FF, 0 ); 5515 } 5516 shift64RightJamming( aSig, 1, &zSig ); 5517 if ( aExp || aSig ) aExp -= 0x3C01; 5518 return roundAndPackFloat64(aSign, aExp, zSig, status); 5519 5520 } 5521 5522 /*---------------------------------------------------------------------------- 5523 | Returns the result of converting the extended double-precision floating- 5524 | point value `a' to the quadruple-precision floating-point format. The 5525 | conversion is performed according to the IEC/IEEE Standard for Binary 5526 | Floating-Point Arithmetic. 
5527 *----------------------------------------------------------------------------*/ 5528 5529 float128 floatx80_to_float128(floatx80 a, float_status *status) 5530 { 5531 flag aSign; 5532 int aExp; 5533 uint64_t aSig, zSig0, zSig1; 5534 5535 if (floatx80_invalid_encoding(a)) { 5536 float_raise(float_flag_invalid, status); 5537 return float128_default_nan(status); 5538 } 5539 aSig = extractFloatx80Frac( a ); 5540 aExp = extractFloatx80Exp( a ); 5541 aSign = extractFloatx80Sign( a ); 5542 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5543 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 5544 } 5545 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5546 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5547 5548 } 5549 5550 /*---------------------------------------------------------------------------- 5551 | Rounds the extended double-precision floating-point value `a' 5552 | to the precision provided by floatx80_rounding_precision and returns the 5553 | result as an extended double-precision floating-point value. 5554 | The operation is performed according to the IEC/IEEE Standard for Binary 5555 | Floating-Point Arithmetic. 5556 *----------------------------------------------------------------------------*/ 5557 5558 floatx80 floatx80_round(floatx80 a, float_status *status) 5559 { 5560 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5561 extractFloatx80Sign(a), 5562 extractFloatx80Exp(a), 5563 extractFloatx80Frac(a), 0, status); 5564 } 5565 5566 /*---------------------------------------------------------------------------- 5567 | Rounds the extended double-precision floating-point value `a' to an integer, 5568 | and returns the result as an extended quadruple-precision floating-point 5569 | value. The operation is performed according to the IEC/IEEE Standard for 5570 | Binary Floating-Point Arithmetic. 
5571 *----------------------------------------------------------------------------*/ 5572 5573 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 5574 { 5575 flag aSign; 5576 int32_t aExp; 5577 uint64_t lastBitMask, roundBitsMask; 5578 floatx80 z; 5579 5580 if (floatx80_invalid_encoding(a)) { 5581 float_raise(float_flag_invalid, status); 5582 return floatx80_default_nan(status); 5583 } 5584 aExp = extractFloatx80Exp( a ); 5585 if ( 0x403E <= aExp ) { 5586 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 5587 return propagateFloatx80NaN(a, a, status); 5588 } 5589 return a; 5590 } 5591 if ( aExp < 0x3FFF ) { 5592 if ( ( aExp == 0 ) 5593 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 5594 return a; 5595 } 5596 status->float_exception_flags |= float_flag_inexact; 5597 aSign = extractFloatx80Sign( a ); 5598 switch (status->float_rounding_mode) { 5599 case float_round_nearest_even: 5600 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 5601 ) { 5602 return 5603 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5604 } 5605 break; 5606 case float_round_ties_away: 5607 if (aExp == 0x3FFE) { 5608 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 5609 } 5610 break; 5611 case float_round_down: 5612 return 5613 aSign ? 5614 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 5615 : packFloatx80( 0, 0, 0 ); 5616 case float_round_up: 5617 return 5618 aSign ? 
packFloatx80( 1, 0, 0 ) 5619 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5620 } 5621 return packFloatx80( aSign, 0, 0 ); 5622 } 5623 lastBitMask = 1; 5624 lastBitMask <<= 0x403E - aExp; 5625 roundBitsMask = lastBitMask - 1; 5626 z = a; 5627 switch (status->float_rounding_mode) { 5628 case float_round_nearest_even: 5629 z.low += lastBitMask>>1; 5630 if ((z.low & roundBitsMask) == 0) { 5631 z.low &= ~lastBitMask; 5632 } 5633 break; 5634 case float_round_ties_away: 5635 z.low += lastBitMask >> 1; 5636 break; 5637 case float_round_to_zero: 5638 break; 5639 case float_round_up: 5640 if (!extractFloatx80Sign(z)) { 5641 z.low += roundBitsMask; 5642 } 5643 break; 5644 case float_round_down: 5645 if (extractFloatx80Sign(z)) { 5646 z.low += roundBitsMask; 5647 } 5648 break; 5649 default: 5650 abort(); 5651 } 5652 z.low &= ~ roundBitsMask; 5653 if ( z.low == 0 ) { 5654 ++z.high; 5655 z.low = LIT64( 0x8000000000000000 ); 5656 } 5657 if (z.low != a.low) { 5658 status->float_exception_flags |= float_flag_inexact; 5659 } 5660 return z; 5661 5662 } 5663 5664 /*---------------------------------------------------------------------------- 5665 | Returns the result of adding the absolute values of the extended double- 5666 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 5667 | negated before being returned. `zSign' is ignored if the result is a NaN. 5668 | The addition is performed according to the IEC/IEEE Standard for Binary 5669 | Floating-Point Arithmetic. 
5670 *----------------------------------------------------------------------------*/ 5671 5672 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5673 float_status *status) 5674 { 5675 int32_t aExp, bExp, zExp; 5676 uint64_t aSig, bSig, zSig0, zSig1; 5677 int32_t expDiff; 5678 5679 aSig = extractFloatx80Frac( a ); 5680 aExp = extractFloatx80Exp( a ); 5681 bSig = extractFloatx80Frac( b ); 5682 bExp = extractFloatx80Exp( b ); 5683 expDiff = aExp - bExp; 5684 if ( 0 < expDiff ) { 5685 if ( aExp == 0x7FFF ) { 5686 if ((uint64_t)(aSig << 1)) { 5687 return propagateFloatx80NaN(a, b, status); 5688 } 5689 return a; 5690 } 5691 if ( bExp == 0 ) --expDiff; 5692 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5693 zExp = aExp; 5694 } 5695 else if ( expDiff < 0 ) { 5696 if ( bExp == 0x7FFF ) { 5697 if ((uint64_t)(bSig << 1)) { 5698 return propagateFloatx80NaN(a, b, status); 5699 } 5700 return packFloatx80(zSign, 5701 floatx80_infinity_high, 5702 floatx80_infinity_low); 5703 } 5704 if ( aExp == 0 ) ++expDiff; 5705 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5706 zExp = bExp; 5707 } 5708 else { 5709 if ( aExp == 0x7FFF ) { 5710 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5711 return propagateFloatx80NaN(a, b, status); 5712 } 5713 return a; 5714 } 5715 zSig1 = 0; 5716 zSig0 = aSig + bSig; 5717 if ( aExp == 0 ) { 5718 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 5719 goto roundAndPack; 5720 } 5721 zExp = aExp; 5722 goto shiftRight1; 5723 } 5724 zSig0 = aSig + bSig; 5725 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 5726 shiftRight1: 5727 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5728 zSig0 |= LIT64( 0x8000000000000000 ); 5729 ++zExp; 5730 roundAndPack: 5731 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5732 zSign, zExp, zSig0, zSig1, status); 5733 } 5734 5735 /*---------------------------------------------------------------------------- 5736 | Returns the result of subtracting the absolute 
values of the extended 5737 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 5738 | difference is negated before being returned. `zSign' is ignored if the 5739 | result is a NaN. The subtraction is performed according to the IEC/IEEE 5740 | Standard for Binary Floating-Point Arithmetic. 5741 *----------------------------------------------------------------------------*/ 5742 5743 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5744 float_status *status) 5745 { 5746 int32_t aExp, bExp, zExp; 5747 uint64_t aSig, bSig, zSig0, zSig1; 5748 int32_t expDiff; 5749 5750 aSig = extractFloatx80Frac( a ); 5751 aExp = extractFloatx80Exp( a ); 5752 bSig = extractFloatx80Frac( b ); 5753 bExp = extractFloatx80Exp( b ); 5754 expDiff = aExp - bExp; 5755 if ( 0 < expDiff ) goto aExpBigger; 5756 if ( expDiff < 0 ) goto bExpBigger; 5757 if ( aExp == 0x7FFF ) { 5758 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5759 return propagateFloatx80NaN(a, b, status); 5760 } 5761 float_raise(float_flag_invalid, status); 5762 return floatx80_default_nan(status); 5763 } 5764 if ( aExp == 0 ) { 5765 aExp = 1; 5766 bExp = 1; 5767 } 5768 zSig1 = 0; 5769 if ( bSig < aSig ) goto aBigger; 5770 if ( aSig < bSig ) goto bBigger; 5771 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 5772 bExpBigger: 5773 if ( bExp == 0x7FFF ) { 5774 if ((uint64_t)(bSig << 1)) { 5775 return propagateFloatx80NaN(a, b, status); 5776 } 5777 return packFloatx80(zSign ^ 1, floatx80_infinity_high, 5778 floatx80_infinity_low); 5779 } 5780 if ( aExp == 0 ) ++expDiff; 5781 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5782 bBigger: 5783 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 5784 zExp = bExp; 5785 zSign ^= 1; 5786 goto normalizeRoundAndPack; 5787 aExpBigger: 5788 if ( aExp == 0x7FFF ) { 5789 if ((uint64_t)(aSig << 1)) { 5790 return propagateFloatx80NaN(a, b, status); 5791 } 5792 return a; 5793 } 5794 if ( bExp == 0 ) --expDiff; 5795 
shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5796 aBigger: 5797 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 5798 zExp = aExp; 5799 normalizeRoundAndPack: 5800 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 5801 zSign, zExp, zSig0, zSig1, status); 5802 } 5803 5804 /*---------------------------------------------------------------------------- 5805 | Returns the result of adding the extended double-precision floating-point 5806 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5807 | Standard for Binary Floating-Point Arithmetic. 5808 *----------------------------------------------------------------------------*/ 5809 5810 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 5811 { 5812 flag aSign, bSign; 5813 5814 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5815 float_raise(float_flag_invalid, status); 5816 return floatx80_default_nan(status); 5817 } 5818 aSign = extractFloatx80Sign( a ); 5819 bSign = extractFloatx80Sign( b ); 5820 if ( aSign == bSign ) { 5821 return addFloatx80Sigs(a, b, aSign, status); 5822 } 5823 else { 5824 return subFloatx80Sigs(a, b, aSign, status); 5825 } 5826 5827 } 5828 5829 /*---------------------------------------------------------------------------- 5830 | Returns the result of subtracting the extended double-precision floating- 5831 | point values `a' and `b'. The operation is performed according to the 5832 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5833 *----------------------------------------------------------------------------*/ 5834 5835 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5836 { 5837 flag aSign, bSign; 5838 5839 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5840 float_raise(float_flag_invalid, status); 5841 return floatx80_default_nan(status); 5842 } 5843 aSign = extractFloatx80Sign( a ); 5844 bSign = extractFloatx80Sign( b ); 5845 if ( aSign == bSign ) { 5846 return subFloatx80Sigs(a, b, aSign, status); 5847 } 5848 else { 5849 return addFloatx80Sigs(a, b, aSign, status); 5850 } 5851 5852 } 5853 5854 /*---------------------------------------------------------------------------- 5855 | Returns the result of multiplying the extended double-precision floating- 5856 | point values `a' and `b'. The operation is performed according to the 5857 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5858 *----------------------------------------------------------------------------*/ 5859 5860 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5861 { 5862 flag aSign, bSign, zSign; 5863 int32_t aExp, bExp, zExp; 5864 uint64_t aSig, bSig, zSig0, zSig1; 5865 5866 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5867 float_raise(float_flag_invalid, status); 5868 return floatx80_default_nan(status); 5869 } 5870 aSig = extractFloatx80Frac( a ); 5871 aExp = extractFloatx80Exp( a ); 5872 aSign = extractFloatx80Sign( a ); 5873 bSig = extractFloatx80Frac( b ); 5874 bExp = extractFloatx80Exp( b ); 5875 bSign = extractFloatx80Sign( b ); 5876 zSign = aSign ^ bSign; 5877 if ( aExp == 0x7FFF ) { 5878 if ( (uint64_t) ( aSig<<1 ) 5879 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5880 return propagateFloatx80NaN(a, b, status); 5881 } 5882 if ( ( bExp | bSig ) == 0 ) goto invalid; 5883 return packFloatx80(zSign, floatx80_infinity_high, 5884 floatx80_infinity_low); 5885 } 5886 if ( bExp == 0x7FFF ) { 5887 if ((uint64_t)(bSig << 
1)) { 5888 return propagateFloatx80NaN(a, b, status); 5889 } 5890 if ( ( aExp | aSig ) == 0 ) { 5891 invalid: 5892 float_raise(float_flag_invalid, status); 5893 return floatx80_default_nan(status); 5894 } 5895 return packFloatx80(zSign, floatx80_infinity_high, 5896 floatx80_infinity_low); 5897 } 5898 if ( aExp == 0 ) { 5899 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5900 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5901 } 5902 if ( bExp == 0 ) { 5903 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5904 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5905 } 5906 zExp = aExp + bExp - 0x3FFE; 5907 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 5908 if ( 0 < (int64_t) zSig0 ) { 5909 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5910 --zExp; 5911 } 5912 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5913 zSign, zExp, zSig0, zSig1, status); 5914 } 5915 5916 /*---------------------------------------------------------------------------- 5917 | Returns the result of dividing the extended double-precision floating-point 5918 | value `a' by the corresponding value `b'. The operation is performed 5919 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5920 *----------------------------------------------------------------------------*/ 5921 5922 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 5923 { 5924 flag aSign, bSign, zSign; 5925 int32_t aExp, bExp, zExp; 5926 uint64_t aSig, bSig, zSig0, zSig1; 5927 uint64_t rem0, rem1, rem2, term0, term1, term2; 5928 5929 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5930 float_raise(float_flag_invalid, status); 5931 return floatx80_default_nan(status); 5932 } 5933 aSig = extractFloatx80Frac( a ); 5934 aExp = extractFloatx80Exp( a ); 5935 aSign = extractFloatx80Sign( a ); 5936 bSig = extractFloatx80Frac( b ); 5937 bExp = extractFloatx80Exp( b ); 5938 bSign = extractFloatx80Sign( b ); 5939 zSign = aSign ^ bSign; 5940 if ( aExp == 0x7FFF ) { 5941 if ((uint64_t)(aSig << 1)) { 5942 return propagateFloatx80NaN(a, b, status); 5943 } 5944 if ( bExp == 0x7FFF ) { 5945 if ((uint64_t)(bSig << 1)) { 5946 return propagateFloatx80NaN(a, b, status); 5947 } 5948 goto invalid; 5949 } 5950 return packFloatx80(zSign, floatx80_infinity_high, 5951 floatx80_infinity_low); 5952 } 5953 if ( bExp == 0x7FFF ) { 5954 if ((uint64_t)(bSig << 1)) { 5955 return propagateFloatx80NaN(a, b, status); 5956 } 5957 return packFloatx80( zSign, 0, 0 ); 5958 } 5959 if ( bExp == 0 ) { 5960 if ( bSig == 0 ) { 5961 if ( ( aExp | aSig ) == 0 ) { 5962 invalid: 5963 float_raise(float_flag_invalid, status); 5964 return floatx80_default_nan(status); 5965 } 5966 float_raise(float_flag_divbyzero, status); 5967 return packFloatx80(zSign, floatx80_infinity_high, 5968 floatx80_infinity_low); 5969 } 5970 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5971 } 5972 if ( aExp == 0 ) { 5973 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5974 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5975 } 5976 zExp = aExp - bExp + 0x3FFE; 5977 rem1 = 0; 5978 if ( bSig <= aSig ) { 5979 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 5980 ++zExp; 5981 } 5982 zSig0 = estimateDiv128To64( aSig, 
rem1, bSig ); 5983 mul64To128( bSig, zSig0, &term0, &term1 ); 5984 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 5985 while ( (int64_t) rem0 < 0 ) { 5986 --zSig0; 5987 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 5988 } 5989 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 5990 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 5991 mul64To128( bSig, zSig1, &term1, &term2 ); 5992 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5993 while ( (int64_t) rem1 < 0 ) { 5994 --zSig1; 5995 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5996 } 5997 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5998 } 5999 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6000 zSign, zExp, zSig0, zSig1, status); 6001 } 6002 6003 /*---------------------------------------------------------------------------- 6004 | Returns the remainder of the extended double-precision floating-point value 6005 | `a' with respect to the corresponding value `b'. The operation is performed 6006 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6007 *----------------------------------------------------------------------------*/ 6008 6009 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 6010 { 6011 flag aSign, zSign; 6012 int32_t aExp, bExp, expDiff; 6013 uint64_t aSig0, aSig1, bSig; 6014 uint64_t q, term0, term1, alternateASig0, alternateASig1; 6015 6016 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6017 float_raise(float_flag_invalid, status); 6018 return floatx80_default_nan(status); 6019 } 6020 aSig0 = extractFloatx80Frac( a ); 6021 aExp = extractFloatx80Exp( a ); 6022 aSign = extractFloatx80Sign( a ); 6023 bSig = extractFloatx80Frac( b ); 6024 bExp = extractFloatx80Exp( b ); 6025 if ( aExp == 0x7FFF ) { 6026 if ( (uint64_t) ( aSig0<<1 ) 6027 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 6028 return propagateFloatx80NaN(a, b, status); 6029 } 6030 goto invalid; 6031 } 6032 if ( bExp == 0x7FFF ) { 6033 if ((uint64_t)(bSig << 1)) { 6034 return propagateFloatx80NaN(a, b, status); 6035 } 6036 return a; 6037 } 6038 if ( bExp == 0 ) { 6039 if ( bSig == 0 ) { 6040 invalid: 6041 float_raise(float_flag_invalid, status); 6042 return floatx80_default_nan(status); 6043 } 6044 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 6045 } 6046 if ( aExp == 0 ) { 6047 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 6048 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6049 } 6050 bSig |= LIT64( 0x8000000000000000 ); 6051 zSign = aSign; 6052 expDiff = aExp - bExp; 6053 aSig1 = 0; 6054 if ( expDiff < 0 ) { 6055 if ( expDiff < -1 ) return a; 6056 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 6057 expDiff = 0; 6058 } 6059 q = ( bSig <= aSig0 ); 6060 if ( q ) aSig0 -= bSig; 6061 expDiff -= 64; 6062 while ( 0 < expDiff ) { 6063 q = estimateDiv128To64( aSig0, aSig1, bSig ); 6064 q = ( 2 < q ) ? 
q - 2 : 0; 6065 mul64To128( bSig, q, &term0, &term1 ); 6066 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6067 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 6068 expDiff -= 62; 6069 } 6070 expDiff += 64; 6071 if ( 0 < expDiff ) { 6072 q = estimateDiv128To64( aSig0, aSig1, bSig ); 6073 q = ( 2 < q ) ? q - 2 : 0; 6074 q >>= 64 - expDiff; 6075 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 6076 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6077 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 6078 while ( le128( term0, term1, aSig0, aSig1 ) ) { 6079 ++q; 6080 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6081 } 6082 } 6083 else { 6084 term1 = 0; 6085 term0 = bSig; 6086 } 6087 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 6088 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 6089 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 6090 && ( q & 1 ) ) 6091 ) { 6092 aSig0 = alternateASig0; 6093 aSig1 = alternateASig1; 6094 zSign = ! zSign; 6095 } 6096 return 6097 normalizeRoundAndPackFloatx80( 6098 80, zSign, bExp + expDiff, aSig0, aSig1, status); 6099 6100 } 6101 6102 /*---------------------------------------------------------------------------- 6103 | Returns the square root of the extended double-precision floating-point 6104 | value `a'. The operation is performed according to the IEC/IEEE Standard 6105 | for Binary Floating-Point Arithmetic. 
6106 *----------------------------------------------------------------------------*/ 6107 6108 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 6109 { 6110 flag aSign; 6111 int32_t aExp, zExp; 6112 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 6113 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6114 6115 if (floatx80_invalid_encoding(a)) { 6116 float_raise(float_flag_invalid, status); 6117 return floatx80_default_nan(status); 6118 } 6119 aSig0 = extractFloatx80Frac( a ); 6120 aExp = extractFloatx80Exp( a ); 6121 aSign = extractFloatx80Sign( a ); 6122 if ( aExp == 0x7FFF ) { 6123 if ((uint64_t)(aSig0 << 1)) { 6124 return propagateFloatx80NaN(a, a, status); 6125 } 6126 if ( ! aSign ) return a; 6127 goto invalid; 6128 } 6129 if ( aSign ) { 6130 if ( ( aExp | aSig0 ) == 0 ) return a; 6131 invalid: 6132 float_raise(float_flag_invalid, status); 6133 return floatx80_default_nan(status); 6134 } 6135 if ( aExp == 0 ) { 6136 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 6137 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6138 } 6139 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 6140 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 6141 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 6142 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6143 doubleZSig0 = zSig0<<1; 6144 mul64To128( zSig0, zSig0, &term0, &term1 ); 6145 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6146 while ( (int64_t) rem0 < 0 ) { 6147 --zSig0; 6148 doubleZSig0 -= 2; 6149 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6150 } 6151 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6152 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 6153 if ( zSig1 == 0 ) zSig1 = 1; 6154 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6155 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6156 mul64To128( zSig1, zSig1, &term2, &term3 ); 6157 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6158 while ( (int64_t) rem1 < 0 ) { 
6159 --zSig1; 6160 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6161 term3 |= 1; 6162 term2 |= doubleZSig0; 6163 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6164 } 6165 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6166 } 6167 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 6168 zSig0 |= doubleZSig0; 6169 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6170 0, zExp, zSig0, zSig1, status); 6171 } 6172 6173 /*---------------------------------------------------------------------------- 6174 | Returns 1 if the extended double-precision floating-point value `a' is equal 6175 | to the corresponding value `b', and 0 otherwise. The invalid exception is 6176 | raised if either operand is a NaN. Otherwise, the comparison is performed 6177 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6178 *----------------------------------------------------------------------------*/ 6179 6180 int floatx80_eq(floatx80 a, floatx80 b, float_status *status) 6181 { 6182 6183 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6184 || (extractFloatx80Exp(a) == 0x7FFF 6185 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6186 || (extractFloatx80Exp(b) == 0x7FFF 6187 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6188 ) { 6189 float_raise(float_flag_invalid, status); 6190 return 0; 6191 } 6192 return 6193 ( a.low == b.low ) 6194 && ( ( a.high == b.high ) 6195 || ( ( a.low == 0 ) 6196 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6197 ); 6198 6199 } 6200 6201 /*---------------------------------------------------------------------------- 6202 | Returns 1 if the extended double-precision floating-point value `a' is 6203 | less than or equal to the corresponding value `b', and 0 otherwise. The 6204 | invalid exception is raised if either operand is a NaN. The comparison is 6205 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6206 | Arithmetic. 
6207 *----------------------------------------------------------------------------*/ 6208 6209 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 6210 { 6211 flag aSign, bSign; 6212 6213 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6214 || (extractFloatx80Exp(a) == 0x7FFF 6215 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6216 || (extractFloatx80Exp(b) == 0x7FFF 6217 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6218 ) { 6219 float_raise(float_flag_invalid, status); 6220 return 0; 6221 } 6222 aSign = extractFloatx80Sign( a ); 6223 bSign = extractFloatx80Sign( b ); 6224 if ( aSign != bSign ) { 6225 return 6226 aSign 6227 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6228 == 0 ); 6229 } 6230 return 6231 aSign ? le128( b.high, b.low, a.high, a.low ) 6232 : le128( a.high, a.low, b.high, b.low ); 6233 6234 } 6235 6236 /*---------------------------------------------------------------------------- 6237 | Returns 1 if the extended double-precision floating-point value `a' is 6238 | less than the corresponding value `b', and 0 otherwise. The invalid 6239 | exception is raised if either operand is a NaN. The comparison is performed 6240 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6241 *----------------------------------------------------------------------------*/ 6242 6243 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 6244 { 6245 flag aSign, bSign; 6246 6247 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6248 || (extractFloatx80Exp(a) == 0x7FFF 6249 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6250 || (extractFloatx80Exp(b) == 0x7FFF 6251 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6252 ) { 6253 float_raise(float_flag_invalid, status); 6254 return 0; 6255 } 6256 aSign = extractFloatx80Sign( a ); 6257 bSign = extractFloatx80Sign( b ); 6258 if ( aSign != bSign ) { 6259 return 6260 aSign 6261 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6262 != 0 ); 6263 } 6264 return 6265 aSign ? lt128( b.high, b.low, a.high, a.low ) 6266 : lt128( a.high, a.low, b.high, b.low ); 6267 6268 } 6269 6270 /*---------------------------------------------------------------------------- 6271 | Returns 1 if the extended double-precision floating-point values `a' and `b' 6272 | cannot be compared, and 0 otherwise. The invalid exception is raised if 6273 | either operand is a NaN. The comparison is performed according to the 6274 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6275 *----------------------------------------------------------------------------*/ 6276 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 6277 { 6278 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6279 || (extractFloatx80Exp(a) == 0x7FFF 6280 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6281 || (extractFloatx80Exp(b) == 0x7FFF 6282 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6283 ) { 6284 float_raise(float_flag_invalid, status); 6285 return 1; 6286 } 6287 return 0; 6288 } 6289 6290 /*---------------------------------------------------------------------------- 6291 | Returns 1 if the extended double-precision floating-point value `a' is 6292 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 6293 | cause an exception. The comparison is performed according to the IEC/IEEE 6294 | Standard for Binary Floating-Point Arithmetic. 6295 *----------------------------------------------------------------------------*/ 6296 6297 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 6298 { 6299 6300 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6301 float_raise(float_flag_invalid, status); 6302 return 0; 6303 } 6304 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6305 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6306 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6307 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6308 ) { 6309 if (floatx80_is_signaling_nan(a, status) 6310 || floatx80_is_signaling_nan(b, status)) { 6311 float_raise(float_flag_invalid, status); 6312 } 6313 return 0; 6314 } 6315 return 6316 ( a.low == b.low ) 6317 && ( ( a.high == b.high ) 6318 || ( ( a.low == 0 ) 6319 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6320 ); 6321 6322 } 6323 6324 /*---------------------------------------------------------------------------- 6325 | Returns 1 if the extended double-precision floating-point value `a' is less 6326 | than or equal to the corresponding value `b', and 0 
otherwise. Quiet NaNs
| do not cause an exception. Otherwise, the comparison is performed according
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
{
    flag signA, signB;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    /* A NaN operand gives 0; only a signaling NaN raises invalid. */
    if ((extractFloatx80Exp(a) == 0x7FFF
         && (uint64_t)(extractFloatx80Frac(a) << 1) != 0)
        || (extractFloatx80Exp(b) == 0x7FFF
            && (uint64_t)(extractFloatx80Frac(b) << 1) != 0)) {
        if (floatx80_is_signaling_nan(a, status)
            || floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return 0;
    }

    signA = extractFloatx80Sign(a);
    signB = extractFloatx80Sign(b);

    if (signA != signB) {
        if (signA) {
            return 1;
        }
        /* `a' positive, `b' negative: true only when every bit other than
         * the two sign bits is zero. */
        return (((uint16_t)((a.high | b.high) << 1)) | a.low | b.low) == 0;
    }

    /* Same sign: compare the raw (high, low) pairs, operands swapped when
     * both are negative. */
    if (signA) {
        return le128(b.high, b.low, a.high, a.low);
    }
    return le128(a.high, a.low, b.high, b.low);
}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is less
| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
| an exception. Otherwise, the comparison is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6369 *----------------------------------------------------------------------------*/ 6370 6371 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 6372 { 6373 flag aSign, bSign; 6374 6375 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6376 float_raise(float_flag_invalid, status); 6377 return 0; 6378 } 6379 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6380 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6381 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6382 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6383 ) { 6384 if (floatx80_is_signaling_nan(a, status) 6385 || floatx80_is_signaling_nan(b, status)) { 6386 float_raise(float_flag_invalid, status); 6387 } 6388 return 0; 6389 } 6390 aSign = extractFloatx80Sign( a ); 6391 bSign = extractFloatx80Sign( b ); 6392 if ( aSign != bSign ) { 6393 return 6394 aSign 6395 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6396 != 0 ); 6397 } 6398 return 6399 aSign ? lt128( b.high, b.low, a.high, a.low ) 6400 : lt128( a.high, a.low, b.high, b.low ); 6401 6402 } 6403 6404 /*---------------------------------------------------------------------------- 6405 | Returns 1 if the extended double-precision floating-point values `a' and `b' 6406 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 6407 | The comparison is performed according to the IEC/IEEE Standard for Binary 6408 | Floating-Point Arithmetic. 
6409 *----------------------------------------------------------------------------*/ 6410 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 6411 { 6412 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6413 float_raise(float_flag_invalid, status); 6414 return 1; 6415 } 6416 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6417 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6418 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6419 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6420 ) { 6421 if (floatx80_is_signaling_nan(a, status) 6422 || floatx80_is_signaling_nan(b, status)) { 6423 float_raise(float_flag_invalid, status); 6424 } 6425 return 1; 6426 } 6427 return 0; 6428 } 6429 6430 /*---------------------------------------------------------------------------- 6431 | Returns the result of converting the quadruple-precision floating-point 6432 | value `a' to the 32-bit two's complement integer format. The conversion 6433 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6434 | Arithmetic---which means in particular that the conversion is rounded 6435 | according to the current rounding mode. If `a' is a NaN, the largest 6436 | positive integer is returned. Otherwise, if the conversion overflows, the 6437 | largest integer with the same sign as `a' is returned. 
6438 *----------------------------------------------------------------------------*/ 6439 6440 int32_t float128_to_int32(float128 a, float_status *status) 6441 { 6442 flag aSign; 6443 int32_t aExp, shiftCount; 6444 uint64_t aSig0, aSig1; 6445 6446 aSig1 = extractFloat128Frac1( a ); 6447 aSig0 = extractFloat128Frac0( a ); 6448 aExp = extractFloat128Exp( a ); 6449 aSign = extractFloat128Sign( a ); 6450 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 6451 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6452 aSig0 |= ( aSig1 != 0 ); 6453 shiftCount = 0x4028 - aExp; 6454 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 6455 return roundAndPackInt32(aSign, aSig0, status); 6456 6457 } 6458 6459 /*---------------------------------------------------------------------------- 6460 | Returns the result of converting the quadruple-precision floating-point 6461 | value `a' to the 32-bit two's complement integer format. The conversion 6462 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6463 | Arithmetic, except that the conversion is always rounded toward zero. If 6464 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 6465 | conversion overflows, the largest integer with the same sign as `a' is 6466 | returned. 
6467 *----------------------------------------------------------------------------*/ 6468 6469 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 6470 { 6471 flag aSign; 6472 int32_t aExp, shiftCount; 6473 uint64_t aSig0, aSig1, savedASig; 6474 int32_t z; 6475 6476 aSig1 = extractFloat128Frac1( a ); 6477 aSig0 = extractFloat128Frac0( a ); 6478 aExp = extractFloat128Exp( a ); 6479 aSign = extractFloat128Sign( a ); 6480 aSig0 |= ( aSig1 != 0 ); 6481 if ( 0x401E < aExp ) { 6482 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 6483 goto invalid; 6484 } 6485 else if ( aExp < 0x3FFF ) { 6486 if (aExp || aSig0) { 6487 status->float_exception_flags |= float_flag_inexact; 6488 } 6489 return 0; 6490 } 6491 aSig0 |= LIT64( 0x0001000000000000 ); 6492 shiftCount = 0x402F - aExp; 6493 savedASig = aSig0; 6494 aSig0 >>= shiftCount; 6495 z = aSig0; 6496 if ( aSign ) z = - z; 6497 if ( ( z < 0 ) ^ aSign ) { 6498 invalid: 6499 float_raise(float_flag_invalid, status); 6500 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 6501 } 6502 if ( ( aSig0<<shiftCount ) != savedASig ) { 6503 status->float_exception_flags |= float_flag_inexact; 6504 } 6505 return z; 6506 6507 } 6508 6509 /*---------------------------------------------------------------------------- 6510 | Returns the result of converting the quadruple-precision floating-point 6511 | value `a' to the 64-bit two's complement integer format. The conversion 6512 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6513 | Arithmetic---which means in particular that the conversion is rounded 6514 | according to the current rounding mode. If `a' is a NaN, the largest 6515 | positive integer is returned. Otherwise, if the conversion overflows, the 6516 | largest integer with the same sign as `a' is returned. 
6517 *----------------------------------------------------------------------------*/ 6518 6519 int64_t float128_to_int64(float128 a, float_status *status) 6520 { 6521 flag aSign; 6522 int32_t aExp, shiftCount; 6523 uint64_t aSig0, aSig1; 6524 6525 aSig1 = extractFloat128Frac1( a ); 6526 aSig0 = extractFloat128Frac0( a ); 6527 aExp = extractFloat128Exp( a ); 6528 aSign = extractFloat128Sign( a ); 6529 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6530 shiftCount = 0x402F - aExp; 6531 if ( shiftCount <= 0 ) { 6532 if ( 0x403E < aExp ) { 6533 float_raise(float_flag_invalid, status); 6534 if ( ! aSign 6535 || ( ( aExp == 0x7FFF ) 6536 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 6537 ) 6538 ) { 6539 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6540 } 6541 return (int64_t) LIT64( 0x8000000000000000 ); 6542 } 6543 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 6544 } 6545 else { 6546 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 6547 } 6548 return roundAndPackInt64(aSign, aSig0, aSig1, status); 6549 6550 } 6551 6552 /*---------------------------------------------------------------------------- 6553 | Returns the result of converting the quadruple-precision floating-point 6554 | value `a' to the 64-bit two's complement integer format. The conversion 6555 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6556 | Arithmetic, except that the conversion is always rounded toward zero. 6557 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 6558 | the conversion overflows, the largest integer with the same sign as `a' is 6559 | returned. 
6560 *----------------------------------------------------------------------------*/ 6561 6562 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 6563 { 6564 flag aSign; 6565 int32_t aExp, shiftCount; 6566 uint64_t aSig0, aSig1; 6567 int64_t z; 6568 6569 aSig1 = extractFloat128Frac1( a ); 6570 aSig0 = extractFloat128Frac0( a ); 6571 aExp = extractFloat128Exp( a ); 6572 aSign = extractFloat128Sign( a ); 6573 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6574 shiftCount = aExp - 0x402F; 6575 if ( 0 < shiftCount ) { 6576 if ( 0x403E <= aExp ) { 6577 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 6578 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 6579 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 6580 if (aSig1) { 6581 status->float_exception_flags |= float_flag_inexact; 6582 } 6583 } 6584 else { 6585 float_raise(float_flag_invalid, status); 6586 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 6587 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6588 } 6589 } 6590 return (int64_t) LIT64( 0x8000000000000000 ); 6591 } 6592 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 6593 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 6594 status->float_exception_flags |= float_flag_inexact; 6595 } 6596 } 6597 else { 6598 if ( aExp < 0x3FFF ) { 6599 if ( aExp | aSig0 | aSig1 ) { 6600 status->float_exception_flags |= float_flag_inexact; 6601 } 6602 return 0; 6603 } 6604 z = aSig0>>( - shiftCount ); 6605 if ( aSig1 6606 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 6607 status->float_exception_flags |= float_flag_inexact; 6608 } 6609 } 6610 if ( aSign ) z = - z; 6611 return z; 6612 6613 } 6614 6615 /*---------------------------------------------------------------------------- 6616 | Returns the result of converting the quadruple-precision floating-point value 6617 | `a' to the 64-bit unsigned integer format. 
The conversion is 6618 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6619 | Arithmetic---which means in particular that the conversion is rounded 6620 | according to the current rounding mode. If `a' is a NaN, the largest 6621 | positive integer is returned. If the conversion overflows, the 6622 | largest unsigned integer is returned. If 'a' is negative, the value is 6623 | rounded and zero is returned; negative values that do not round to zero 6624 | will raise the inexact exception. 6625 *----------------------------------------------------------------------------*/ 6626 6627 uint64_t float128_to_uint64(float128 a, float_status *status) 6628 { 6629 flag aSign; 6630 int aExp; 6631 int shiftCount; 6632 uint64_t aSig0, aSig1; 6633 6634 aSig0 = extractFloat128Frac0(a); 6635 aSig1 = extractFloat128Frac1(a); 6636 aExp = extractFloat128Exp(a); 6637 aSign = extractFloat128Sign(a); 6638 if (aSign && (aExp > 0x3FFE)) { 6639 float_raise(float_flag_invalid, status); 6640 if (float128_is_any_nan(a)) { 6641 return LIT64(0xFFFFFFFFFFFFFFFF); 6642 } else { 6643 return 0; 6644 } 6645 } 6646 if (aExp) { 6647 aSig0 |= LIT64(0x0001000000000000); 6648 } 6649 shiftCount = 0x402F - aExp; 6650 if (shiftCount <= 0) { 6651 if (0x403E < aExp) { 6652 float_raise(float_flag_invalid, status); 6653 return LIT64(0xFFFFFFFFFFFFFFFF); 6654 } 6655 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 6656 } else { 6657 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 6658 } 6659 return roundAndPackUint64(aSign, aSig0, aSig1, status); 6660 } 6661 6662 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 6663 { 6664 uint64_t v; 6665 signed char current_rounding_mode = status->float_rounding_mode; 6666 6667 set_float_rounding_mode(float_round_to_zero, status); 6668 v = float128_to_uint64(a, status); 6669 set_float_rounding_mode(current_rounding_mode, status); 6670 6671 return v; 6672 } 6673 6674 
/*---------------------------------------------------------------------------- 6675 | Returns the result of converting the quadruple-precision floating-point 6676 | value `a' to the 32-bit unsigned integer format. The conversion 6677 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6678 | Arithmetic except that the conversion is always rounded toward zero. 6679 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6680 | if the conversion overflows, the largest unsigned integer is returned. 6681 | If 'a' is negative, the value is rounded and zero is returned; negative 6682 | values that do not round to zero will raise the inexact exception. 6683 *----------------------------------------------------------------------------*/ 6684 6685 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6686 { 6687 uint64_t v; 6688 uint32_t res; 6689 int old_exc_flags = get_float_exception_flags(status); 6690 6691 v = float128_to_uint64_round_to_zero(a, status); 6692 if (v > 0xffffffff) { 6693 res = 0xffffffff; 6694 } else { 6695 return v; 6696 } 6697 set_float_exception_flags(old_exc_flags, status); 6698 float_raise(float_flag_invalid, status); 6699 return res; 6700 } 6701 6702 /*---------------------------------------------------------------------------- 6703 | Returns the result of converting the quadruple-precision floating-point 6704 | value `a' to the single-precision floating-point format. The conversion 6705 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6706 | Arithmetic. 
6707 *----------------------------------------------------------------------------*/ 6708 6709 float32 float128_to_float32(float128 a, float_status *status) 6710 { 6711 flag aSign; 6712 int32_t aExp; 6713 uint64_t aSig0, aSig1; 6714 uint32_t zSig; 6715 6716 aSig1 = extractFloat128Frac1( a ); 6717 aSig0 = extractFloat128Frac0( a ); 6718 aExp = extractFloat128Exp( a ); 6719 aSign = extractFloat128Sign( a ); 6720 if ( aExp == 0x7FFF ) { 6721 if ( aSig0 | aSig1 ) { 6722 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6723 } 6724 return packFloat32( aSign, 0xFF, 0 ); 6725 } 6726 aSig0 |= ( aSig1 != 0 ); 6727 shift64RightJamming( aSig0, 18, &aSig0 ); 6728 zSig = aSig0; 6729 if ( aExp || zSig ) { 6730 zSig |= 0x40000000; 6731 aExp -= 0x3F81; 6732 } 6733 return roundAndPackFloat32(aSign, aExp, zSig, status); 6734 6735 } 6736 6737 /*---------------------------------------------------------------------------- 6738 | Returns the result of converting the quadruple-precision floating-point 6739 | value `a' to the double-precision floating-point format. The conversion 6740 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6741 | Arithmetic. 
6742 *----------------------------------------------------------------------------*/ 6743 6744 float64 float128_to_float64(float128 a, float_status *status) 6745 { 6746 flag aSign; 6747 int32_t aExp; 6748 uint64_t aSig0, aSig1; 6749 6750 aSig1 = extractFloat128Frac1( a ); 6751 aSig0 = extractFloat128Frac0( a ); 6752 aExp = extractFloat128Exp( a ); 6753 aSign = extractFloat128Sign( a ); 6754 if ( aExp == 0x7FFF ) { 6755 if ( aSig0 | aSig1 ) { 6756 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6757 } 6758 return packFloat64( aSign, 0x7FF, 0 ); 6759 } 6760 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6761 aSig0 |= ( aSig1 != 0 ); 6762 if ( aExp || aSig0 ) { 6763 aSig0 |= LIT64( 0x4000000000000000 ); 6764 aExp -= 0x3C01; 6765 } 6766 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6767 6768 } 6769 6770 /*---------------------------------------------------------------------------- 6771 | Returns the result of converting the quadruple-precision floating-point 6772 | value `a' to the extended double-precision floating-point format. The 6773 | conversion is performed according to the IEC/IEEE Standard for Binary 6774 | Floating-Point Arithmetic. 
6775 *----------------------------------------------------------------------------*/ 6776 6777 floatx80 float128_to_floatx80(float128 a, float_status *status) 6778 { 6779 flag aSign; 6780 int32_t aExp; 6781 uint64_t aSig0, aSig1; 6782 6783 aSig1 = extractFloat128Frac1( a ); 6784 aSig0 = extractFloat128Frac0( a ); 6785 aExp = extractFloat128Exp( a ); 6786 aSign = extractFloat128Sign( a ); 6787 if ( aExp == 0x7FFF ) { 6788 if ( aSig0 | aSig1 ) { 6789 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 6790 } 6791 return packFloatx80(aSign, floatx80_infinity_high, 6792 floatx80_infinity_low); 6793 } 6794 if ( aExp == 0 ) { 6795 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6796 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6797 } 6798 else { 6799 aSig0 |= LIT64( 0x0001000000000000 ); 6800 } 6801 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6802 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6803 6804 } 6805 6806 /*---------------------------------------------------------------------------- 6807 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6808 | returns the result as a quadruple-precision floating-point value. The 6809 | operation is performed according to the IEC/IEEE Standard for Binary 6810 | Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

/*
 * Implementation outline (exponent thresholds are relative to 0x3FFF, the
 * exponent this function itself packs for a result of magnitude one):
 *   aExp >= 0x406F : returned unchanged (NaNs are propagated);
 *   aExp >= 0x402F : rounding is done on the low 64-bit word;
 *   aExp <  0x3FFF : the result is a signed zero or a signed one;
 *   otherwise      : rounding is done on the high word, low word is cleared.
 * float_flag_inexact is set in status->float_exception_flags whenever the
 * returned value differs from `a'.
 */
float128 float128_round_to_int(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        if ( 0x406F <= aExp ) {
            /* Nothing to round: only NaNs need special treatment. */
            if (    ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            return a;
        }
        /* The shift is split in two steps because the total can reach 64
         * (aExp == 0x402F); lastBitMask is then 0 and roundBitsMask is
         * all ones, i.e. the whole low word is discarded. */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                /* Add half of the last kept bit; if the discarded bits then
                 * read zero, clear the last kept bit. */
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /* The last kept bit is bit 0 of z.high; the top bit of
                 * z.low is the first discarded bit. */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            /* Same as above but without the clear-last-bit step. */
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            /* Only non-negative inputs are bumped before truncation. */
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            /* Only negative inputs are bumped before truncation. */
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        /* Truncate the discarded bits. */
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Signed zero is returned as is, without raising inexact. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            status->float_exception_flags |= float_flag_inexact;
            aSign = extractFloat128Sign( a );
            switch (status->float_rounding_mode) {
             case float_round_nearest_even:
                /* Exponent 0x3FFE with a non-zero fraction rounds to one;
                 * a zero fraction falls through to zero. */
                if (    ( aExp == 0x3FFE )
                     && (   extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                /* Any value with exponent 0x3FFE rounds to one. */
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
             case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
             case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );
            }
            /* Every mode that did not return above yields a signed zero. */
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* The last kept bit lies in the high word; the low word is dropped
         * and only consulted (via a.low) for tie and inexactness checks. */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                /* A non-zero low word is folded into bit 0 so that the
                 * addition below carries into the kept bits. */
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        z.high &= ~ roundBitsMask;
    }
    /* Any change of the bit pattern means the input was not an integer. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the quadruple-precision
| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
| before being returned. `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* `a' has the larger exponent: align `b' to it. */
        if ( aExp == 0x7FFF ) {
            if (aSig0 | aSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) {
            /* Zero exponent field: no implicit bit is added and the
             * alignment shift is one shorter. */
            --expDiff;
        }
        else {
            bSig0 |= LIT64( 0x0001000000000000 );
        }
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* `b' has the larger exponent: align `a' to it. */
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig0 |= LIT64( 0x0001000000000000 );
        }
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponents. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) {
            /* Both exponent fields are zero: the raw fraction sum is
             * packed directly, unless flush_to_zero is set, in which
             * case a signed zero is returned and
             * float_flag_output_denormal is raised for a non-zero sum. */
            if (status->flush_to_zero) {
                if (zSig0 | zSig1) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat128(zSign, 0, 0, 0);
            }
            return packFloat128( zSign, 0, zSig0, zSig1 );
        }
        zSig2 = 0;
        /* 0x0002... stands for the sum of the two operands' bit 48. */
        zSig0 |= LIT64( 0x0002000000000000 );
        zExp = aExp;
        goto shiftRight1;
    }
    /* Reached from both unequal-exponent branches.  In each of them only
     * the shifted operand had its own bit 48 set, so this OR supplies the
     * one belonging to the operand with the larger exponent. */
    aSig0 |= LIT64( 0x0001000000000000 );
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    --zExp;
    /* No carry into bit 49: round with the decremented exponent. */
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the quadruple-
| precision floating-point values `a' and `b'. If `zSign' is 1, the
| difference is negated before being returned. `zSign' is ignored if the
| result is a NaN. The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    /* Both fractions are moved up by 14 bits; the final exponent is
     * reduced by 14 to compensate (see normalizeRoundAndPack below). */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Both fractions are zero here (infinity minus infinity):
         * raise invalid and return the default NaN. */
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        aExp = 1;
        bExp = 1;
    }
    /* Order the operands by magnitude, high word first. */
    if ( bSig0 < aSig0 ) goto aBigger;
    if ( aSig0 < bSig0 ) goto bBigger;
    if ( bSig1 < aSig1 ) goto aBigger;
    if ( aSig1 < bSig1 ) goto bBigger;
    /* Exact cancellation: the zero is negative only in round-down mode. */
    return packFloat128(status->float_rounding_mode == float_round_down,
                        0, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* |a| - infinity: infinity with the opposite of zSign. */
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        ++expDiff;
    }
    else {
        aSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
    bSig0 |= LIT64( 0x4000000000000000 );
 bBigger:
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zExp = bExp;
    /* |b| > |a|, so the sign of the difference flips. */
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
    aSig0 |= LIT64( 0x4000000000000000 );
 aBigger:
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
                                         status);

}

/*----------------------------------------------------------------------------
| Returns the result of adding the quadruple-precision floating-point values
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
7127 *----------------------------------------------------------------------------*/ 7128 7129 float128 float128_add(float128 a, float128 b, float_status *status) 7130 { 7131 flag aSign, bSign; 7132 7133 aSign = extractFloat128Sign( a ); 7134 bSign = extractFloat128Sign( b ); 7135 if ( aSign == bSign ) { 7136 return addFloat128Sigs(a, b, aSign, status); 7137 } 7138 else { 7139 return subFloat128Sigs(a, b, aSign, status); 7140 } 7141 7142 } 7143 7144 /*---------------------------------------------------------------------------- 7145 | Returns the result of subtracting the quadruple-precision floating-point 7146 | values `a' and `b'. The operation is performed according to the IEC/IEEE 7147 | Standard for Binary Floating-Point Arithmetic. 7148 *----------------------------------------------------------------------------*/ 7149 7150 float128 float128_sub(float128 a, float128 b, float_status *status) 7151 { 7152 flag aSign, bSign; 7153 7154 aSign = extractFloat128Sign( a ); 7155 bSign = extractFloat128Sign( b ); 7156 if ( aSign == bSign ) { 7157 return subFloat128Sigs(a, b, aSign, status); 7158 } 7159 else { 7160 return addFloat128Sigs(a, b, aSign, status); 7161 } 7162 7163 } 7164 7165 /*---------------------------------------------------------------------------- 7166 | Returns the result of multiplying the quadruple-precision floating-point 7167 | values `a' and `b'. The operation is performed according to the IEC/IEEE 7168 | Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float128 float128_mul(float128 a, float128 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* `a' is NaN or infinity. */
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* infinity * zero is invalid. */
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        /* `b' is NaN or infinity, `a' is finite. */
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        /* Zero operand gives a signed zero; otherwise renormalize. */
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    zExp = aExp + bExp - 0x4000;
    aSig0 |= LIT64( 0x0001000000000000 );
    /* `b' is shifted without setting its bit 48; after the 16-bit shift
     * that bit would sit just above the 128-bit value.  Its contribution
     * to the product is restored by the add128() of aSig below. */
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
    /* The lowest 64 bits only matter as a sticky bit. */
    zSig2 |= ( zSig3 != 0 );
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
        /* The product reached bit 49: shift right once and bump zExp. */
        shift128ExtraRightJamming(
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
        ++zExp;
    }
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'. The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_div(float128 a, float128 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* infinity / infinity is invalid. */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* finite / infinity is a signed zero. */
        return packFloat128( zSign, 0, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* Division by zero: 0/0 is invalid, anything else raises
             * divbyzero and returns a signed infinity. */
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* Set bit 48 on both operands and move them up by 15 bits. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    /* Make the dividend strictly smaller than the divisor. */
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* First 64 quotient bits: estimate, then decrement the estimate
     * while the remainder is negative. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Next 64 quotient bits.  The estimate is only refined when its low
     * 14 bits are <= 4; in that case the exact remainder also supplies
     * the sticky bit. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'. The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_rem(float128 a, float128 b, float_status *status)
{
    flag aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Infinite dividend with a non-NaN divisor is invalid. */
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Finite dividend, infinite divisor: `a' is returned unchanged. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* Zero divisor is invalid. */
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* Exponent more than one below the divisor's: `a' is returned as is. */
    if ( expDiff < -1 ) return a;
    /* Set bit 48 on both operands and move them up; `a' is shifted one
     * bit less when expDiff == -1. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    /* First quotient bit. */
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /* Reduce the remaining exponent difference 61 bits per iteration.
     * The quotient estimate is lowered by 4 (floored at 0) before use. */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /* Final partial step covering the remaining quotient bits. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Subtract the divisor until the remainder turns negative, keeping
     * the last non-negative remainder in alternateASig. */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* The sign of the sum of the two candidates tells which one has the
     * smaller magnitude; when the sum is exactly zero the choice depends
     * on the parity of q. */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if (    ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    /* A negative remainder is negated and its sign folded into the
     * result sign. */
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}

/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_sqrt(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        /* +infinity is returned as is; -infinity is invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* Negative zero is returned as is; any other negative input
         * raises invalid and yields the default NaN. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Halve the unbiased exponent. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= LIT64( 0x0001000000000000 );
    /* Initial 32-bit estimate, then one division-based refinement. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    /* The shift depends on the parity of aExp. */
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* rem = a - zSig0^2; decrement zSig0 while the remainder is negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low 64 result bits.  The estimate is only refined when its low 13
     * bits are <= 5; the exact remainder then supplies the sticky bit. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
| the corresponding value `b', and 0 otherwise. The invalid exception is
| raised if either operand is a NaN. Otherwise, the comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int float128_eq(float128 a, float128 b, float_status *status)
{

    /* Any NaN operand (maximum exponent, non-zero fraction) raises
     * invalid and compares unequal. */
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
       ) {
        float_raise(float_flag_invalid, status);
        return 0;
    }
    /* Equal when the bit patterns match, or when both are zeros: the
     * <<1 discards the sign bit so that +0 and -0 compare equal. */
    return
           ( a.low == b.low )
        && (    ( a.high == b.high )
             || (    ( a.low == 0 )
                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
           );

}

/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is less than
| or equal to the corresponding value `b', and 0 otherwise. The invalid
| exception is raised if either operand is a NaN. The comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7522 *----------------------------------------------------------------------------*/ 7523 7524 int float128_le(float128 a, float128 b, float_status *status) 7525 { 7526 flag aSign, bSign; 7527 7528 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7529 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7530 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7531 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7532 ) { 7533 float_raise(float_flag_invalid, status); 7534 return 0; 7535 } 7536 aSign = extractFloat128Sign( a ); 7537 bSign = extractFloat128Sign( b ); 7538 if ( aSign != bSign ) { 7539 return 7540 aSign 7541 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7542 == 0 ); 7543 } 7544 return 7545 aSign ? le128( b.high, b.low, a.high, a.low ) 7546 : le128( a.high, a.low, b.high, b.low ); 7547 7548 } 7549 7550 /*---------------------------------------------------------------------------- 7551 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7552 | the corresponding value `b', and 0 otherwise. The invalid exception is 7553 | raised if either operand is a NaN. The comparison is performed according 7554 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7555 *----------------------------------------------------------------------------*/ 7556 7557 int float128_lt(float128 a, float128 b, float_status *status) 7558 { 7559 flag aSign, bSign; 7560 7561 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7562 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7563 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7564 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7565 ) { 7566 float_raise(float_flag_invalid, status); 7567 return 0; 7568 } 7569 aSign = extractFloat128Sign( a ); 7570 bSign = extractFloat128Sign( b ); 7571 if ( aSign != bSign ) { 7572 return 7573 aSign 7574 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7575 != 0 ); 7576 } 7577 return 7578 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 7579 : lt128( a.high, a.low, b.high, b.low ); 7580 7581 } 7582 7583 /*---------------------------------------------------------------------------- 7584 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7585 | be compared, and 0 otherwise. The invalid exception is raised if either 7586 | operand is a NaN. The comparison is performed according to the IEC/IEEE 7587 | Standard for Binary Floating-Point Arithmetic. 7588 *----------------------------------------------------------------------------*/ 7589 7590 int float128_unordered(float128 a, float128 b, float_status *status) 7591 { 7592 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7593 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7594 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7595 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7596 ) { 7597 float_raise(float_flag_invalid, status); 7598 return 1; 7599 } 7600 return 0; 7601 } 7602 7603 /*---------------------------------------------------------------------------- 7604 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 7605 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7606 | exception. The comparison is performed according to the IEC/IEEE Standard 7607 | for Binary Floating-Point Arithmetic. 
7608 *----------------------------------------------------------------------------*/ 7609 7610 int float128_eq_quiet(float128 a, float128 b, float_status *status) 7611 { 7612 7613 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7614 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7615 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7616 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7617 ) { 7618 if (float128_is_signaling_nan(a, status) 7619 || float128_is_signaling_nan(b, status)) { 7620 float_raise(float_flag_invalid, status); 7621 } 7622 return 0; 7623 } 7624 return 7625 ( a.low == b.low ) 7626 && ( ( a.high == b.high ) 7627 || ( ( a.low == 0 ) 7628 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 7629 ); 7630 7631 } 7632 7633 /*---------------------------------------------------------------------------- 7634 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7635 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 7636 | cause an exception. Otherwise, the comparison is performed according to the 7637 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7638 *----------------------------------------------------------------------------*/ 7639 7640 int float128_le_quiet(float128 a, float128 b, float_status *status) 7641 { 7642 flag aSign, bSign; 7643 7644 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7645 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7646 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7647 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7648 ) { 7649 if (float128_is_signaling_nan(a, status) 7650 || float128_is_signaling_nan(b, status)) { 7651 float_raise(float_flag_invalid, status); 7652 } 7653 return 0; 7654 } 7655 aSign = extractFloat128Sign( a ); 7656 bSign = extractFloat128Sign( b ); 7657 if ( aSign != bSign ) { 7658 return 7659 aSign 7660 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7661 == 0 ); 7662 } 7663 return 7664 aSign ? le128( b.high, b.low, a.high, a.low ) 7665 : le128( a.high, a.low, b.high, b.low ); 7666 7667 } 7668 7669 /*---------------------------------------------------------------------------- 7670 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7671 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7672 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 7673 | Standard for Binary Floating-Point Arithmetic. 
7674 *----------------------------------------------------------------------------*/ 7675 7676 int float128_lt_quiet(float128 a, float128 b, float_status *status) 7677 { 7678 flag aSign, bSign; 7679 7680 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7681 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7682 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7683 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7684 ) { 7685 if (float128_is_signaling_nan(a, status) 7686 || float128_is_signaling_nan(b, status)) { 7687 float_raise(float_flag_invalid, status); 7688 } 7689 return 0; 7690 } 7691 aSign = extractFloat128Sign( a ); 7692 bSign = extractFloat128Sign( b ); 7693 if ( aSign != bSign ) { 7694 return 7695 aSign 7696 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7697 != 0 ); 7698 } 7699 return 7700 aSign ? lt128( b.high, b.low, a.high, a.low ) 7701 : lt128( a.high, a.low, b.high, b.low ); 7702 7703 } 7704 7705 /*---------------------------------------------------------------------------- 7706 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7707 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 7708 | comparison is performed according to the IEC/IEEE Standard for Binary 7709 | Floating-Point Arithmetic. 
7710 *----------------------------------------------------------------------------*/ 7711 7712 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 7713 { 7714 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7715 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7716 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7717 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7718 ) { 7719 if (float128_is_signaling_nan(a, status) 7720 || float128_is_signaling_nan(b, status)) { 7721 float_raise(float_flag_invalid, status); 7722 } 7723 return 1; 7724 } 7725 return 0; 7726 } 7727 7728 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 7729 int is_quiet, float_status *status) 7730 { 7731 flag aSign, bSign; 7732 7733 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 7734 float_raise(float_flag_invalid, status); 7735 return float_relation_unordered; 7736 } 7737 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 7738 ( extractFloatx80Frac( a )<<1 ) ) || 7739 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 7740 ( extractFloatx80Frac( b )<<1 ) )) { 7741 if (!is_quiet || 7742 floatx80_is_signaling_nan(a, status) || 7743 floatx80_is_signaling_nan(b, status)) { 7744 float_raise(float_flag_invalid, status); 7745 } 7746 return float_relation_unordered; 7747 } 7748 aSign = extractFloatx80Sign( a ); 7749 bSign = extractFloatx80Sign( b ); 7750 if ( aSign != bSign ) { 7751 7752 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 7753 ( ( a.low | b.low ) == 0 ) ) { 7754 /* zero case */ 7755 return float_relation_equal; 7756 } else { 7757 return 1 - (2 * aSign); 7758 } 7759 } else { 7760 if (a.low == b.low && a.high == b.high) { 7761 return float_relation_equal; 7762 } else { 7763 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7764 } 7765 } 7766 } 7767 7768 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7769 { 7770 return floatx80_compare_internal(a, b, 0, status); 7771 } 7772 
7773 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 7774 { 7775 return floatx80_compare_internal(a, b, 1, status); 7776 } 7777 7778 static inline int float128_compare_internal(float128 a, float128 b, 7779 int is_quiet, float_status *status) 7780 { 7781 flag aSign, bSign; 7782 7783 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7784 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7785 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7786 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7787 if (!is_quiet || 7788 float128_is_signaling_nan(a, status) || 7789 float128_is_signaling_nan(b, status)) { 7790 float_raise(float_flag_invalid, status); 7791 } 7792 return float_relation_unordered; 7793 } 7794 aSign = extractFloat128Sign( a ); 7795 bSign = extractFloat128Sign( b ); 7796 if ( aSign != bSign ) { 7797 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7798 /* zero case */ 7799 return float_relation_equal; 7800 } else { 7801 return 1 - (2 * aSign); 7802 } 7803 } else { 7804 if (a.low == b.low && a.high == b.high) { 7805 return float_relation_equal; 7806 } else { 7807 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7808 } 7809 } 7810 } 7811 7812 int float128_compare(float128 a, float128 b, float_status *status) 7813 { 7814 return float128_compare_internal(a, b, 0, status); 7815 } 7816 7817 int float128_compare_quiet(float128 a, float128 b, float_status *status) 7818 { 7819 return float128_compare_internal(a, b, 1, status); 7820 } 7821 7822 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7823 { 7824 flag aSign; 7825 int32_t aExp; 7826 uint64_t aSig; 7827 7828 if (floatx80_invalid_encoding(a)) { 7829 float_raise(float_flag_invalid, status); 7830 return floatx80_default_nan(status); 7831 } 7832 aSig = extractFloatx80Frac( a ); 7833 aExp = extractFloatx80Exp( a ); 7834 aSign = extractFloatx80Sign( a ); 7835 7836 if ( aExp == 0x7FFF ) { 7837 if ( aSig<<1 ) { 7838 return 
propagateFloatx80NaN(a, a, status); 7839 } 7840 return a; 7841 } 7842 7843 if (aExp == 0) { 7844 if (aSig == 0) { 7845 return a; 7846 } 7847 aExp++; 7848 } 7849 7850 if (n > 0x10000) { 7851 n = 0x10000; 7852 } else if (n < -0x10000) { 7853 n = -0x10000; 7854 } 7855 7856 aExp += n; 7857 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7858 aSign, aExp, aSig, 0, status); 7859 } 7860 7861 float128 float128_scalbn(float128 a, int n, float_status *status) 7862 { 7863 flag aSign; 7864 int32_t aExp; 7865 uint64_t aSig0, aSig1; 7866 7867 aSig1 = extractFloat128Frac1( a ); 7868 aSig0 = extractFloat128Frac0( a ); 7869 aExp = extractFloat128Exp( a ); 7870 aSign = extractFloat128Sign( a ); 7871 if ( aExp == 0x7FFF ) { 7872 if ( aSig0 | aSig1 ) { 7873 return propagateFloat128NaN(a, a, status); 7874 } 7875 return a; 7876 } 7877 if (aExp != 0) { 7878 aSig0 |= LIT64( 0x0001000000000000 ); 7879 } else if (aSig0 == 0 && aSig1 == 0) { 7880 return a; 7881 } else { 7882 aExp++; 7883 } 7884 7885 if (n > 0x10000) { 7886 n = 0x10000; 7887 } else if (n < -0x10000) { 7888 n = -0x10000; 7889 } 7890 7891 aExp += n - 1; 7892 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7893 , status); 7894 7895 } 7896