1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include <math.h> 87 #include "qemu/bitops.h" 88 #include "fpu/softfloat.h" 89 90 /* We only need stdlib for abort() */ 91 92 /*---------------------------------------------------------------------------- 93 | Primitive arithmetic functions, including multi-word arithmetic, and 94 | division and square root approximations. (Can be specialized to target if 95 | desired.) 96 *----------------------------------------------------------------------------*/ 97 #include "fpu/softfloat-macros.h" 98 99 /* 100 * Hardfloat 101 * 102 * Fast emulation of guest FP instructions is challenging for two reasons. 103 * First, FP instruction semantics are similar but not identical, particularly 104 * when handling NaNs. Second, emulating at reasonable speed the guest FP 105 * exception flags is not trivial: reading the host's flags register with a 106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp], 107 * and trapping on every FP exception is not fast nor pleasant to work with. 108 * 109 * We address these challenges by leveraging the host FPU for a subset of the 110 * operations. 
To do this we expand on the idea presented in this paper: 111 * 112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a 113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615. 114 * 115 * The idea is thus to leverage the host FPU to (1) compute FP operations 116 * and (2) identify whether FP exceptions occurred while avoiding 117 * expensive exception flag register accesses. 118 * 119 * An important optimization shown in the paper is that given that exception 120 * flags are rarely cleared by the guest, we can avoid recomputing some flags. 121 * This is particularly useful for the inexact flag, which is very frequently 122 * raised in floating-point workloads. 123 * 124 * We optimize the code further by deferring to soft-fp whenever FP exception 125 * detection might get hairy. Two examples: (1) when at least one operand is 126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result 127 * and the result is < the minimum normal. 
128 */ 129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \ 130 static inline void name(soft_t *a, float_status *s) \ 131 { \ 132 if (unlikely(soft_t ## _is_denormal(*a))) { \ 133 *a = soft_t ## _set_sign(soft_t ## _zero, \ 134 soft_t ## _is_neg(*a)); \ 135 s->float_exception_flags |= float_flag_input_denormal; \ 136 } \ 137 } 138 139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32) 140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64) 141 #undef GEN_INPUT_FLUSH__NOCHECK 142 143 #define GEN_INPUT_FLUSH1(name, soft_t) \ 144 static inline void name(soft_t *a, float_status *s) \ 145 { \ 146 if (likely(!s->flush_inputs_to_zero)) { \ 147 return; \ 148 } \ 149 soft_t ## _input_flush__nocheck(a, s); \ 150 } 151 152 GEN_INPUT_FLUSH1(float32_input_flush1, float32) 153 GEN_INPUT_FLUSH1(float64_input_flush1, float64) 154 #undef GEN_INPUT_FLUSH1 155 156 #define GEN_INPUT_FLUSH2(name, soft_t) \ 157 static inline void name(soft_t *a, soft_t *b, float_status *s) \ 158 { \ 159 if (likely(!s->flush_inputs_to_zero)) { \ 160 return; \ 161 } \ 162 soft_t ## _input_flush__nocheck(a, s); \ 163 soft_t ## _input_flush__nocheck(b, s); \ 164 } 165 166 GEN_INPUT_FLUSH2(float32_input_flush2, float32) 167 GEN_INPUT_FLUSH2(float64_input_flush2, float64) 168 #undef GEN_INPUT_FLUSH2 169 170 #define GEN_INPUT_FLUSH3(name, soft_t) \ 171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \ 172 { \ 173 if (likely(!s->flush_inputs_to_zero)) { \ 174 return; \ 175 } \ 176 soft_t ## _input_flush__nocheck(a, s); \ 177 soft_t ## _input_flush__nocheck(b, s); \ 178 soft_t ## _input_flush__nocheck(c, s); \ 179 } 180 181 GEN_INPUT_FLUSH3(float32_input_flush3, float32) 182 GEN_INPUT_FLUSH3(float64_input_flush3, float64) 183 #undef GEN_INPUT_FLUSH3 184 185 /* 186 * Choose whether to use fpclassify or float32/64_* primitives in the generated 187 * hardfloat functions. Each combination of number of inputs and float size 188 * gets its own value. 
189 */ 190 #if defined(__x86_64__) 191 # define QEMU_HARDFLOAT_1F32_USE_FP 0 192 # define QEMU_HARDFLOAT_1F64_USE_FP 1 193 # define QEMU_HARDFLOAT_2F32_USE_FP 0 194 # define QEMU_HARDFLOAT_2F64_USE_FP 1 195 # define QEMU_HARDFLOAT_3F32_USE_FP 0 196 # define QEMU_HARDFLOAT_3F64_USE_FP 1 197 #else 198 # define QEMU_HARDFLOAT_1F32_USE_FP 0 199 # define QEMU_HARDFLOAT_1F64_USE_FP 0 200 # define QEMU_HARDFLOAT_2F32_USE_FP 0 201 # define QEMU_HARDFLOAT_2F64_USE_FP 0 202 # define QEMU_HARDFLOAT_3F32_USE_FP 0 203 # define QEMU_HARDFLOAT_3F64_USE_FP 0 204 #endif 205 206 /* 207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over 208 * float{32,64}_is_infinity when !USE_FP. 209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup. 210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%. 211 */ 212 #if defined(__x86_64__) || defined(__aarch64__) 213 # define QEMU_HARDFLOAT_USE_ISINF 1 214 #else 215 # define QEMU_HARDFLOAT_USE_ISINF 0 216 #endif 217 218 /* 219 * Some targets clear the FP flags before most FP operations. This prevents 220 * the use of hardfloat, since hardfloat relies on the inexact flag being 221 * already set. 222 */ 223 #if defined(TARGET_PPC) || defined(__FAST_MATH__) 224 # if defined(__FAST_MATH__) 225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \ 226 IEEE implementation 227 # endif 228 # define QEMU_NO_HARDFLOAT 1 229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN 230 #else 231 # define QEMU_NO_HARDFLOAT 0 232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline)) 233 #endif 234 235 static inline bool can_use_fpu(const float_status *s) 236 { 237 if (QEMU_NO_HARDFLOAT) { 238 return false; 239 } 240 return likely(s->float_exception_flags & float_flag_inexact && 241 s->float_rounding_mode == float_round_nearest_even); 242 } 243 244 /* 245 * Hardfloat generation functions. Each operation can have two flavors: 246 * either using softfloat primitives (e.g. 
float32_is_zero_or_normal) for 247 * most condition checks, or native ones (e.g. fpclassify). 248 * 249 * The flavor is chosen by the callers. Instead of using macros, we rely on the 250 * compiler to propagate constants and inline everything into the callers. 251 * 252 * We only generate functions for operations with two inputs, since only 253 * these are common enough to justify consolidating them into common code. 254 */ 255 256 typedef union { 257 float32 s; 258 float h; 259 } union_float32; 260 261 typedef union { 262 float64 s; 263 double h; 264 } union_float64; 265 266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b); 267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b); 268 269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s); 270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s); 271 typedef float (*hard_f32_op2_fn)(float a, float b); 272 typedef double (*hard_f64_op2_fn)(double a, double b); 273 274 /* 2-input is-zero-or-normal */ 275 static inline bool f32_is_zon2(union_float32 a, union_float32 b) 276 { 277 if (QEMU_HARDFLOAT_2F32_USE_FP) { 278 /* 279 * Not using a temp variable for consecutive fpclassify calls ends up 280 * generating faster code. 
281 */ 282 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 283 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO); 284 } 285 return float32_is_zero_or_normal(a.s) && 286 float32_is_zero_or_normal(b.s); 287 } 288 289 static inline bool f64_is_zon2(union_float64 a, union_float64 b) 290 { 291 if (QEMU_HARDFLOAT_2F64_USE_FP) { 292 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 293 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO); 294 } 295 return float64_is_zero_or_normal(a.s) && 296 float64_is_zero_or_normal(b.s); 297 } 298 299 /* 3-input is-zero-or-normal */ 300 static inline 301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c) 302 { 303 if (QEMU_HARDFLOAT_3F32_USE_FP) { 304 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 305 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) && 306 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO); 307 } 308 return float32_is_zero_or_normal(a.s) && 309 float32_is_zero_or_normal(b.s) && 310 float32_is_zero_or_normal(c.s); 311 } 312 313 static inline 314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c) 315 { 316 if (QEMU_HARDFLOAT_3F64_USE_FP) { 317 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 318 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) && 319 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO); 320 } 321 return float64_is_zero_or_normal(a.s) && 322 float64_is_zero_or_normal(b.s) && 323 float64_is_zero_or_normal(c.s); 324 } 325 326 static inline bool f32_is_inf(union_float32 a) 327 { 328 if (QEMU_HARDFLOAT_USE_ISINF) { 329 return isinf(a.h); 330 } 331 return float32_is_infinity(a.s); 332 } 333 334 static inline bool f64_is_inf(union_float64 a) 335 { 336 if (QEMU_HARDFLOAT_USE_ISINF) { 337 return isinf(a.h); 338 } 339 return float64_is_infinity(a.s); 340 } 341 342 /* Note: @fast_test and @post can be NULL */ 343 static inline 
float32 344 float32_gen2(float32 xa, float32 xb, float_status *s, 345 hard_f32_op2_fn hard, soft_f32_op2_fn soft, 346 f32_check_fn pre, f32_check_fn post, 347 f32_check_fn fast_test, soft_f32_op2_fn fast_op) 348 { 349 union_float32 ua, ub, ur; 350 351 ua.s = xa; 352 ub.s = xb; 353 354 if (unlikely(!can_use_fpu(s))) { 355 goto soft; 356 } 357 358 float32_input_flush2(&ua.s, &ub.s, s); 359 if (unlikely(!pre(ua, ub))) { 360 goto soft; 361 } 362 if (fast_test && fast_test(ua, ub)) { 363 return fast_op(ua.s, ub.s, s); 364 } 365 366 ur.h = hard(ua.h, ub.h); 367 if (unlikely(f32_is_inf(ur))) { 368 s->float_exception_flags |= float_flag_overflow; 369 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) { 370 if (post == NULL || post(ua, ub)) { 371 goto soft; 372 } 373 } 374 return ur.s; 375 376 soft: 377 return soft(ua.s, ub.s, s); 378 } 379 380 static inline float64 381 float64_gen2(float64 xa, float64 xb, float_status *s, 382 hard_f64_op2_fn hard, soft_f64_op2_fn soft, 383 f64_check_fn pre, f64_check_fn post, 384 f64_check_fn fast_test, soft_f64_op2_fn fast_op) 385 { 386 union_float64 ua, ub, ur; 387 388 ua.s = xa; 389 ub.s = xb; 390 391 if (unlikely(!can_use_fpu(s))) { 392 goto soft; 393 } 394 395 float64_input_flush2(&ua.s, &ub.s, s); 396 if (unlikely(!pre(ua, ub))) { 397 goto soft; 398 } 399 if (fast_test && fast_test(ua, ub)) { 400 return fast_op(ua.s, ub.s, s); 401 } 402 403 ur.h = hard(ua.h, ub.h); 404 if (unlikely(f64_is_inf(ur))) { 405 s->float_exception_flags |= float_flag_overflow; 406 } else if (unlikely(fabs(ur.h) <= DBL_MIN)) { 407 if (post == NULL || post(ua, ub)) { 408 goto soft; 409 } 410 } 411 return ur.s; 412 413 soft: 414 return soft(ua.s, ub.s, s); 415 } 416 417 /*---------------------------------------------------------------------------- 418 | Returns the fraction bits of the half-precision floating-point value `a'. 
419 *----------------------------------------------------------------------------*/ 420 421 static inline uint32_t extractFloat16Frac(float16 a) 422 { 423 return float16_val(a) & 0x3ff; 424 } 425 426 /*---------------------------------------------------------------------------- 427 | Returns the exponent bits of the half-precision floating-point value `a'. 428 *----------------------------------------------------------------------------*/ 429 430 static inline int extractFloat16Exp(float16 a) 431 { 432 return (float16_val(a) >> 10) & 0x1f; 433 } 434 435 /*---------------------------------------------------------------------------- 436 | Returns the fraction bits of the single-precision floating-point value `a'. 437 *----------------------------------------------------------------------------*/ 438 439 static inline uint32_t extractFloat32Frac(float32 a) 440 { 441 return float32_val(a) & 0x007FFFFF; 442 } 443 444 /*---------------------------------------------------------------------------- 445 | Returns the exponent bits of the single-precision floating-point value `a'. 446 *----------------------------------------------------------------------------*/ 447 448 static inline int extractFloat32Exp(float32 a) 449 { 450 return (float32_val(a) >> 23) & 0xFF; 451 } 452 453 /*---------------------------------------------------------------------------- 454 | Returns the sign bit of the single-precision floating-point value `a'. 455 *----------------------------------------------------------------------------*/ 456 457 static inline flag extractFloat32Sign(float32 a) 458 { 459 return float32_val(a) >> 31; 460 } 461 462 /*---------------------------------------------------------------------------- 463 | Returns the fraction bits of the double-precision floating-point value `a'. 
464 *----------------------------------------------------------------------------*/ 465 466 static inline uint64_t extractFloat64Frac(float64 a) 467 { 468 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF); 469 } 470 471 /*---------------------------------------------------------------------------- 472 | Returns the exponent bits of the double-precision floating-point value `a'. 473 *----------------------------------------------------------------------------*/ 474 475 static inline int extractFloat64Exp(float64 a) 476 { 477 return (float64_val(a) >> 52) & 0x7FF; 478 } 479 480 /*---------------------------------------------------------------------------- 481 | Returns the sign bit of the double-precision floating-point value `a'. 482 *----------------------------------------------------------------------------*/ 483 484 static inline flag extractFloat64Sign(float64 a) 485 { 486 return float64_val(a) >> 63; 487 } 488 489 /* 490 * Classify a floating point number. Everything above float_class_qnan 491 * is a NaN so cls >= float_class_qnan is any NaN. 492 */ 493 494 typedef enum __attribute__ ((__packed__)) { 495 float_class_unclassified, 496 float_class_zero, 497 float_class_normal, 498 float_class_inf, 499 float_class_qnan, /* all NaNs from here */ 500 float_class_snan, 501 } FloatClass; 502 503 /* Simple helpers for checking if, or what kind of, NaN we have */ 504 static inline __attribute__((unused)) bool is_nan(FloatClass c) 505 { 506 return unlikely(c >= float_class_qnan); 507 } 508 509 static inline __attribute__((unused)) bool is_snan(FloatClass c) 510 { 511 return c == float_class_snan; 512 } 513 514 static inline __attribute__((unused)) bool is_qnan(FloatClass c) 515 { 516 return c == float_class_qnan; 517 } 518 519 /* 520 * Structure holding all of the decomposed parts of a float. The 521 * exponent is unbiased and the fraction is normalized. All 522 * calculations are done with a 64 bit fraction and then rounded as 523 * appropriate for the final format. 
524 * 525 * Thanks to the packed FloatClass a decent compiler should be able to 526 * fit the whole structure into registers and avoid using the stack 527 * for parameter passing. 528 */ 529 530 typedef struct { 531 uint64_t frac; 532 int32_t exp; 533 FloatClass cls; 534 bool sign; 535 } FloatParts; 536 537 #define DECOMPOSED_BINARY_POINT (64 - 2) 538 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) 539 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1) 540 541 /* Structure holding all of the relevant parameters for a format. 542 * exp_size: the size of the exponent field 543 * exp_bias: the offset applied to the exponent field 544 * exp_max: the maximum normalised exponent 545 * frac_size: the size of the fraction field 546 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 547 * The following are computed based the size of fraction 548 * frac_lsb: least significant bit of fraction 549 * frac_lsbm1: the bit below the least significant bit (for rounding) 550 * round_mask/roundeven_mask: masks used for rounding 551 * The following optional modifiers are available: 552 * arm_althp: handle ARM Alternative Half Precision 553 */ 554 typedef struct { 555 int exp_size; 556 int exp_bias; 557 int exp_max; 558 int frac_size; 559 int frac_shift; 560 uint64_t frac_lsb; 561 uint64_t frac_lsbm1; 562 uint64_t round_mask; 563 uint64_t roundeven_mask; 564 bool arm_althp; 565 } FloatFmt; 566 567 /* Expand fields based on the size of exponent and fraction */ 568 #define FLOAT_PARAMS(E, F) \ 569 .exp_size = E, \ 570 .exp_bias = ((1 << E) - 1) >> 1, \ 571 .exp_max = (1 << E) - 1, \ 572 .frac_size = F, \ 573 .frac_shift = DECOMPOSED_BINARY_POINT - F, \ 574 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \ 575 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \ 576 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \ 577 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1 578 579 static const FloatFmt 
float16_params = { 580 FLOAT_PARAMS(5, 10) 581 }; 582 583 static const FloatFmt float16_params_ahp = { 584 FLOAT_PARAMS(5, 10), 585 .arm_althp = true 586 }; 587 588 static const FloatFmt float32_params = { 589 FLOAT_PARAMS(8, 23) 590 }; 591 592 static const FloatFmt float64_params = { 593 FLOAT_PARAMS(11, 52) 594 }; 595 596 /* Unpack a float to parts, but do not canonicalize. */ 597 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) 598 { 599 const int sign_pos = fmt.frac_size + fmt.exp_size; 600 601 return (FloatParts) { 602 .cls = float_class_unclassified, 603 .sign = extract64(raw, sign_pos, 1), 604 .exp = extract64(raw, fmt.frac_size, fmt.exp_size), 605 .frac = extract64(raw, 0, fmt.frac_size), 606 }; 607 } 608 609 static inline FloatParts float16_unpack_raw(float16 f) 610 { 611 return unpack_raw(float16_params, f); 612 } 613 614 static inline FloatParts float32_unpack_raw(float32 f) 615 { 616 return unpack_raw(float32_params, f); 617 } 618 619 static inline FloatParts float64_unpack_raw(float64 f) 620 { 621 return unpack_raw(float64_params, f); 622 } 623 624 /* Pack a float from parts, but do not canonicalize. 
*/
/*
 * The exponent is deposited into p.frac at bit position frac_size (width
 * exp_size) and the sign at bit frac_size + exp_size.  p.exp is expected to
 * be biased already and p.frac to be shifted down to the format's width.
 * NOTE(review): relies on deposit64() from qemu/bitops.h replacing exactly
 * the named bit field - confirm against that header.
 */
static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
{
    const int sign_pos = fmt.frac_size + fmt.exp_size;
    uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
    return deposit64(ret, sign_pos, 1, p.sign);
}

/* Per-format wrappers around pack_raw(). */
static inline float16 float16_pack_raw(FloatParts p)
{
    return make_float16(pack_raw(float16_params, p));
}

static inline float32 float32_pack_raw(FloatParts p)
{
    return make_float32(pack_raw(float32_params, p));
}

static inline float64 float64_pack_raw(FloatParts p)
{
    return make_float64(pack_raw(float64_params, p));
}

/*----------------------------------------------------------------------------
| Functions and definitions to determine:  (1) whether tininess for underflow
| is detected before or after rounding by default, (2) what (if anything)
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
| are propagated from function inputs to output.  These details are target-
| specific.
*----------------------------------------------------------------------------*/
#include "softfloat-specialize.h"

/*
 * Canonicalize EXP and FRAC, setting CLS.
 *
 * @part is a raw unpacked value (biased exponent, unshifted fraction).
 * On return the class is set and, for normal numbers, the exponent is
 * unbiased and the fraction carries its leading one at
 * DECOMPOSED_BINARY_POINT.  A denormal input is either flushed to zero
 * (raising float_flag_input_denormal) when status->flush_inputs_to_zero is
 * set, or normalized and classified as float_class_normal.
 */
static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
                                  float_status *status)
{
    /* With arm_althp the all-ones exponent encodes ordinary numbers. */
    if (part.exp == parm->exp_max && !parm->arm_althp) {
        if (part.frac == 0) {
            part.cls = float_class_inf;
        } else {
            /* NaN: the payload is classified after shifting it up. */
            part.frac <<= parm->frac_shift;
            part.cls = (parts_is_snan_frac(part.frac, status)
                        ? float_class_snan : float_class_qnan);
        }
    } else if (part.exp == 0) {
        if (likely(part.frac == 0)) {
            part.cls = float_class_zero;
        } else if (status->flush_inputs_to_zero) {
            float_raise(float_flag_input_denormal, status);
            part.cls = float_class_zero;
            part.frac = 0;
        } else {
            /*
             * Denormal: shift so that exactly one leading zero bit remains,
             * i.e. the top set bit lands on DECOMPOSED_BINARY_POINT, and
             * compensate in the exponent.
             */
            int shift = clz64(part.frac) - 1;
            part.cls = float_class_normal;
            part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
            part.frac <<= shift;
        }
    } else {
        /* Normal: remove the bias and make the implicit bit explicit. */
        part.cls = float_class_normal;
        part.exp -= parm->exp_bias;
        part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
    }
    return part;
}

/* Round and uncanonicalize a floating-point number by parts. There
 * are FRAC_SHIFT bits that may require rounding at the bottom of the
 * fraction; these bits will be removed. The exponent will be biased
 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 *
 * Exception flags gathered during rounding are raised on @s via
 * float_raise() just before returning.  The returned parts are in raw form,
 * ready to be handed to pack_raw().
 */

static FloatParts round_canonical(FloatParts p, float_status *s,
                                  const FloatFmt *parm)
{
    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
    const uint64_t round_mask = parm->round_mask;
    const uint64_t roundeven_mask = parm->roundeven_mask;
    const int exp_max = parm->exp_max;
    const int frac_shift = parm->frac_shift;
    uint64_t frac, inc;
    int exp, flags = 0;
    bool overflow_norm;

    frac = p.frac;
    exp = p.exp;

    switch (p.cls) {
    case float_class_normal:
        /*
         * Choose the rounding increment (inc), and whether an overflow
         * yields the largest normal (overflow_norm) instead of infinity.
         */
        switch (s->float_rounding_mode) {
        case float_round_nearest_even:
            overflow_norm = false;
            /*
             * Add half an lsb, except on an exact tie whose lsb is already
             * even (lsb clear, discarded bits exactly one half).
             */
            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
            break;
        case float_round_ties_away:
            overflow_norm = false;
            inc = frac_lsbm1;
            break;
        case float_round_to_zero:
            overflow_norm = true;
            inc = 0;
            break;
        case float_round_up:
            inc = p.sign ? 0 : round_mask;
            overflow_norm = p.sign;
            break;
        case float_round_down:
            inc = p.sign ? round_mask : 0;
            overflow_norm = !p.sign;
            break;
        default:
            g_assert_not_reached();
        }

        exp += parm->exp_bias;
        if (likely(exp > 0)) {
            /* Biased exponent is positive: result is normal or overflows. */
            if (frac & round_mask) {
                flags |= float_flag_inexact;
                frac += inc;
                /* Rounding carried out of the implicit bit: renormalize. */
                if (frac & DECOMPOSED_OVERFLOW_BIT) {
                    frac >>= 1;
                    exp++;
                }
            }
            frac >>= frac_shift;

            if (parm->arm_althp) {
                /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
                if (unlikely(exp > exp_max)) {
                    /*
                     * Overflow.  Return the maximum normal.  Note that the
                     * flags are replaced here, not accumulated.
                     */
                    flags = float_flag_invalid;
                    exp = exp_max;
                    frac = -1;
                }
            } else if (unlikely(exp >= exp_max)) {
                flags |= float_flag_overflow | float_flag_inexact;
                if (overflow_norm) {
                    /* Largest finite value: all fraction bits set. */
                    exp = exp_max - 1;
                    frac = -1;
                } else {
                    p.cls = float_class_inf;
                    goto do_inf;
                }
            }
        } else if (s->flush_to_zero) {
            /* Result would be denormal and output flushing is enabled. */
            flags |= float_flag_output_denormal;
            p.cls = float_class_zero;
            goto do_zero;
        } else {
            /*
             * Denormal result.  It counts as tiny when tininess is detected
             * before rounding, when the biased exponent is negative, or when
             * adding the increment would not carry into the overflow bit.
             */
            bool is_tiny = (s->float_detect_tininess
                            == float_tininess_before_rounding)
                        || (exp < 0)
                        || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);

            shift64RightJamming(frac, 1 - exp, &frac);
            if (frac & round_mask) {
                /* Need to recompute round-to-even.  */
                if (s->float_rounding_mode == float_round_nearest_even) {
                    inc = ((frac & roundeven_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                }
                flags |= float_flag_inexact;
                frac += inc;
            }

            /* Rounding may have produced the smallest normal number. */
            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
            frac >>= frac_shift;

            /* Underflow is signalled only for tiny and inexact results. */
            if (is_tiny && (flags & float_flag_inexact)) {
                flags |= float_flag_underflow;
            }
            if (exp == 0 && frac == 0) {
                p.cls = float_class_zero;
            }
        }
        break;

    case float_class_zero:
    do_zero:
        exp = 0;
        frac = 0;
        break;

    case float_class_inf:
    do_inf:
        /* The alternative half-precision format has no infinity encoding. */
        assert(!parm->arm_althp);
        exp = exp_max;
        frac = 0;
        break;

    case float_class_qnan:
    case float_class_snan:
        /* NaNs keep their payload; it is only shifted back down. */
        assert(!parm->arm_althp);
        exp = exp_max;
        frac >>= parm->frac_shift;
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    p.exp = exp;
    p.frac = frac;
    return p;
}

/* Explicit FloatFmt version */
static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
                                            const FloatFmt *params)
{
    return sf_canonicalize(float16_unpack_raw(f), params, s);
}

/* Unpack and canonicalize a float16 using the default float16_params. */
static FloatParts float16_unpack_canonical(float16 f, float_status *s)
{
    return float16a_unpack_canonical(f, s, &float16_params);
}

/* Round and pack to float16 with an explicit FloatFmt. */
static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
                                             const FloatFmt *params)
{
    return float16_pack_raw(round_canonical(p, s, params));
}

/* Round and pack to float16 using the default float16_params. */
static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
{
    return float16a_round_pack_canonical(p, s, &float16_params);
}

/* Unpack and canonicalize a float32. */
static FloatParts float32_unpack_canonical(float32 f, float_status *s)
{
    return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
}

/* Round and pack canonical parts into a float32. */
static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
{
    return float32_pack_raw(round_canonical(p, s, &float32_params));
}

/* Unpack and canonicalize a float64. */
static FloatParts float64_unpack_canonical(float64 f, float_status *s)
{
    return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
}

/* Round and pack canonical parts into a float64. */
static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
{
    return float64_pack_raw(round_canonical(p, s, &float64_params));
}

/*
 * Produce the NaN result for a single NaN operand @a.  A signaling NaN
 * raises float_flag_invalid and is silenced; in default_nan_mode the
 * default NaN is returned instead of @a.  @a must be a NaN.
 */
static FloatParts return_nan(FloatParts a, float_status *s)
{
    switch (a.cls) {
    case float_class_snan:
        s->float_exception_flags |= float_flag_invalid;
        a = parts_silence_nan(a, s);
        /* fall through */
    case float_class_qnan:
        if (s->default_nan_mode) {
            return parts_default_nan(s);
        }
        break;

    default:
        g_assert_not_reached();
    }
    return a;
}

/*
 * Choose the NaN result of a two-operand operation.  float_flag_invalid is
 * raised if either operand is a signaling NaN.  In default_nan_mode the
 * default NaN is returned; otherwise pickNaN() decides between @a and @b
 * (a non-zero return selects @b) and the chosen NaN is silenced if needed.
 */
static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
{
    if (is_snan(a.cls) || is_snan(b.cls)) {
        s->float_exception_flags |= float_flag_invalid;
    }

    if (s->default_nan_mode) {
        return parts_default_nan(s);
    } else {
        /*
         * The last argument is true when a has the larger fraction, or when
         * the fractions are equal and a is positive while b is negative.
         */
        if (pickNaN(a.cls, b.cls,
                    a.frac > b.frac ||
                    (a.frac == b.frac && a.sign < b.sign))) {
            a = b;
        }
        if (is_snan(a.cls)) {
            return parts_silence_nan(a, s);
        }
    }
    return a;
}

/*
 * Choose the NaN result of a fused multiply-add.  float_flag_invalid is
 * raised if any operand is a signaling NaN.  pickNaNMulAdd() returns which
 * operand to propagate: 0 for @a, 1 for @b, 2 for @c, 3 for the default
 * NaN.  default_nan_mode forces the default NaN.
 */
static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
                                  bool inf_zero, float_status *s)
{
    int which;

    if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
        s->float_exception_flags |= float_flag_invalid;
    }

    which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);

    if (s->default_nan_mode) {
        /* Note that this check is after pickNaNMulAdd so that function
         * has an opportunity to set the Invalid flag.
         */
        which = 3;
    }

    switch (which) {
    case 0:
        break;
    case 1:
        a = b;
        break;
    case 2:
        a = c;
        break;
    case 3:
        return parts_default_nan(s);
    default:
        g_assert_not_reached();
    }

    if (is_snan(a.cls)) {
        return parts_silence_nan(a, s);
    }
    return a;
}

/*
 * Returns the result of adding or subtracting the values of the
 * floating-point values `a' and `b'.
The operation is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic.
 */

/*
 * addsub_floats works on decomposed operands.  When `subtract' is true
 * the sign of `b' is inverted before the operation.  Operands whose
 * effective signs differ take the magnitude-subtraction path, operands
 * whose effective signs agree take the magnitude-addition path.  The
 * result is returned decomposed; the callers below round and pack it.
 */
static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
                                float_status *s)
{
    bool a_sign = a.sign;
    bool b_sign = b.sign ^ subtract;

    if (a_sign != b_sign) {
        /* Subtraction */

        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Subtract the smaller magnitude from the larger one. */
            if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
                a.frac = a.frac - b.frac;
            } else {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.frac = b.frac - a.frac;
                a.exp = b.exp;
                /* |b| was larger, so the result takes the opposite sign */
                a_sign ^= 1;
            }

            if (a.frac == 0) {
                /* Exact cancellation: zero is negative only when
                 * rounding down.
                 */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
            } else {
                /* Renormalize, leaving one clear bit above the msb. */
                int shift = clz64(a.frac) - 1;
                a.frac = a.frac << shift;
                a.exp = a.exp - shift;
                a.sign = a_sign;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf) {
            if (b.cls == float_class_inf) {
                /* Inf - Inf is invalid */
                float_raise(float_flag_invalid, s);
                return parts_default_nan(s);
            }
            return a;
        }
        if (a.cls == float_class_zero && b.cls == float_class_zero) {
            a.sign = s->float_rounding_mode == float_round_down;
            return a;
        }
        if (a.cls == float_class_zero || b.cls == float_class_inf) {
            /* Result is `b' with its effective sign (opposite of a's). */
            b.sign = a_sign ^ 1;
            return b;
        }
        if (b.cls == float_class_zero) {
            return a;
        }
    } else {
        /* Addition */
        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Align the operand with the smaller exponent. */
            if (a.exp > b.exp) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
            } else if (a.exp < b.exp) {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.exp = b.exp;
            }
            a.frac += b.frac;
            /* A carry into the overflow bit needs one renormalizing shift. */
            if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
                shift64RightJamming(a.frac, 1, &a.frac);
                a.exp += 1;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf || b.cls == float_class_zero) {
            return a;
        }
        if (b.cls == float_class_inf || a.cls == float_class_zero) {
            b.sign = b_sign;
            return b;
        }
    }
    /* Every class combination returns above. */
    g_assert_not_reached();
}

/*
 * Returns the result of adding or subtracting the floating-point
 * values `a' and `b'. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

/* Half precision: unpack, add, then round and repack. */
float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
{
    FloatParts pa = float16_unpack_canonical(a, status);
    FloatParts pb = float16_unpack_canonical(b, status);
    FloatParts pr = addsub_floats(pa, pb, false, status);

    return float16_round_pack_canonical(pr, status);
}

/* Half precision: unpack, subtract, then round and repack. */
float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
{
    FloatParts pa = float16_unpack_canonical(a, status);
    FloatParts pb = float16_unpack_canonical(b, status);
    FloatParts pr = addsub_floats(pa, pb, true, status);

    return float16_round_pack_canonical(pr, status);
}

/* Software single-precision add (subtract == false) or subtract. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
{
    FloatParts pa = float32_unpack_canonical(a, status);
    FloatParts pb = float32_unpack_canonical(b, status);
    FloatParts pr = addsub_floats(pa, pb, subtract, status);

    return float32_round_pack_canonical(pr, status);
}

static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, false, status);
}

static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, true, status);
}

/* Software double-precision add (subtract == false) or subtract. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
{
    FloatParts pa = float64_unpack_canonical(a, status);
    FloatParts pb = float64_unpack_canonical(b, status);
    FloatParts pr = addsub_floats(pa, pb, subtract, status);

    return float64_round_pack_canonical(pr, status);
}

static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, false, status);
}

static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, true, status);
}

/* Host floating-point implementations, passed as the `hard' callback. */
static float hard_f32_add(float a, float b)
{
    return a + b;
}

static float hard_f32_sub(float a, float b)
{
    return a - b;
}

static double hard_f64_add(double a, double b)
{
    return a + b;
}

static double hard_f64_sub(double a, double b)
{
    return a - b;
}

/*
 * Returns false only when both operands are zero.  Passed as the `post'
 * callback of float32_gen2.
 * NOTE(review): how float32_gen2 uses the result is not visible here;
 * confirm against its definition.
 */
static bool f32_addsub_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
    }
    return !(float32_is_zero(a.s) && float32_is_zero(b.s));
}

/* Double-precision counterpart of f32_addsub_post. */
static bool f64_addsub_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
    } else {
        return !(float64_is_zero(a.s) && float64_is_zero(b.s));
    }
}

/* Dispatch a single-precision add/sub through float32_gen2. */
static float32 float32_addsub(float32 a, float32 b, float_status *s,
                              hard_f32_op2_fn hard, soft_f32_op2_fn soft)
{
    return float32_gen2(a, b, s, hard, soft,
                        f32_is_zon2, f32_addsub_post, NULL, NULL);
}

/* Dispatch a double-precision add/sub through float64_gen2. */
static float64 float64_addsub(float64 a, float64 b, float_status *s,
                              hard_f64_op2_fn hard, soft_f64_op2_fn soft)
{
    return float64_gen2(a, b, s, hard, soft,
                        f64_is_zon2, f64_addsub_post, NULL, NULL);
}

float32 QEMU_FLATTEN
float32_add(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
}

float32 QEMU_FLATTEN
float32_sub(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
}

float64 QEMU_FLATTEN
float64_add(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
}

float64 QEMU_FLATTEN
float64_sub(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
}

/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b'. The operation is performed according to the IEC/IEEE Standard
 * for Binary Floating-Point Arithmetic.
 */

static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
{
    /* The sign of the product is the XOR of the operand signs. */
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t hi, lo;
        int exp = a.exp + b.exp;

        /* Full 128-bit product, then shifted right by
         * DECOMPOSED_BINARY_POINT so the 64-bit result is in `lo'.
         */
        mul64To128(a.frac, b.frac, &hi, &lo);
        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
        if (lo & DECOMPOSED_OVERFLOW_BIT) {
            shift64RightJamming(lo, 1, &lo);
            exp += 1;
        }

        /* Re-use a */
        a.exp = exp;
        a.sign = sign;
        a.frac = lo;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return pick_nan(a, b, s);
    }
    /* Inf * Zero == NaN */
    if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
        (a.cls == float_class_zero && b.cls == float_class_inf)) {
        s->float_exception_flags |= float_flag_invalid;
        return parts_default_nan(s);
    }
    /* Multiply by 0 or Inf */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    if (b.cls == float_class_inf || b.cls == float_class_zero) {
        b.sign = sign;
        return b;
    }
    g_assert_not_reached();
}

/* Half precision: unpack, multiply, then round and repack. */
float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
{
    FloatParts pa = float16_unpack_canonical(a, status);
    FloatParts pb = float16_unpack_canonical(b, status);
    FloatParts pr = mul_floats(pa, pb, status);

    return float16_round_pack_canonical(pr, status);
}

/* Single precision: unpack, multiply, then round and repack. */
float32 QEMU_FLATTEN float32_mul(float32 a, float32 b, float_status *status)
{
    FloatParts pa = float32_unpack_canonical(a, status);
    FloatParts pb = float32_unpack_canonical(b, status);
    FloatParts pr = mul_floats(pa, pb, status);

    return float32_round_pack_canonical(pr, status);
}

/* Double precision: unpack, multiply, then round and repack. */
float64 QEMU_FLATTEN float64_mul(float64 a, float64 b, float_status *status)
{
    FloatParts pa = float64_unpack_canonical(a, status);
    FloatParts pb = float64_unpack_canonical(b, status);
    FloatParts pr = mul_floats(pa, pb, status);

    return float64_round_pack_canonical(pr, status);
}

/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b' then adding 'c', with no intermediate rounding step after the
 * multiplication. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 * The flags argument allows the caller to select negation of the
 * addend, the intermediate product, or the final result. (The
 * difference between this and having the caller do a separate
 * negation is that negating externally will flip the sign bit on
 * NaNs.)
*/

/*
 * muladd_floats works on decomposed operands and returns a decomposed
 * result.  `flags' is tested against float_muladd_negate_c,
 * float_muladd_negate_product, float_muladd_negate_result and
 * float_muladd_halve_result.
 */
static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
                                int flags, float_status *s)
{
    /* True when one of a/b is an infinity and the other a zero. */
    bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
                    ((1 << float_class_inf) | (1 << float_class_zero));
    bool p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
        return pick_nan_muladd(a, b, c, inf_zero, s);
    }

    /* Inf * 0 with no NaN operand is invalid. */
    if (inf_zero) {
        s->float_exception_flags |= float_flag_invalid;
        return parts_default_nan(s);
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Classify the product a * b without computing it. */
    if (a.cls == float_class_inf || b.cls == float_class_inf) {
        p_class = float_class_inf;
    } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    if (c.cls == float_class_inf) {
        if (p_class == float_class_inf && p_sign != c.sign) {
            /* Inf + (-Inf) is invalid */
            s->float_exception_flags |= float_flag_invalid;
            return parts_default_nan(s);
        } else {
            a.cls = float_class_inf;
            a.sign = c.sign ^ sign_flip;
            return a;
        }
    }

    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    if (p_class == float_class_zero) {
        /* Zero product: the result is the addend `c'. */
        if (c.cls == float_class_zero) {
            /* Zeros of opposite sign sum to -0 only when rounding down. */
            if (p_sign != c.sign) {
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
     * result.
     */
    mul64To128(a.frac, b.frac, &hi, &lo);
    /* binary point now at bit 124 */

    /* check for overflow */
    if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
        shift128RightJamming(hi, lo, 1, &hi, &lo);
        p_exp += 1;
    }

    /* + add/sub */
    if (c.cls == float_class_zero) {
        /* move binary point back to 62 */
        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
    } else {
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                /* Product is not larger in exponent: align it to c in a
                 * single shift that also moves the binary point to 62.
                 */
                shift128RightJamming(hi, lo,
                                     DECOMPOSED_BINARY_POINT - exp_diff,
                                     &hi, &lo);
                lo += c.frac;
                p_exp = c.exp;
            } else {
                uint64_t c_hi, c_lo;
                /* shift c to the same binary point as the product (124) */
                c_hi = c.frac >> 2;
                c_lo = 0;
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                add128(hi, lo, c_hi, c_lo, &hi, &lo);
                /* move binary point back to 62 */
                shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
            }

            if (lo & DECOMPOSED_OVERFLOW_BIT) {
                shift64RightJamming(lo, 1, &lo);
                p_exp += 1;
            }

        } else {
            /* Subtraction */
            uint64_t c_hi, c_lo;
            /* make C binary point match product at bit 124 */
            c_hi = c.frac >> 2;
            c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    /* Equal exponents and product >= c: product - c */
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    /* Otherwise c - product; the result takes c's sign
                     * and exponent.
                     */
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            if (hi == 0 && lo == 0) {
                /* Exact cancellation: -0 only when rounding down,
                 * then apply the result negation.
                 */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                int shift;
                /* Count leading zeros across the 128-bit difference. */
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Normalizing to a binary point of 124 is the
                   correct adjust for the exponent.  However since we're
                   shifting, we might as well put the binary point back
                   at 62 where we really want it.  Therefore shift as
                   if we're leaving 1 bit at the top of the word, but
                   adjust the exponent as if we're leaving 3 bits.  */
                shift -= 1;
                if (shift >= 64) {
                    lo = lo << (shift - 64);
                } else {
                    hi = (hi << shift) | (lo >> (64 - shift));
                    /* fold any bits shifted out of lo into a sticky lsb */
                    lo = hi | ((lo << shift) != 0);
                }
                p_exp -= shift - 2;
            }
        }
    }

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = lo;

    return a;
}

/* Half precision: unpack the operands, fuse, then round and repack. */
float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
                                    int flags, float_status *status)
{
    FloatParts pa = float16_unpack_canonical(a, status);
    FloatParts pb = float16_unpack_canonical(b, status);
    FloatParts pc = float16_unpack_canonical(c, status);
    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);

    return float16_round_pack_canonical(pr, status);
}

/* Single precision: unpack the operands, fuse, then round and repack. */
float32 QEMU_FLATTEN float32_muladd(float32 a, float32 b, float32 c,
                                    int flags, float_status *status)
{
    FloatParts pa = float32_unpack_canonical(a, status);
    FloatParts pb = float32_unpack_canonical(b, status);
    FloatParts pc = float32_unpack_canonical(c, status);
    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);

    return float32_round_pack_canonical(pr, status);
}

/* Double precision: unpack the operands, fuse, then round and repack. */
float64 QEMU_FLATTEN float64_muladd(float64 a, float64 b, float64 c,
                                    int flags, float_status *status)
{
    FloatParts pa = float64_unpack_canonical(a, status);
    FloatParts pb = float64_unpack_canonical(b, status);
    FloatParts pc = float64_unpack_canonical(c, status);
    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);

    return float64_round_pack_canonical(pr, status);
}

/*
 * Returns the result of dividing the floating-point value `a' by the
 * corresponding value `b'. The operation is performed according to
 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
{
    /* The sign of the quotient is the XOR of the operand signs. */
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t n0, n1, q, r;
        int exp = a.exp - b.exp;

        /*
         * We want a 2*N / N-bit division to produce exactly an N-bit
         * result, so that we do not lose any precision and so that we
         * do not have to renormalize afterward.  If A.frac < B.frac,
         * then division would produce an (N-1)-bit result; shift A left
         * by one to produce an N-bit result, and decrement the
         * exponent to match.
         *
         * The udiv_qrnnd algorithm that we're using requires normalization,
         * i.e. the msb of the denominator must be set.  Since we know that
         * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
         * by one (more), and the remainder must be shifted right by one.
         */
        if (a.frac < b.frac) {
            exp -= 1;
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
        } else {
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
        }
        q = udiv_qrnnd(&r, n1, n0, b.frac << 1);

        /*
         * Set lsb if there is a remainder, to set inexact.
         * As mentioned above, to find the actual value of the remainder we
         * would need to shift right, but (1) we are only concerned about
         * non-zero-ness, and (2) the remainder will always be even because
         * both inputs to the division primitive are even.
         */
        a.frac = q | (r != 0);
        a.sign = sign;
        a.exp = exp;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return pick_nan(a, b, s);
    }
    /* 0/0 or Inf/Inf */
    if (a.cls == b.cls
        &&
        (a.cls == float_class_inf || a.cls == float_class_zero)) {
        s->float_exception_flags |= float_flag_invalid;
        return parts_default_nan(s);
    }
    /* Inf / x or 0 / x */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    /* Div 0 => Inf */
    if (b.cls == float_class_zero) {
        s->float_exception_flags |= float_flag_divbyzero;
        a.cls = float_class_inf;
        a.sign = sign;
        return a;
    }
    /* Div by Inf */
    if (b.cls == float_class_inf) {
        a.cls = float_class_zero;
        a.sign = sign;
        return a;
    }
    g_assert_not_reached();
}

/* Half precision: unpack, divide, then round and repack. */
float16 float16_div(float16 a, float16 b, float_status *status)
{
    FloatParts pa = float16_unpack_canonical(a, status);
    FloatParts pb = float16_unpack_canonical(b, status);
    FloatParts pr = div_floats(pa, pb, status);

    return float16_round_pack_canonical(pr, status);
}

/* Single precision: unpack, divide, then round and repack. */
float32 float32_div(float32 a, float32 b, float_status *status)
{
    FloatParts pa = float32_unpack_canonical(a, status);
    FloatParts pb = float32_unpack_canonical(b, status);
    FloatParts pr = div_floats(pa, pb, status);

    return float32_round_pack_canonical(pr, status);
}

/* Double precision: unpack, divide, then round and repack. */
float64 float64_div(float64 a, float64 b, float_status *status)
{
    FloatParts pa = float64_unpack_canonical(a, status);
    FloatParts pb = float64_unpack_canonical(b, status);
    FloatParts pr = div_floats(pa, pb, status);

    return float64_round_pack_canonical(pr, status);
}

/*
 * Float to Float conversions
 *
 * Returns the result of converting one float format to another. The
 * conversion is performed according to the IEC/IEEE Standard for
 * Binary Floating-Point Arithmetic.
 *
 * The float_to_float helper only needs to take care of raising
 * invalid exceptions and handling the conversion on NaNs.
 */

static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
                                 float_status *s)
{
    if (dstf->arm_althp) {
        /* The arm_althp destination format has no NaN or Inf encodings. */
        switch (a.cls) {
        case float_class_qnan:
        case float_class_snan:
            /* There is no NaN in the destination format.  Raise Invalid
             * and return a zero with the sign of the input NaN.
             */
            s->float_exception_flags |= float_flag_invalid;
            a.cls = float_class_zero;
            a.frac = 0;
            a.exp = 0;
            break;

        case float_class_inf:
            /* There is no Inf in the destination format.  Raise Invalid
             * and return the maximum normal with the correct sign.
             */
            s->float_exception_flags |= float_flag_invalid;
            a.cls = float_class_normal;
            a.exp = dstf->exp_max;
            a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
            break;

        default:
            break;
        }
    } else if (is_nan(a.cls)) {
        /* A signaling NaN raises invalid and is silenced; default-NaN
         * mode then replaces any NaN with the default NaN.
         */
        if (is_snan(a.cls)) {
            s->float_exception_flags |= float_flag_invalid;
            a = parts_silence_nan(a, s);
        }
        if (s->default_nan_mode) {
            return parts_default_nan(s);
        }
    }
    return a;
}

/* `ieee' selects float16_params; otherwise float16_params_ahp is used. */
float32 float16_to_float32(float16 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts p = float16a_unpack_canonical(a, s, fmt16);
    FloatParts pr = float_to_float(p, &float32_params, s);
    return float32_round_pack_canonical(pr, s);
}

/* `ieee' selects float16_params; otherwise float16_params_ahp is used. */
float64 float16_to_float64(float16 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts p = float16a_unpack_canonical(a, s, fmt16);
    FloatParts pr = float_to_float(p, &float64_params, s);
    return float64_round_pack_canonical(pr, s);
}

/* `ieee' selects float16_params; otherwise float16_params_ahp is used. */
float16 float32_to_float16(float32 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts p = float32_unpack_canonical(a, s);
    FloatParts pr = float_to_float(p, fmt16, s);
    return float16a_round_pack_canonical(pr, s, fmt16);
}

float64 float32_to_float64(float32 a, float_status *s)
{
    FloatParts p = float32_unpack_canonical(a, s);
    FloatParts pr = float_to_float(p, &float64_params, s);
    return float64_round_pack_canonical(pr, s);
}

/* `ieee' selects float16_params; otherwise float16_params_ahp is used. */
float16 float64_to_float16(float64 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts p = float64_unpack_canonical(a, s);
    FloatParts pr = float_to_float(p, fmt16, s);
    return float16a_round_pack_canonical(pr, s, fmt16);
}

float32 float64_to_float32(float64 a, float_status *s)
{
    FloatParts p = float64_unpack_canonical(a, s);
    FloatParts pr = float_to_float(p, &float32_params, s);
    return float32_round_pack_canonical(pr, s);
}

/*
 * Rounds the floating-point value `a' to an integer, and returns the
 * result as a floating-point value. The operation is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic.
1702 */ 1703 1704 static FloatParts round_to_int(FloatParts a, int rmode, 1705 int scale, float_status *s) 1706 { 1707 switch (a.cls) { 1708 case float_class_qnan: 1709 case float_class_snan: 1710 return return_nan(a, s); 1711 1712 case float_class_zero: 1713 case float_class_inf: 1714 /* already "integral" */ 1715 break; 1716 1717 case float_class_normal: 1718 scale = MIN(MAX(scale, -0x10000), 0x10000); 1719 a.exp += scale; 1720 1721 if (a.exp >= DECOMPOSED_BINARY_POINT) { 1722 /* already integral */ 1723 break; 1724 } 1725 if (a.exp < 0) { 1726 bool one; 1727 /* all fractional */ 1728 s->float_exception_flags |= float_flag_inexact; 1729 switch (rmode) { 1730 case float_round_nearest_even: 1731 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT; 1732 break; 1733 case float_round_ties_away: 1734 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT; 1735 break; 1736 case float_round_to_zero: 1737 one = false; 1738 break; 1739 case float_round_up: 1740 one = !a.sign; 1741 break; 1742 case float_round_down: 1743 one = a.sign; 1744 break; 1745 default: 1746 g_assert_not_reached(); 1747 } 1748 1749 if (one) { 1750 a.frac = DECOMPOSED_IMPLICIT_BIT; 1751 a.exp = 0; 1752 } else { 1753 a.cls = float_class_zero; 1754 } 1755 } else { 1756 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp; 1757 uint64_t frac_lsbm1 = frac_lsb >> 1; 1758 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb; 1759 uint64_t rnd_mask = rnd_even_mask >> 1; 1760 uint64_t inc; 1761 1762 switch (rmode) { 1763 case float_round_nearest_even: 1764 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 1765 break; 1766 case float_round_ties_away: 1767 inc = frac_lsbm1; 1768 break; 1769 case float_round_to_zero: 1770 inc = 0; 1771 break; 1772 case float_round_up: 1773 inc = a.sign ? 0 : rnd_mask; 1774 break; 1775 case float_round_down: 1776 inc = a.sign ? 
rnd_mask : 0; 1777 break; 1778 default: 1779 g_assert_not_reached(); 1780 } 1781 1782 if (a.frac & rnd_mask) { 1783 s->float_exception_flags |= float_flag_inexact; 1784 a.frac += inc; 1785 a.frac &= ~rnd_mask; 1786 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 1787 a.frac >>= 1; 1788 a.exp++; 1789 } 1790 } 1791 } 1792 break; 1793 default: 1794 g_assert_not_reached(); 1795 } 1796 return a; 1797 } 1798 1799 float16 float16_round_to_int(float16 a, float_status *s) 1800 { 1801 FloatParts pa = float16_unpack_canonical(a, s); 1802 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 1803 return float16_round_pack_canonical(pr, s); 1804 } 1805 1806 float32 float32_round_to_int(float32 a, float_status *s) 1807 { 1808 FloatParts pa = float32_unpack_canonical(a, s); 1809 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 1810 return float32_round_pack_canonical(pr, s); 1811 } 1812 1813 float64 float64_round_to_int(float64 a, float_status *s) 1814 { 1815 FloatParts pa = float64_unpack_canonical(a, s); 1816 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 1817 return float64_round_pack_canonical(pr, s); 1818 } 1819 1820 /* 1821 * Returns the result of converting the floating-point value `a' to 1822 * the two's complement integer format. The conversion is performed 1823 * according to the IEC/IEEE Standard for Binary Floating-Point 1824 * Arithmetic---which means in particular that the conversion is 1825 * rounded according to the current rounding mode. If `a' is a NaN, 1826 * the largest positive integer is returned. Otherwise, if the 1827 * conversion overflows, the largest integer with the same sign as `a' 1828 * is returned. 
1829 */ 1830 1831 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale, 1832 int64_t min, int64_t max, 1833 float_status *s) 1834 { 1835 uint64_t r; 1836 int orig_flags = get_float_exception_flags(s); 1837 FloatParts p = round_to_int(in, rmode, scale, s); 1838 1839 switch (p.cls) { 1840 case float_class_snan: 1841 case float_class_qnan: 1842 s->float_exception_flags = orig_flags | float_flag_invalid; 1843 return max; 1844 case float_class_inf: 1845 s->float_exception_flags = orig_flags | float_flag_invalid; 1846 return p.sign ? min : max; 1847 case float_class_zero: 1848 return 0; 1849 case float_class_normal: 1850 if (p.exp < DECOMPOSED_BINARY_POINT) { 1851 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 1852 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 1853 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 1854 } else { 1855 r = UINT64_MAX; 1856 } 1857 if (p.sign) { 1858 if (r <= -(uint64_t) min) { 1859 return -r; 1860 } else { 1861 s->float_exception_flags = orig_flags | float_flag_invalid; 1862 return min; 1863 } 1864 } else { 1865 if (r <= max) { 1866 return r; 1867 } else { 1868 s->float_exception_flags = orig_flags | float_flag_invalid; 1869 return max; 1870 } 1871 } 1872 default: 1873 g_assert_not_reached(); 1874 } 1875 } 1876 1877 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale, 1878 float_status *s) 1879 { 1880 return round_to_int_and_pack(float16_unpack_canonical(a, s), 1881 rmode, scale, INT16_MIN, INT16_MAX, s); 1882 } 1883 1884 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale, 1885 float_status *s) 1886 { 1887 return round_to_int_and_pack(float16_unpack_canonical(a, s), 1888 rmode, scale, INT32_MIN, INT32_MAX, s); 1889 } 1890 1891 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale, 1892 float_status *s) 1893 { 1894 return round_to_int_and_pack(float16_unpack_canonical(a, s), 1895 rmode, scale, INT64_MIN, INT64_MAX, s); 1896 } 1897 1898 int16_t float32_to_int16_scalbn(float32 a, int 
rmode, int scale, 1899 float_status *s) 1900 { 1901 return round_to_int_and_pack(float32_unpack_canonical(a, s), 1902 rmode, scale, INT16_MIN, INT16_MAX, s); 1903 } 1904 1905 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale, 1906 float_status *s) 1907 { 1908 return round_to_int_and_pack(float32_unpack_canonical(a, s), 1909 rmode, scale, INT32_MIN, INT32_MAX, s); 1910 } 1911 1912 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale, 1913 float_status *s) 1914 { 1915 return round_to_int_and_pack(float32_unpack_canonical(a, s), 1916 rmode, scale, INT64_MIN, INT64_MAX, s); 1917 } 1918 1919 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale, 1920 float_status *s) 1921 { 1922 return round_to_int_and_pack(float64_unpack_canonical(a, s), 1923 rmode, scale, INT16_MIN, INT16_MAX, s); 1924 } 1925 1926 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale, 1927 float_status *s) 1928 { 1929 return round_to_int_and_pack(float64_unpack_canonical(a, s), 1930 rmode, scale, INT32_MIN, INT32_MAX, s); 1931 } 1932 1933 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale, 1934 float_status *s) 1935 { 1936 return round_to_int_and_pack(float64_unpack_canonical(a, s), 1937 rmode, scale, INT64_MIN, INT64_MAX, s); 1938 } 1939 1940 int16_t float16_to_int16(float16 a, float_status *s) 1941 { 1942 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 1943 } 1944 1945 int32_t float16_to_int32(float16 a, float_status *s) 1946 { 1947 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 1948 } 1949 1950 int64_t float16_to_int64(float16 a, float_status *s) 1951 { 1952 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 1953 } 1954 1955 int16_t float32_to_int16(float32 a, float_status *s) 1956 { 1957 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 1958 } 1959 1960 int32_t float32_to_int32(float32 a, float_status *s) 1961 { 1962 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, 
s); 1963 } 1964 1965 int64_t float32_to_int64(float32 a, float_status *s) 1966 { 1967 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 1968 } 1969 1970 int16_t float64_to_int16(float64 a, float_status *s) 1971 { 1972 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 1973 } 1974 1975 int32_t float64_to_int32(float64 a, float_status *s) 1976 { 1977 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 1978 } 1979 1980 int64_t float64_to_int64(float64 a, float_status *s) 1981 { 1982 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 1983 } 1984 1985 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s) 1986 { 1987 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s); 1988 } 1989 1990 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s) 1991 { 1992 return float16_to_int32_scalbn(a, float_round_to_zero, 0, s); 1993 } 1994 1995 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s) 1996 { 1997 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s); 1998 } 1999 2000 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s) 2001 { 2002 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s); 2003 } 2004 2005 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s) 2006 { 2007 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s); 2008 } 2009 2010 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s) 2011 { 2012 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s); 2013 } 2014 2015 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s) 2016 { 2017 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s); 2018 } 2019 2020 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s) 2021 { 2022 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s); 2023 } 2024 2025 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s) 2026 { 2027 return float64_to_int64_scalbn(a, 
float_round_to_zero, 0, s); 2028 } 2029 2030 /* 2031 * Returns the result of converting the floating-point value `a' to 2032 * the unsigned integer format. The conversion is performed according 2033 * to the IEC/IEEE Standard for Binary Floating-Point 2034 * Arithmetic---which means in particular that the conversion is 2035 * rounded according to the current rounding mode. If `a' is a NaN, 2036 * the largest unsigned integer is returned. Otherwise, if the 2037 * conversion overflows, the largest unsigned integer is returned. If 2038 * the 'a' is negative, the result is rounded and zero is returned; 2039 * values that do not round to zero will raise the inexact exception 2040 * flag. 2041 */ 2042 2043 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale, 2044 uint64_t max, float_status *s) 2045 { 2046 int orig_flags = get_float_exception_flags(s); 2047 FloatParts p = round_to_int(in, rmode, scale, s); 2048 uint64_t r; 2049 2050 switch (p.cls) { 2051 case float_class_snan: 2052 case float_class_qnan: 2053 s->float_exception_flags = orig_flags | float_flag_invalid; 2054 return max; 2055 case float_class_inf: 2056 s->float_exception_flags = orig_flags | float_flag_invalid; 2057 return p.sign ? 0 : max; 2058 case float_class_zero: 2059 return 0; 2060 case float_class_normal: 2061 if (p.sign) { 2062 s->float_exception_flags = orig_flags | float_flag_invalid; 2063 return 0; 2064 } 2065 2066 if (p.exp < DECOMPOSED_BINARY_POINT) { 2067 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2068 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 2069 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 2070 } else { 2071 s->float_exception_flags = orig_flags | float_flag_invalid; 2072 return max; 2073 } 2074 2075 /* For uint64 this will never trip, but if p.exp is too large 2076 * to shift a decomposed fraction we shall have exited via the 2077 * 3rd leg above. 
2078 */ 2079 if (r > max) { 2080 s->float_exception_flags = orig_flags | float_flag_invalid; 2081 return max; 2082 } 2083 return r; 2084 default: 2085 g_assert_not_reached(); 2086 } 2087 } 2088 2089 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale, 2090 float_status *s) 2091 { 2092 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2093 rmode, scale, UINT16_MAX, s); 2094 } 2095 2096 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale, 2097 float_status *s) 2098 { 2099 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2100 rmode, scale, UINT32_MAX, s); 2101 } 2102 2103 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale, 2104 float_status *s) 2105 { 2106 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2107 rmode, scale, UINT64_MAX, s); 2108 } 2109 2110 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale, 2111 float_status *s) 2112 { 2113 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2114 rmode, scale, UINT16_MAX, s); 2115 } 2116 2117 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale, 2118 float_status *s) 2119 { 2120 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2121 rmode, scale, UINT32_MAX, s); 2122 } 2123 2124 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale, 2125 float_status *s) 2126 { 2127 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2128 rmode, scale, UINT64_MAX, s); 2129 } 2130 2131 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale, 2132 float_status *s) 2133 { 2134 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2135 rmode, scale, UINT16_MAX, s); 2136 } 2137 2138 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale, 2139 float_status *s) 2140 { 2141 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2142 rmode, scale, UINT32_MAX, s); 2143 } 2144 2145 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale, 2146 
float_status *s) 2147 { 2148 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2149 rmode, scale, UINT64_MAX, s); 2150 } 2151 2152 uint16_t float16_to_uint16(float16 a, float_status *s) 2153 { 2154 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2155 } 2156 2157 uint32_t float16_to_uint32(float16 a, float_status *s) 2158 { 2159 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2160 } 2161 2162 uint64_t float16_to_uint64(float16 a, float_status *s) 2163 { 2164 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2165 } 2166 2167 uint16_t float32_to_uint16(float32 a, float_status *s) 2168 { 2169 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2170 } 2171 2172 uint32_t float32_to_uint32(float32 a, float_status *s) 2173 { 2174 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2175 } 2176 2177 uint64_t float32_to_uint64(float32 a, float_status *s) 2178 { 2179 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2180 } 2181 2182 uint16_t float64_to_uint16(float64 a, float_status *s) 2183 { 2184 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2185 } 2186 2187 uint32_t float64_to_uint32(float64 a, float_status *s) 2188 { 2189 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2190 } 2191 2192 uint64_t float64_to_uint64(float64 a, float_status *s) 2193 { 2194 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2195 } 2196 2197 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s) 2198 { 2199 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2200 } 2201 2202 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s) 2203 { 2204 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2205 } 2206 2207 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s) 2208 { 2209 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2210 } 2211 2212 uint16_t 
float32_to_uint16_round_to_zero(float32 a, float_status *s) 2213 { 2214 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2215 } 2216 2217 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s) 2218 { 2219 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2220 } 2221 2222 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s) 2223 { 2224 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2225 } 2226 2227 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s) 2228 { 2229 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2230 } 2231 2232 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s) 2233 { 2234 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2235 } 2236 2237 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s) 2238 { 2239 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2240 } 2241 2242 /* 2243 * Integer to float conversions 2244 * 2245 * Returns the result of converting the two's complement integer `a' 2246 * to the floating-point format. The conversion is performed according 2247 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2248 */ 2249 2250 static FloatParts int_to_float(int64_t a, int scale, float_status *status) 2251 { 2252 FloatParts r = { .sign = false }; 2253 2254 if (a == 0) { 2255 r.cls = float_class_zero; 2256 } else { 2257 uint64_t f = a; 2258 int shift; 2259 2260 r.cls = float_class_normal; 2261 if (a < 0) { 2262 f = -f; 2263 r.sign = true; 2264 } 2265 shift = clz64(f) - 1; 2266 scale = MIN(MAX(scale, -0x10000), 0x10000); 2267 2268 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2269 r.frac = (shift < 0 ? 
DECOMPOSED_IMPLICIT_BIT : f << shift); 2270 } 2271 2272 return r; 2273 } 2274 2275 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status) 2276 { 2277 FloatParts pa = int_to_float(a, scale, status); 2278 return float16_round_pack_canonical(pa, status); 2279 } 2280 2281 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status) 2282 { 2283 return int64_to_float16_scalbn(a, scale, status); 2284 } 2285 2286 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status) 2287 { 2288 return int64_to_float16_scalbn(a, scale, status); 2289 } 2290 2291 float16 int64_to_float16(int64_t a, float_status *status) 2292 { 2293 return int64_to_float16_scalbn(a, 0, status); 2294 } 2295 2296 float16 int32_to_float16(int32_t a, float_status *status) 2297 { 2298 return int64_to_float16_scalbn(a, 0, status); 2299 } 2300 2301 float16 int16_to_float16(int16_t a, float_status *status) 2302 { 2303 return int64_to_float16_scalbn(a, 0, status); 2304 } 2305 2306 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status) 2307 { 2308 FloatParts pa = int_to_float(a, scale, status); 2309 return float32_round_pack_canonical(pa, status); 2310 } 2311 2312 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status) 2313 { 2314 return int64_to_float32_scalbn(a, scale, status); 2315 } 2316 2317 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status) 2318 { 2319 return int64_to_float32_scalbn(a, scale, status); 2320 } 2321 2322 float32 int64_to_float32(int64_t a, float_status *status) 2323 { 2324 return int64_to_float32_scalbn(a, 0, status); 2325 } 2326 2327 float32 int32_to_float32(int32_t a, float_status *status) 2328 { 2329 return int64_to_float32_scalbn(a, 0, status); 2330 } 2331 2332 float32 int16_to_float32(int16_t a, float_status *status) 2333 { 2334 return int64_to_float32_scalbn(a, 0, status); 2335 } 2336 2337 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status) 2338 { 
2339 FloatParts pa = int_to_float(a, scale, status); 2340 return float64_round_pack_canonical(pa, status); 2341 } 2342 2343 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status) 2344 { 2345 return int64_to_float64_scalbn(a, scale, status); 2346 } 2347 2348 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status) 2349 { 2350 return int64_to_float64_scalbn(a, scale, status); 2351 } 2352 2353 float64 int64_to_float64(int64_t a, float_status *status) 2354 { 2355 return int64_to_float64_scalbn(a, 0, status); 2356 } 2357 2358 float64 int32_to_float64(int32_t a, float_status *status) 2359 { 2360 return int64_to_float64_scalbn(a, 0, status); 2361 } 2362 2363 float64 int16_to_float64(int16_t a, float_status *status) 2364 { 2365 return int64_to_float64_scalbn(a, 0, status); 2366 } 2367 2368 2369 /* 2370 * Unsigned Integer to float conversions 2371 * 2372 * Returns the result of converting the unsigned integer `a' to the 2373 * floating-point format. The conversion is performed according to the 2374 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2375 */ 2376 2377 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status) 2378 { 2379 FloatParts r = { .sign = false }; 2380 2381 if (a == 0) { 2382 r.cls = float_class_zero; 2383 } else { 2384 scale = MIN(MAX(scale, -0x10000), 0x10000); 2385 r.cls = float_class_normal; 2386 if ((int64_t)a < 0) { 2387 r.exp = DECOMPOSED_BINARY_POINT + 1 + scale; 2388 shift64RightJamming(a, 1, &a); 2389 r.frac = a; 2390 } else { 2391 int shift = clz64(a) - 1; 2392 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2393 r.frac = a << shift; 2394 } 2395 } 2396 2397 return r; 2398 } 2399 2400 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status) 2401 { 2402 FloatParts pa = uint_to_float(a, scale, status); 2403 return float16_round_pack_canonical(pa, status); 2404 } 2405 2406 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status) 2407 { 2408 return uint64_to_float16_scalbn(a, scale, status); 2409 } 2410 2411 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status) 2412 { 2413 return uint64_to_float16_scalbn(a, scale, status); 2414 } 2415 2416 float16 uint64_to_float16(uint64_t a, float_status *status) 2417 { 2418 return uint64_to_float16_scalbn(a, 0, status); 2419 } 2420 2421 float16 uint32_to_float16(uint32_t a, float_status *status) 2422 { 2423 return uint64_to_float16_scalbn(a, 0, status); 2424 } 2425 2426 float16 uint16_to_float16(uint16_t a, float_status *status) 2427 { 2428 return uint64_to_float16_scalbn(a, 0, status); 2429 } 2430 2431 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status) 2432 { 2433 FloatParts pa = uint_to_float(a, scale, status); 2434 return float32_round_pack_canonical(pa, status); 2435 } 2436 2437 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status) 2438 { 2439 return uint64_to_float32_scalbn(a, scale, status); 2440 } 2441 2442 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status) 2443 { 2444 return 
uint64_to_float32_scalbn(a, scale, status); 2445 } 2446 2447 float32 uint64_to_float32(uint64_t a, float_status *status) 2448 { 2449 return uint64_to_float32_scalbn(a, 0, status); 2450 } 2451 2452 float32 uint32_to_float32(uint32_t a, float_status *status) 2453 { 2454 return uint64_to_float32_scalbn(a, 0, status); 2455 } 2456 2457 float32 uint16_to_float32(uint16_t a, float_status *status) 2458 { 2459 return uint64_to_float32_scalbn(a, 0, status); 2460 } 2461 2462 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status) 2463 { 2464 FloatParts pa = uint_to_float(a, scale, status); 2465 return float64_round_pack_canonical(pa, status); 2466 } 2467 2468 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status) 2469 { 2470 return uint64_to_float64_scalbn(a, scale, status); 2471 } 2472 2473 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status) 2474 { 2475 return uint64_to_float64_scalbn(a, scale, status); 2476 } 2477 2478 float64 uint64_to_float64(uint64_t a, float_status *status) 2479 { 2480 return uint64_to_float64_scalbn(a, 0, status); 2481 } 2482 2483 float64 uint32_to_float64(uint32_t a, float_status *status) 2484 { 2485 return uint64_to_float64_scalbn(a, 0, status); 2486 } 2487 2488 float64 uint16_to_float64(uint16_t a, float_status *status) 2489 { 2490 return uint64_to_float64_scalbn(a, 0, status); 2491 } 2492 2493 /* Float Min/Max */ 2494 /* min() and max() functions. These can't be implemented as 2495 * 'compare and pick one input' because that would mishandle 2496 * NaNs and +0 vs -0. 2497 * 2498 * minnum() and maxnum() functions. These are similar to the min() 2499 * and max() functions but if one of the arguments is a QNaN and 2500 * the other is numerical then the numerical argument is returned. 2501 * SNaNs will get quietened before being returned. 2502 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 2503 * and maxNum() operations. 
min() and max() are the typical min/max 2504 * semantics provided by many CPUs which predate that specification. 2505 * 2506 * minnummag() and maxnummag() functions correspond to minNumMag() 2507 * and minNumMag() from the IEEE-754 2008. 2508 */ 2509 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin, 2510 bool ieee, bool ismag, float_status *s) 2511 { 2512 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 2513 if (ieee) { 2514 /* Takes two floating-point values `a' and `b', one of 2515 * which is a NaN, and returns the appropriate NaN 2516 * result. If either `a' or `b' is a signaling NaN, 2517 * the invalid exception is raised. 2518 */ 2519 if (is_snan(a.cls) || is_snan(b.cls)) { 2520 return pick_nan(a, b, s); 2521 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 2522 return b; 2523 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 2524 return a; 2525 } 2526 } 2527 return pick_nan(a, b, s); 2528 } else { 2529 int a_exp, b_exp; 2530 2531 switch (a.cls) { 2532 case float_class_normal: 2533 a_exp = a.exp; 2534 break; 2535 case float_class_inf: 2536 a_exp = INT_MAX; 2537 break; 2538 case float_class_zero: 2539 a_exp = INT_MIN; 2540 break; 2541 default: 2542 g_assert_not_reached(); 2543 break; 2544 } 2545 switch (b.cls) { 2546 case float_class_normal: 2547 b_exp = b.exp; 2548 break; 2549 case float_class_inf: 2550 b_exp = INT_MAX; 2551 break; 2552 case float_class_zero: 2553 b_exp = INT_MIN; 2554 break; 2555 default: 2556 g_assert_not_reached(); 2557 break; 2558 } 2559 2560 if (ismag && (a_exp != b_exp || a.frac != b.frac)) { 2561 bool a_less = a_exp < b_exp; 2562 if (a_exp == b_exp) { 2563 a_less = a.frac < b.frac; 2564 } 2565 return a_less ^ ismin ? b : a; 2566 } 2567 2568 if (a.sign == b.sign) { 2569 bool a_less = a_exp < b_exp; 2570 if (a_exp == b_exp) { 2571 a_less = a.frac < b.frac; 2572 } 2573 return a.sign ^ a_less ^ ismin ? b : a; 2574 } else { 2575 return a.sign ^ ismin ? 
b : a; 2576 } 2577 } 2578 } 2579 2580 #define MINMAX(sz, name, ismin, isiee, ismag) \ 2581 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 2582 float_status *s) \ 2583 { \ 2584 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2585 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2586 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 2587 \ 2588 return float ## sz ## _round_pack_canonical(pr, s); \ 2589 } 2590 2591 MINMAX(16, min, true, false, false) 2592 MINMAX(16, minnum, true, true, false) 2593 MINMAX(16, minnummag, true, true, true) 2594 MINMAX(16, max, false, false, false) 2595 MINMAX(16, maxnum, false, true, false) 2596 MINMAX(16, maxnummag, false, true, true) 2597 2598 MINMAX(32, min, true, false, false) 2599 MINMAX(32, minnum, true, true, false) 2600 MINMAX(32, minnummag, true, true, true) 2601 MINMAX(32, max, false, false, false) 2602 MINMAX(32, maxnum, false, true, false) 2603 MINMAX(32, maxnummag, false, true, true) 2604 2605 MINMAX(64, min, true, false, false) 2606 MINMAX(64, minnum, true, true, false) 2607 MINMAX(64, minnummag, true, true, true) 2608 MINMAX(64, max, false, false, false) 2609 MINMAX(64, maxnum, false, true, false) 2610 MINMAX(64, maxnummag, false, true, true) 2611 2612 #undef MINMAX 2613 2614 /* Floating point compare */ 2615 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet, 2616 float_status *s) 2617 { 2618 if (is_nan(a.cls) || is_nan(b.cls)) { 2619 if (!is_quiet || 2620 a.cls == float_class_snan || 2621 b.cls == float_class_snan) { 2622 s->float_exception_flags |= float_flag_invalid; 2623 } 2624 return float_relation_unordered; 2625 } 2626 2627 if (a.cls == float_class_zero) { 2628 if (b.cls == float_class_zero) { 2629 return float_relation_equal; 2630 } 2631 return b.sign ? float_relation_greater : float_relation_less; 2632 } else if (b.cls == float_class_zero) { 2633 return a.sign ? 
float_relation_less : float_relation_greater; 2634 } 2635 2636 /* The only really important thing about infinity is its sign. If 2637 * both are infinities the sign marks the smallest of the two. 2638 */ 2639 if (a.cls == float_class_inf) { 2640 if ((b.cls == float_class_inf) && (a.sign == b.sign)) { 2641 return float_relation_equal; 2642 } 2643 return a.sign ? float_relation_less : float_relation_greater; 2644 } else if (b.cls == float_class_inf) { 2645 return b.sign ? float_relation_greater : float_relation_less; 2646 } 2647 2648 if (a.sign != b.sign) { 2649 return a.sign ? float_relation_less : float_relation_greater; 2650 } 2651 2652 if (a.exp == b.exp) { 2653 if (a.frac == b.frac) { 2654 return float_relation_equal; 2655 } 2656 if (a.sign) { 2657 return a.frac > b.frac ? 2658 float_relation_less : float_relation_greater; 2659 } else { 2660 return a.frac > b.frac ? 2661 float_relation_greater : float_relation_less; 2662 } 2663 } else { 2664 if (a.sign) { 2665 return a.exp > b.exp ? float_relation_less : float_relation_greater; 2666 } else { 2667 return a.exp > b.exp ? float_relation_greater : float_relation_less; 2668 } 2669 } 2670 } 2671 2672 #define COMPARE(sz) \ 2673 int float ## sz ## _compare(float ## sz a, float ## sz b, \ 2674 float_status *s) \ 2675 { \ 2676 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2677 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2678 return compare_floats(pa, pb, false, s); \ 2679 } \ 2680 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \ 2681 float_status *s) \ 2682 { \ 2683 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2684 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2685 return compare_floats(pa, pb, true, s); \ 2686 } 2687 2688 COMPARE(16) 2689 COMPARE(32) 2690 COMPARE(64) 2691 2692 #undef COMPARE 2693 2694 /* Multiply A by 2 raised to the power N. 
*/ 2695 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s) 2696 { 2697 if (unlikely(is_nan(a.cls))) { 2698 return return_nan(a, s); 2699 } 2700 if (a.cls == float_class_normal) { 2701 /* The largest float type (even though not supported by FloatParts) 2702 * is float128, which has a 15 bit exponent. Bounding N to 16 bits 2703 * still allows rounding to infinity, without allowing overflow 2704 * within the int32_t that backs FloatParts.exp. 2705 */ 2706 n = MIN(MAX(n, -0x10000), 0x10000); 2707 a.exp += n; 2708 } 2709 return a; 2710 } 2711 2712 float16 float16_scalbn(float16 a, int n, float_status *status) 2713 { 2714 FloatParts pa = float16_unpack_canonical(a, status); 2715 FloatParts pr = scalbn_decomposed(pa, n, status); 2716 return float16_round_pack_canonical(pr, status); 2717 } 2718 2719 float32 float32_scalbn(float32 a, int n, float_status *status) 2720 { 2721 FloatParts pa = float32_unpack_canonical(a, status); 2722 FloatParts pr = scalbn_decomposed(pa, n, status); 2723 return float32_round_pack_canonical(pr, status); 2724 } 2725 2726 float64 float64_scalbn(float64 a, int n, float_status *status) 2727 { 2728 FloatParts pa = float64_unpack_canonical(a, status); 2729 FloatParts pr = scalbn_decomposed(pa, n, status); 2730 return float64_round_pack_canonical(pr, status); 2731 } 2732 2733 /* 2734 * Square Root 2735 * 2736 * The old softfloat code did an approximation step before zeroing in 2737 * on the final result. However for simpleness we just compute the 2738 * square root by iterating down from the implicit bit to enough extra 2739 * bits to ensure we get a correctly rounded result. 2740 * 2741 * This does mean however the calculation is slower than before, 2742 * especially for 64 bit floats. 
2743 */ 2744 2745 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p) 2746 { 2747 uint64_t a_frac, r_frac, s_frac; 2748 int bit, last_bit; 2749 2750 if (is_nan(a.cls)) { 2751 return return_nan(a, s); 2752 } 2753 if (a.cls == float_class_zero) { 2754 return a; /* sqrt(+-0) = +-0 */ 2755 } 2756 if (a.sign) { 2757 s->float_exception_flags |= float_flag_invalid; 2758 return parts_default_nan(s); 2759 } 2760 if (a.cls == float_class_inf) { 2761 return a; /* sqrt(+inf) = +inf */ 2762 } 2763 2764 assert(a.cls == float_class_normal); 2765 2766 /* We need two overflow bits at the top. Adding room for that is a 2767 * right shift. If the exponent is odd, we can discard the low bit 2768 * by multiplying the fraction by 2; that's a left shift. Combine 2769 * those and we shift right if the exponent is even. 2770 */ 2771 a_frac = a.frac; 2772 if (!(a.exp & 1)) { 2773 a_frac >>= 1; 2774 } 2775 a.exp >>= 1; 2776 2777 /* Bit-by-bit computation of sqrt. */ 2778 r_frac = 0; 2779 s_frac = 0; 2780 2781 /* Iterate from implicit bit down to the 3 extra bits to compute a 2782 * properly rounded result. Remember we've inserted one more bit 2783 * at the top, so these positions are one less. 2784 */ 2785 bit = DECOMPOSED_BINARY_POINT - 1; 2786 last_bit = MAX(p->frac_shift - 4, 0); 2787 do { 2788 uint64_t q = 1ULL << bit; 2789 uint64_t t_frac = s_frac + q; 2790 if (t_frac <= a_frac) { 2791 s_frac = t_frac + q; 2792 a_frac -= t_frac; 2793 r_frac += q; 2794 } 2795 a_frac <<= 1; 2796 } while (--bit >= last_bit); 2797 2798 /* Undo the right shift done above. If there is any remaining 2799 * fraction, the result is inexact. Set the sticky bit. 
2800 */ 2801 a.frac = (r_frac << 1) + (a_frac != 0); 2802 2803 return a; 2804 } 2805 2806 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status) 2807 { 2808 FloatParts pa = float16_unpack_canonical(a, status); 2809 FloatParts pr = sqrt_float(pa, status, &float16_params); 2810 return float16_round_pack_canonical(pr, status); 2811 } 2812 2813 float32 QEMU_FLATTEN float32_sqrt(float32 a, float_status *status) 2814 { 2815 FloatParts pa = float32_unpack_canonical(a, status); 2816 FloatParts pr = sqrt_float(pa, status, &float32_params); 2817 return float32_round_pack_canonical(pr, status); 2818 } 2819 2820 float64 QEMU_FLATTEN float64_sqrt(float64 a, float_status *status) 2821 { 2822 FloatParts pa = float64_unpack_canonical(a, status); 2823 FloatParts pr = sqrt_float(pa, status, &float64_params); 2824 return float64_round_pack_canonical(pr, status); 2825 } 2826 2827 /*---------------------------------------------------------------------------- 2828 | The pattern for a default generated NaN. 2829 *----------------------------------------------------------------------------*/ 2830 2831 float16 float16_default_nan(float_status *status) 2832 { 2833 FloatParts p = parts_default_nan(status); 2834 p.frac >>= float16_params.frac_shift; 2835 return float16_pack_raw(p); 2836 } 2837 2838 float32 float32_default_nan(float_status *status) 2839 { 2840 FloatParts p = parts_default_nan(status); 2841 p.frac >>= float32_params.frac_shift; 2842 return float32_pack_raw(p); 2843 } 2844 2845 float64 float64_default_nan(float_status *status) 2846 { 2847 FloatParts p = parts_default_nan(status); 2848 p.frac >>= float64_params.frac_shift; 2849 return float64_pack_raw(p); 2850 } 2851 2852 float128 float128_default_nan(float_status *status) 2853 { 2854 FloatParts p = parts_default_nan(status); 2855 float128 r; 2856 2857 /* Extrapolate from the choices made by parts_default_nan to fill 2858 * in the quad-floating format. 
If the low bit is set, assume we 2859 * want to set all non-snan bits. 2860 */ 2861 r.low = -(p.frac & 1); 2862 r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48); 2863 r.high |= LIT64(0x7FFF000000000000); 2864 r.high |= (uint64_t)p.sign << 63; 2865 2866 return r; 2867 } 2868 2869 /*---------------------------------------------------------------------------- 2870 | Returns a quiet NaN from a signalling NaN for the floating point value `a'. 2871 *----------------------------------------------------------------------------*/ 2872 2873 float16 float16_silence_nan(float16 a, float_status *status) 2874 { 2875 FloatParts p = float16_unpack_raw(a); 2876 p.frac <<= float16_params.frac_shift; 2877 p = parts_silence_nan(p, status); 2878 p.frac >>= float16_params.frac_shift; 2879 return float16_pack_raw(p); 2880 } 2881 2882 float32 float32_silence_nan(float32 a, float_status *status) 2883 { 2884 FloatParts p = float32_unpack_raw(a); 2885 p.frac <<= float32_params.frac_shift; 2886 p = parts_silence_nan(p, status); 2887 p.frac >>= float32_params.frac_shift; 2888 return float32_pack_raw(p); 2889 } 2890 2891 float64 float64_silence_nan(float64 a, float_status *status) 2892 { 2893 FloatParts p = float64_unpack_raw(a); 2894 p.frac <<= float64_params.frac_shift; 2895 p = parts_silence_nan(p, status); 2896 p.frac >>= float64_params.frac_shift; 2897 return float64_pack_raw(p); 2898 } 2899 2900 /*---------------------------------------------------------------------------- 2901 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 2902 | and 7, and returns the properly rounded 32-bit integer corresponding to the 2903 | input. If `zSign' is 1, the input is negated before being converted to an 2904 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 2905 | is simply rounded to an integer, with the inexact exception raised if the 2906 | input cannot be represented exactly as an integer. 
However, if the fixed-
| point input is too large, the invalid exception is raised and the largest
| positive or negative integer is returned.
*----------------------------------------------------------------------------*/

static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
{
    const int8_t roundingMode = status->float_rounding_mode;
    const flag roundNearestEven =
        (roundingMode == float_round_nearest_even);
    int8_t roundIncrement, roundBits;
    int32_t z;

    /* Pick the value added below the binary point before truncating. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    default:
        abort();
    }

    /* The 7 bits below the binary point decide exactness and ties. */
    roundBits = absZ & 0x7F;
    absZ = (absZ + roundIncrement) >> 7;
    /* On an exact tie in nearest-even mode, clear bit 0 to make it even. */
    absZ &= ~(((roundBits ^ 0x40) == 0) & roundNearestEven);

    z = absZ;
    if (zSign) {
        z = -z;
    }

    /* Out of range: bits above 32 set, or the sign came out wrong. */
    if ((absZ >> 32) || (z && ((z < 0) ^ zSign))) {
        float_raise(float_flag_invalid, status);
        return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
    }

    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return z;
}

/*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit integer corresponding to the input.
| If `zSign' is 1, the input is negated before being converted to an integer.
2958 | Ordinarily, the fixed-point input is simply rounded to an integer, with 2959 | the inexact exception raised if the input cannot be represented exactly as 2960 | an integer. However, if the fixed-point input is too large, the invalid 2961 | exception is raised and the largest positive or negative integer is 2962 | returned. 2963 *----------------------------------------------------------------------------*/ 2964 2965 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 2966 float_status *status) 2967 { 2968 int8_t roundingMode; 2969 flag roundNearestEven, increment; 2970 int64_t z; 2971 2972 roundingMode = status->float_rounding_mode; 2973 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2974 switch (roundingMode) { 2975 case float_round_nearest_even: 2976 case float_round_ties_away: 2977 increment = ((int64_t) absZ1 < 0); 2978 break; 2979 case float_round_to_zero: 2980 increment = 0; 2981 break; 2982 case float_round_up: 2983 increment = !zSign && absZ1; 2984 break; 2985 case float_round_down: 2986 increment = zSign && absZ1; 2987 break; 2988 default: 2989 abort(); 2990 } 2991 if ( increment ) { 2992 ++absZ0; 2993 if ( absZ0 == 0 ) goto overflow; 2994 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 2995 } 2996 z = absZ0; 2997 if ( zSign ) z = - z; 2998 if ( z && ( ( z < 0 ) ^ zSign ) ) { 2999 overflow: 3000 float_raise(float_flag_invalid, status); 3001 return 3002 zSign ? 
(int64_t) LIT64( 0x8000000000000000 ) 3003 : LIT64( 0x7FFFFFFFFFFFFFFF ); 3004 } 3005 if (absZ1) { 3006 status->float_exception_flags |= float_flag_inexact; 3007 } 3008 return z; 3009 3010 } 3011 3012 /*---------------------------------------------------------------------------- 3013 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 3014 | `absZ1', with binary point between bits 63 and 64 (between the input words), 3015 | and returns the properly rounded 64-bit unsigned integer corresponding to the 3016 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 3017 | with the inexact exception raised if the input cannot be represented exactly 3018 | as an integer. However, if the fixed-point input is too large, the invalid 3019 | exception is raised and the largest unsigned integer is returned. 3020 *----------------------------------------------------------------------------*/ 3021 3022 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0, 3023 uint64_t absZ1, float_status *status) 3024 { 3025 int8_t roundingMode; 3026 flag roundNearestEven, increment; 3027 3028 roundingMode = status->float_rounding_mode; 3029 roundNearestEven = (roundingMode == float_round_nearest_even); 3030 switch (roundingMode) { 3031 case float_round_nearest_even: 3032 case float_round_ties_away: 3033 increment = ((int64_t)absZ1 < 0); 3034 break; 3035 case float_round_to_zero: 3036 increment = 0; 3037 break; 3038 case float_round_up: 3039 increment = !zSign && absZ1; 3040 break; 3041 case float_round_down: 3042 increment = zSign && absZ1; 3043 break; 3044 default: 3045 abort(); 3046 } 3047 if (increment) { 3048 ++absZ0; 3049 if (absZ0 == 0) { 3050 float_raise(float_flag_invalid, status); 3051 return LIT64(0xFFFFFFFFFFFFFFFF); 3052 } 3053 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 3054 } 3055 3056 if (zSign && absZ0) { 3057 float_raise(float_flag_invalid, status); 3058 return 0; 3059 } 3060 3061 if (absZ1) { 3062 
status->float_exception_flags |= float_flag_inexact; 3063 } 3064 return absZ0; 3065 } 3066 3067 /*---------------------------------------------------------------------------- 3068 | If `a' is denormal and we are in flush-to-zero mode then set the 3069 | input-denormal exception and return zero. Otherwise just return the value. 3070 *----------------------------------------------------------------------------*/ 3071 float32 float32_squash_input_denormal(float32 a, float_status *status) 3072 { 3073 if (status->flush_inputs_to_zero) { 3074 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 3075 float_raise(float_flag_input_denormal, status); 3076 return make_float32(float32_val(a) & 0x80000000); 3077 } 3078 } 3079 return a; 3080 } 3081 3082 /*---------------------------------------------------------------------------- 3083 | Normalizes the subnormal single-precision floating-point value represented 3084 | by the denormalized significand `aSig'. The normalized exponent and 3085 | significand are stored at the locations pointed to by `zExpPtr' and 3086 | `zSigPtr', respectively. 3087 *----------------------------------------------------------------------------*/ 3088 3089 static void 3090 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 3091 { 3092 int8_t shiftCount; 3093 3094 shiftCount = clz32(aSig) - 8; 3095 *zSigPtr = aSig<<shiftCount; 3096 *zExpPtr = 1 - shiftCount; 3097 3098 } 3099 3100 /*---------------------------------------------------------------------------- 3101 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3102 | and significand `zSig', and returns the proper single-precision floating- 3103 | point value corresponding to the abstract input. Ordinarily, the abstract 3104 | value is simply rounded and packed into the single-precision format, with 3105 | the inexact exception raised if the abstract input cannot be represented 3106 | exactly. 
However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal single-
| precision floating-point number.
|     The input significand `zSig' has its binary point between bits 30
| and 29, which is 7 bits to the left of the usual location.  This shifted
| significand must be normalized or smaller.  If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding.  In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven;
    int8_t roundIncrement, roundBits;
    flag isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Pick the value added to the low 7 bits before they are shifted out. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    default:
        abort();
        break;
    }
    /* The 7 low-order bits that the final shift discards. */
    roundBits = zSig & 0x7F;
    /*
     * The cast to uint16_t makes a small negative zExp compare as a large
     * value, so this one test routes both the overflow candidates
     * (zExp >= 0xFD) and the underflow candidates (zExp < 0) to the
     * special handling below.
     */
    if ( 0xFD <= (uint16_t) zExp ) {
        if (    ( 0xFD < zExp )
             || (    ( zExp == 0xFD )
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /*
             * The significand argument is -1 (all ones) when the rounding
             * increment is 0 and 0 otherwise; per the header comment this
             * selects between the maximal finite value and infinity.
             */
            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            /*
             * Tiny if tininess is detected before rounding, or the value is
             * more than one binade too small, or adding the increment does
             * not carry into bit 31.
             */
            isTiny =
                (status->float_detect_tininess
                 == float_tininess_before_rounding)
                || ( zExp < -1 )
                || ( zSig + roundIncrement < 0x80000000 );
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            /* The shift changed the bits that will be discarded. */
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig = ( zSig + roundIncrement )>>7;
    /* Exact tie (discarded bits == 0x40) in nearest-even mode: clear bit 0. */
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper single-precision floating-
| point value corresponding to the abstract input.  This routine is just like
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
| floating-point exponent.
3194 *----------------------------------------------------------------------------*/ 3195 3196 static float32 3197 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 3198 float_status *status) 3199 { 3200 int8_t shiftCount; 3201 3202 shiftCount = clz32(zSig) - 1; 3203 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 3204 status); 3205 3206 } 3207 3208 /*---------------------------------------------------------------------------- 3209 | If `a' is denormal and we are in flush-to-zero mode then set the 3210 | input-denormal exception and return zero. Otherwise just return the value. 3211 *----------------------------------------------------------------------------*/ 3212 float64 float64_squash_input_denormal(float64 a, float_status *status) 3213 { 3214 if (status->flush_inputs_to_zero) { 3215 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 3216 float_raise(float_flag_input_denormal, status); 3217 return make_float64(float64_val(a) & (1ULL << 63)); 3218 } 3219 } 3220 return a; 3221 } 3222 3223 /*---------------------------------------------------------------------------- 3224 | Normalizes the subnormal double-precision floating-point value represented 3225 | by the denormalized significand `aSig'. The normalized exponent and 3226 | significand are stored at the locations pointed to by `zExpPtr' and 3227 | `zSigPtr', respectively. 3228 *----------------------------------------------------------------------------*/ 3229 3230 static void 3231 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 3232 { 3233 int8_t shiftCount; 3234 3235 shiftCount = clz64(aSig) - 11; 3236 *zSigPtr = aSig<<shiftCount; 3237 *zExpPtr = 1 - shiftCount; 3238 3239 } 3240 3241 /*---------------------------------------------------------------------------- 3242 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3243 | double-precision floating-point value, returning the result. 
After being 3244 | shifted into the proper positions, the three fields are simply added 3245 | together to form the result. This means that any integer portion of `zSig' 3246 | will be added into the exponent. Since a properly normalized significand 3247 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3248 | than the desired result exponent whenever `zSig' is a complete, normalized 3249 | significand. 3250 *----------------------------------------------------------------------------*/ 3251 3252 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 3253 { 3254 3255 return make_float64( 3256 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 3257 3258 } 3259 3260 /*---------------------------------------------------------------------------- 3261 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3262 | and significand `zSig', and returns the proper double-precision floating- 3263 | point value corresponding to the abstract input. Ordinarily, the abstract 3264 | value is simply rounded and packed into the double-precision format, with 3265 | the inexact exception raised if the abstract input cannot be represented 3266 | exactly. However, if the abstract value is too large, the overflow and 3267 | inexact exceptions are raised and an infinity or maximal finite value is 3268 | returned. If the abstract value is too small, the input value is rounded to 3269 | a subnormal number, and the underflow and inexact exceptions are raised if 3270 | the abstract input cannot be represented exactly as a subnormal double- 3271 | precision floating-point number. 3272 | The input significand `zSig' has its binary point between bits 62 3273 | and 61, which is 10 bits to the left of the usual location. This shifted 3274 | significand must be normalized or smaller. 
If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding.  In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven;
    int roundIncrement, roundBits;
    flag isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Pick the value added to the low 10 bits before they are shifted out. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        /* Bit 10 becomes bit 0 of the result; round up only if it is 0. */
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    /* The 10 low-order bits that the final shift discards. */
    roundBits = zSig & 0x3FF;
    /*
     * The cast to uint16_t makes a small negative zExp compare as a large
     * value, so this one test routes both the overflow candidates
     * (zExp >= 0x7FD) and the underflow candidates (zExp < 0) to the
     * special handling below.
     */
    if ( 0x7FD <= (uint16_t) zExp ) {
        if (    ( 0x7FD < zExp )
             || (    ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /*
             * Significand argument is 0 when overflowing to infinity and
             * -1 (all ones) otherwise.
             */
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            /*
             * Tiny if tininess is detected before rounding, or the value is
             * more than one binade too small, or adding the increment does
             * not carry into bit 63.
             */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < -1 )
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            /* The shift changed the bits that will be discarded. */
            roundBits = zSig & 0x3FF;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig = ( zSig + roundIncrement )>>10;
    /* Exact tie (discarded bits == 0x200) in nearest-even mode: clear bit 0. */
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper double-precision floating-
| point value corresponding to the abstract input.  This routine is just like
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3363 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 3364 | floating-point exponent. 3365 *----------------------------------------------------------------------------*/ 3366 3367 static float64 3368 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 3369 float_status *status) 3370 { 3371 int8_t shiftCount; 3372 3373 shiftCount = clz64(zSig) - 1; 3374 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 3375 status); 3376 3377 } 3378 3379 /*---------------------------------------------------------------------------- 3380 | Normalizes the subnormal extended double-precision floating-point value 3381 | represented by the denormalized significand `aSig'. The normalized exponent 3382 | and significand are stored at the locations pointed to by `zExpPtr' and 3383 | `zSigPtr', respectively. 3384 *----------------------------------------------------------------------------*/ 3385 3386 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 3387 uint64_t *zSigPtr) 3388 { 3389 int8_t shiftCount; 3390 3391 shiftCount = clz64(aSig); 3392 *zSigPtr = aSig<<shiftCount; 3393 *zExpPtr = 1 - shiftCount; 3394 } 3395 3396 /*---------------------------------------------------------------------------- 3397 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3398 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 3399 | and returns the proper extended double-precision floating-point value 3400 | corresponding to the abstract input. Ordinarily, the abstract value is 3401 | rounded and packed into the extended double-precision format, with the 3402 | inexact exception raised if the abstract input cannot be represented 3403 | exactly. However, if the abstract value is too large, the overflow and 3404 | inexact exceptions are raised and an infinity or maximal finite value is 3405 | returned. 
If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal extended
| double-precision floating-point number.
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
| number of bits as single or double precision, respectively.  Otherwise, the
| result is rounded to the full precision of the extended double-precision
| format.
|     The input significand must be normalized or smaller.  If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding.  The
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Any precision other than 32 or 64 is handled as full 80-bit. */
    if ( roundingPrecision == 80 ) goto precision80;
    if ( roundingPrecision == 64 ) {
        roundIncrement = LIT64( 0x0000000000000400 );
        roundMask = LIT64( 0x00000000000007FF );
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = LIT64( 0x0000008000000000 );
        roundMask = LIT64( 0x000000FFFFFFFFFF );
    }
    else {
        goto precision80;
    }
    /*
     * Reduced-precision path: fold any nonzero low word into bit 0 of
     * zSig0 as a sticky bit; zSig1 is not used again on this path.
     */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Keep the half-unit increment chosen above. */
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /*
     * The unsigned compare of (zExp - 1) is true for zExp >= 0x7FFE and,
     * through wrap-around, for zExp <= 0.
     */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ( zSig0 <= zSig0 + roundIncrement );
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                status->float_exception_flags |= float_flag_inexact;
            }
            zSig0 += roundIncrement;
            /* Rounding carried into bit 63: exponent becomes 1. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            /* Reuse roundIncrement as the lowest kept bit. */
            roundIncrement = roundMask + 1;
            /* Exact tie in nearest-even mode: also clear the lowest kept bit. */
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig0 += roundIncrement;
    /* Unsigned wrap-around means the significand carried out of 64 bits. */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = LIT64( 0x8000000000000000 );
    }
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full-precision path: zSig1 holds the bits below the result. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
                  && increment
                )
           ) {
            /* With roundMask 0 the finite overflow result is all ones. */
            roundMask = 0;
 overflow:
            /* Also reached by goto from the reduced-precision path above. */
            float_raise(float_flag_overflow | float_flag_inexact, status);
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            /*
             * NOTE(review): unlike the reduced-precision path, this branch
             * does not test status->flush_to_zero; confirm that is intended.
             */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ! increment
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                status->float_exception_flags |= float_flag_inexact;
            }
            /* zSig1 changed with the shift, so re-derive the increment. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Exact tie in nearest-even mode: clear bit 0. */
                zSig0 &=
                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        status->float_exception_flags |= float_flag_inexact;
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Carry out of 64 bits: renormalize. */
            ++zExp;
            zSig0 = LIT64( 0x8000000000000000 );
        }
        else {
            /* Exact tie in nearest-even mode: clear bit 0. */
            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input.  This routine is just like
| `roundAndPackFloatx80' except that the input significand does not have to be
| normalized.
*----------------------------------------------------------------------------*/

floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
                                       flag zSign, int32_t zExp,
                                       uint64_t zSig0, uint64_t zSig1,
                                       float_status *status)
{
    int8_t shiftCount;

    /* High word empty: promote the low word and account for 64 bits. */
    if ( zSig0 == 0 ) {
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    shiftCount = clz64(zSig0);
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    zExp -= shiftCount;
    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
                                zSig0, zSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the least-significant 64 fraction bits of the quadruple-precision
| floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint64_t extractFloat128Frac1( float128 a )
{

    return a.low;

}

/*----------------------------------------------------------------------------
| Returns the most-significant 48 fraction bits of the quadruple-precision
| floating-point value `a'.
3653 *----------------------------------------------------------------------------*/ 3654 3655 static inline uint64_t extractFloat128Frac0( float128 a ) 3656 { 3657 3658 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 3659 3660 } 3661 3662 /*---------------------------------------------------------------------------- 3663 | Returns the exponent bits of the quadruple-precision floating-point value 3664 | `a'. 3665 *----------------------------------------------------------------------------*/ 3666 3667 static inline int32_t extractFloat128Exp( float128 a ) 3668 { 3669 3670 return ( a.high>>48 ) & 0x7FFF; 3671 3672 } 3673 3674 /*---------------------------------------------------------------------------- 3675 | Returns the sign bit of the quadruple-precision floating-point value `a'. 3676 *----------------------------------------------------------------------------*/ 3677 3678 static inline flag extractFloat128Sign( float128 a ) 3679 { 3680 3681 return a.high>>63; 3682 3683 } 3684 3685 /*---------------------------------------------------------------------------- 3686 | Normalizes the subnormal quadruple-precision floating-point value 3687 | represented by the denormalized significand formed by the concatenation of 3688 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 3689 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 3690 | significand are stored at the location pointed to by `zSig0Ptr', and the 3691 | least significant 64 bits of the normalized significand are stored at the 3692 | location pointed to by `zSig1Ptr'. 
3693 *----------------------------------------------------------------------------*/ 3694 3695 static void 3696 normalizeFloat128Subnormal( 3697 uint64_t aSig0, 3698 uint64_t aSig1, 3699 int32_t *zExpPtr, 3700 uint64_t *zSig0Ptr, 3701 uint64_t *zSig1Ptr 3702 ) 3703 { 3704 int8_t shiftCount; 3705 3706 if ( aSig0 == 0 ) { 3707 shiftCount = clz64(aSig1) - 15; 3708 if ( shiftCount < 0 ) { 3709 *zSig0Ptr = aSig1>>( - shiftCount ); 3710 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 3711 } 3712 else { 3713 *zSig0Ptr = aSig1<<shiftCount; 3714 *zSig1Ptr = 0; 3715 } 3716 *zExpPtr = - shiftCount - 63; 3717 } 3718 else { 3719 shiftCount = clz64(aSig0) - 15; 3720 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 3721 *zExpPtr = 1 - shiftCount; 3722 } 3723 3724 } 3725 3726 /*---------------------------------------------------------------------------- 3727 | Packs the sign `zSign', the exponent `zExp', and the significand formed 3728 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 3729 | floating-point value, returning the result. After being shifted into the 3730 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 3731 | added together to form the most significant 32 bits of the result. This 3732 | means that any integer portion of `zSig0' will be added into the exponent. 3733 | Since a properly normalized significand will have an integer portion equal 3734 | to 1, the `zExp' input should be 1 less than the desired result exponent 3735 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 3736 | significand. 
3737 *----------------------------------------------------------------------------*/ 3738 3739 static inline float128 3740 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 3741 { 3742 float128 z; 3743 3744 z.low = zSig1; 3745 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 3746 return z; 3747 3748 } 3749 3750 /*---------------------------------------------------------------------------- 3751 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3752 | and extended significand formed by the concatenation of `zSig0', `zSig1', 3753 | and `zSig2', and returns the proper quadruple-precision floating-point value 3754 | corresponding to the abstract input. Ordinarily, the abstract value is 3755 | simply rounded and packed into the quadruple-precision format, with the 3756 | inexact exception raised if the abstract input cannot be represented 3757 | exactly. However, if the abstract value is too large, the overflow and 3758 | inexact exceptions are raised and an infinity or maximal finite value is 3759 | returned. If the abstract value is too small, the input value is rounded to 3760 | a subnormal number, and the underflow and inexact exceptions are raised if 3761 | the abstract input cannot be represented exactly as a subnormal quadruple- 3762 | precision floating-point number. 3763 | The input significand must be normalized or smaller. If the input 3764 | significand is not normalized, `zExp' must be 0; in that case, the result 3765 | returned is a subnormal number, and it must not require rounding. In the 3766 | usual case that the input significand is normalized, `zExp' must be 1 less 3767 | than the ``true'' floating-point exponent. The handling of underflow and 3768 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* zSig2 holds the bits below the result; decide whether to round up. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        /* Round up only when inexact and the kept low bit is currently 0. */
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    /*
     * The unsigned compare is true for zExp >= 0x7FFD and, through
     * wrap-around, for negative zExp.
     */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        if (    ( 0x7FFD < zExp )
             || (    ( zExp == 0x7FFD )
                  && eq128(
                         LIT64( 0x0001FFFFFFFFFFFF ),
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* These modes return the largest finite value, not infinity. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        LIT64( 0x0000FFFFFFFFFFFF ),
                        LIT64( 0xFFFFFFFFFFFFFFFF )
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < -1 )
                || ! increment
                || lt128(
                       zSig0,
                       zSig1,
                       LIT64( 0x0001FFFFFFFFFFFF ),
                       LIT64( 0xFFFFFFFFFFFFFFFF )
                   );
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* The significand words changed, so re-derive the increment. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        status->float_exception_flags |= float_flag_inexact;
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* Exact tie (zSig2 << 1 == 0) in nearest-even mode: clear bit 0. */
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
    }
    else {
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand formed by the concatenation of `zSig0' and `zSig1', and
| returns the proper quadruple-precision floating-point value corresponding
| to the abstract input.  This routine is just like `roundAndPackFloat128'
| except that the input significand has fewer bits and does not have to be
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
| point exponent.
3894 *----------------------------------------------------------------------------*/ 3895 3896 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 3897 uint64_t zSig0, uint64_t zSig1, 3898 float_status *status) 3899 { 3900 int8_t shiftCount; 3901 uint64_t zSig2; 3902 3903 if ( zSig0 == 0 ) { 3904 zSig0 = zSig1; 3905 zSig1 = 0; 3906 zExp -= 64; 3907 } 3908 shiftCount = clz64(zSig0) - 15; 3909 if ( 0 <= shiftCount ) { 3910 zSig2 = 0; 3911 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 3912 } 3913 else { 3914 shift128ExtraRightJamming( 3915 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 3916 } 3917 zExp -= shiftCount; 3918 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 3919 3920 } 3921 3922 3923 /*---------------------------------------------------------------------------- 3924 | Returns the result of converting the 32-bit two's complement integer `a' 3925 | to the extended double-precision floating-point format. The conversion 3926 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3927 | Arithmetic. 3928 *----------------------------------------------------------------------------*/ 3929 3930 floatx80 int32_to_floatx80(int32_t a, float_status *status) 3931 { 3932 flag zSign; 3933 uint32_t absA; 3934 int8_t shiftCount; 3935 uint64_t zSig; 3936 3937 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 3938 zSign = ( a < 0 ); 3939 absA = zSign ? - a : a; 3940 shiftCount = clz32(absA) + 32; 3941 zSig = absA; 3942 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 3943 3944 } 3945 3946 /*---------------------------------------------------------------------------- 3947 | Returns the result of converting the 32-bit two's complement integer `a' to 3948 | the quadruple-precision floating-point format. The conversion is performed 3949 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3950 *----------------------------------------------------------------------------*/ 3951 3952 float128 int32_to_float128(int32_t a, float_status *status) 3953 { 3954 flag zSign; 3955 uint32_t absA; 3956 int8_t shiftCount; 3957 uint64_t zSig0; 3958 3959 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 3960 zSign = ( a < 0 ); 3961 absA = zSign ? - a : a; 3962 shiftCount = clz32(absA) + 17; 3963 zSig0 = absA; 3964 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 3965 3966 } 3967 3968 /*---------------------------------------------------------------------------- 3969 | Returns the result of converting the 64-bit two's complement integer `a' 3970 | to the extended double-precision floating-point format. The conversion 3971 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3972 | Arithmetic. 3973 *----------------------------------------------------------------------------*/ 3974 3975 floatx80 int64_to_floatx80(int64_t a, float_status *status) 3976 { 3977 flag zSign; 3978 uint64_t absA; 3979 int8_t shiftCount; 3980 3981 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 3982 zSign = ( a < 0 ); 3983 absA = zSign ? - a : a; 3984 shiftCount = clz64(absA); 3985 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 3986 3987 } 3988 3989 /*---------------------------------------------------------------------------- 3990 | Returns the result of converting the 64-bit two's complement integer `a' to 3991 | the quadruple-precision floating-point format. The conversion is performed 3992 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3993 *----------------------------------------------------------------------------*/ 3994 3995 float128 int64_to_float128(int64_t a, float_status *status) 3996 { 3997 flag zSign; 3998 uint64_t absA; 3999 int8_t shiftCount; 4000 int32_t zExp; 4001 uint64_t zSig0, zSig1; 4002 4003 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4004 zSign = ( a < 0 ); 4005 absA = zSign ? - a : a; 4006 shiftCount = clz64(absA) + 49; 4007 zExp = 0x406E - shiftCount; 4008 if ( 64 <= shiftCount ) { 4009 zSig1 = 0; 4010 zSig0 = absA; 4011 shiftCount -= 64; 4012 } 4013 else { 4014 zSig1 = absA; 4015 zSig0 = 0; 4016 } 4017 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4018 return packFloat128( zSign, zExp, zSig0, zSig1 ); 4019 4020 } 4021 4022 /*---------------------------------------------------------------------------- 4023 | Returns the result of converting the 64-bit unsigned integer `a' 4024 | to the quadruple-precision floating-point format. The conversion is performed 4025 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4026 *----------------------------------------------------------------------------*/ 4027 4028 float128 uint64_to_float128(uint64_t a, float_status *status) 4029 { 4030 if (a == 0) { 4031 return float128_zero; 4032 } 4033 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status); 4034 } 4035 4036 /*---------------------------------------------------------------------------- 4037 | Returns the result of converting the single-precision floating-point value 4038 | `a' to the extended double-precision floating-point format. The conversion 4039 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4040 | Arithmetic. 
4041 *----------------------------------------------------------------------------*/ 4042 4043 floatx80 float32_to_floatx80(float32 a, float_status *status) 4044 { 4045 flag aSign; 4046 int aExp; 4047 uint32_t aSig; 4048 4049 a = float32_squash_input_denormal(a, status); 4050 aSig = extractFloat32Frac( a ); 4051 aExp = extractFloat32Exp( a ); 4052 aSign = extractFloat32Sign( a ); 4053 if ( aExp == 0xFF ) { 4054 if (aSig) { 4055 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 4056 } 4057 return packFloatx80(aSign, 4058 floatx80_infinity_high, 4059 floatx80_infinity_low); 4060 } 4061 if ( aExp == 0 ) { 4062 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4063 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4064 } 4065 aSig |= 0x00800000; 4066 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 4067 4068 } 4069 4070 /*---------------------------------------------------------------------------- 4071 | Returns the result of converting the single-precision floating-point value 4072 | `a' to the double-precision floating-point format. The conversion is 4073 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4074 | Arithmetic. 
4075 *----------------------------------------------------------------------------*/ 4076 4077 float128 float32_to_float128(float32 a, float_status *status) 4078 { 4079 flag aSign; 4080 int aExp; 4081 uint32_t aSig; 4082 4083 a = float32_squash_input_denormal(a, status); 4084 aSig = extractFloat32Frac( a ); 4085 aExp = extractFloat32Exp( a ); 4086 aSign = extractFloat32Sign( a ); 4087 if ( aExp == 0xFF ) { 4088 if (aSig) { 4089 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 4090 } 4091 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4092 } 4093 if ( aExp == 0 ) { 4094 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4095 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4096 --aExp; 4097 } 4098 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 4099 4100 } 4101 4102 /*---------------------------------------------------------------------------- 4103 | Returns the remainder of the single-precision floating-point value `a' 4104 | with respect to the corresponding value `b'. The operation is performed 4105 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4106 *----------------------------------------------------------------------------*/ 4107 4108 float32 float32_rem(float32 a, float32 b, float_status *status) 4109 { 4110 flag aSign, zSign; 4111 int aExp, bExp, expDiff; 4112 uint32_t aSig, bSig; 4113 uint32_t q; 4114 uint64_t aSig64, bSig64, q64; 4115 uint32_t alternateASig; 4116 int32_t sigMean; 4117 a = float32_squash_input_denormal(a, status); 4118 b = float32_squash_input_denormal(b, status); 4119 4120 aSig = extractFloat32Frac( a ); 4121 aExp = extractFloat32Exp( a ); 4122 aSign = extractFloat32Sign( a ); 4123 bSig = extractFloat32Frac( b ); 4124 bExp = extractFloat32Exp( b ); 4125 if ( aExp == 0xFF ) { 4126 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 4127 return propagateFloat32NaN(a, b, status); 4128 } 4129 float_raise(float_flag_invalid, status); 4130 return float32_default_nan(status); 4131 } 4132 if ( bExp == 0xFF ) { 4133 if (bSig) { 4134 return propagateFloat32NaN(a, b, status); 4135 } 4136 return a; 4137 } 4138 if ( bExp == 0 ) { 4139 if ( bSig == 0 ) { 4140 float_raise(float_flag_invalid, status); 4141 return float32_default_nan(status); 4142 } 4143 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 4144 } 4145 if ( aExp == 0 ) { 4146 if ( aSig == 0 ) return a; 4147 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4148 } 4149 expDiff = aExp - bExp; 4150 aSig |= 0x00800000; 4151 bSig |= 0x00800000; 4152 if ( expDiff < 32 ) { 4153 aSig <<= 8; 4154 bSig <<= 8; 4155 if ( expDiff < 0 ) { 4156 if ( expDiff < -1 ) return a; 4157 aSig >>= 1; 4158 } 4159 q = ( bSig <= aSig ); 4160 if ( q ) aSig -= bSig; 4161 if ( 0 < expDiff ) { 4162 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 4163 q >>= 32 - expDiff; 4164 bSig >>= 2; 4165 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4166 } 4167 else { 4168 aSig >>= 2; 4169 bSig >>= 2; 4170 } 4171 } 4172 else { 4173 if ( bSig <= aSig ) aSig -= bSig; 4174 aSig64 = ( (uint64_t) aSig )<<40; 4175 bSig64 = ( (uint64_t) bSig )<<40; 4176 expDiff -= 64; 4177 while ( 0 < expDiff ) { 
4178 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 4179 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 4180 aSig64 = - ( ( bSig * q64 )<<38 ); 4181 expDiff -= 62; 4182 } 4183 expDiff += 64; 4184 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 4185 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 4186 q = q64>>( 64 - expDiff ); 4187 bSig <<= 6; 4188 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 4189 } 4190 do { 4191 alternateASig = aSig; 4192 ++q; 4193 aSig -= bSig; 4194 } while ( 0 <= (int32_t) aSig ); 4195 sigMean = aSig + alternateASig; 4196 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4197 aSig = alternateASig; 4198 } 4199 zSign = ( (int32_t) aSig < 0 ); 4200 if ( zSign ) aSig = - aSig; 4201 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 4202 } 4203 4204 4205 4206 /*---------------------------------------------------------------------------- 4207 | Returns the binary exponential of the single-precision floating-point value 4208 | `a'. The operation is performed according to the IEC/IEEE Standard for 4209 | Binary Floating-Point Arithmetic. 4210 | 4211 | Uses the following identities: 4212 | 4213 | 1. ------------------------------------------------------------------------- 4214 | x x*ln(2) 4215 | 2 = e 4216 | 4217 | 2. ------------------------------------------------------------------------- 4218 | 2 3 4 5 n 4219 | x x x x x x x 4220 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 4221 | 1! 2! 3! 4! 5! n! 
4222 *----------------------------------------------------------------------------*/ 4223 4224 static const float64 float32_exp2_coefficients[15] = 4225 { 4226 const_float64( 0x3ff0000000000000ll ), /* 1 */ 4227 const_float64( 0x3fe0000000000000ll ), /* 2 */ 4228 const_float64( 0x3fc5555555555555ll ), /* 3 */ 4229 const_float64( 0x3fa5555555555555ll ), /* 4 */ 4230 const_float64( 0x3f81111111111111ll ), /* 5 */ 4231 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 4232 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 4233 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 4234 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 4235 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 4236 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 4237 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 4238 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 4239 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 4240 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 4241 }; 4242 4243 float32 float32_exp2(float32 a, float_status *status) 4244 { 4245 flag aSign; 4246 int aExp; 4247 uint32_t aSig; 4248 float64 r, x, xn; 4249 int i; 4250 a = float32_squash_input_denormal(a, status); 4251 4252 aSig = extractFloat32Frac( a ); 4253 aExp = extractFloat32Exp( a ); 4254 aSign = extractFloat32Sign( a ); 4255 4256 if ( aExp == 0xFF) { 4257 if (aSig) { 4258 return propagateFloat32NaN(a, float32_zero, status); 4259 } 4260 return (aSign) ? 
float32_zero : a; 4261 } 4262 if (aExp == 0) { 4263 if (aSig == 0) return float32_one; 4264 } 4265 4266 float_raise(float_flag_inexact, status); 4267 4268 /* ******************************* */ 4269 /* using float64 for approximation */ 4270 /* ******************************* */ 4271 x = float32_to_float64(a, status); 4272 x = float64_mul(x, float64_ln2, status); 4273 4274 xn = x; 4275 r = float64_one; 4276 for (i = 0 ; i < 15 ; i++) { 4277 float64 f; 4278 4279 f = float64_mul(xn, float32_exp2_coefficients[i], status); 4280 r = float64_add(r, f, status); 4281 4282 xn = float64_mul(xn, x, status); 4283 } 4284 4285 return float64_to_float32(r, status); 4286 } 4287 4288 /*---------------------------------------------------------------------------- 4289 | Returns the binary log of the single-precision floating-point value `a'. 4290 | The operation is performed according to the IEC/IEEE Standard for Binary 4291 | Floating-Point Arithmetic. 4292 *----------------------------------------------------------------------------*/ 4293 float32 float32_log2(float32 a, float_status *status) 4294 { 4295 flag aSign, zSign; 4296 int aExp; 4297 uint32_t aSig, zSig, i; 4298 4299 a = float32_squash_input_denormal(a, status); 4300 aSig = extractFloat32Frac( a ); 4301 aExp = extractFloat32Exp( a ); 4302 aSign = extractFloat32Sign( a ); 4303 4304 if ( aExp == 0 ) { 4305 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 4306 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4307 } 4308 if ( aSign ) { 4309 float_raise(float_flag_invalid, status); 4310 return float32_default_nan(status); 4311 } 4312 if ( aExp == 0xFF ) { 4313 if (aSig) { 4314 return propagateFloat32NaN(a, float32_zero, status); 4315 } 4316 return a; 4317 } 4318 4319 aExp -= 0x7F; 4320 aSig |= 0x00800000; 4321 zSign = aExp < 0; 4322 zSig = aExp << 23; 4323 4324 for (i = 1 << 22; i > 0; i >>= 1) { 4325 aSig = ( (uint64_t)aSig * aSig ) >> 23; 4326 if ( aSig & 0x01000000 ) { 4327 aSig >>= 1; 4328 zSig |= i; 4329 } 4330 } 4331 
4332 if ( zSign ) 4333 zSig = -zSig; 4334 4335 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 4336 } 4337 4338 /*---------------------------------------------------------------------------- 4339 | Returns 1 if the single-precision floating-point value `a' is equal to 4340 | the corresponding value `b', and 0 otherwise. The invalid exception is 4341 | raised if either operand is a NaN. Otherwise, the comparison is performed 4342 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4343 *----------------------------------------------------------------------------*/ 4344 4345 int float32_eq(float32 a, float32 b, float_status *status) 4346 { 4347 uint32_t av, bv; 4348 a = float32_squash_input_denormal(a, status); 4349 b = float32_squash_input_denormal(b, status); 4350 4351 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4352 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4353 ) { 4354 float_raise(float_flag_invalid, status); 4355 return 0; 4356 } 4357 av = float32_val(a); 4358 bv = float32_val(b); 4359 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4360 } 4361 4362 /*---------------------------------------------------------------------------- 4363 | Returns 1 if the single-precision floating-point value `a' is less than 4364 | or equal to the corresponding value `b', and 0 otherwise. The invalid 4365 | exception is raised if either operand is a NaN. The comparison is performed 4366 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4367 *----------------------------------------------------------------------------*/ 4368 4369 int float32_le(float32 a, float32 b, float_status *status) 4370 { 4371 flag aSign, bSign; 4372 uint32_t av, bv; 4373 a = float32_squash_input_denormal(a, status); 4374 b = float32_squash_input_denormal(b, status); 4375 4376 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4377 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4378 ) { 4379 float_raise(float_flag_invalid, status); 4380 return 0; 4381 } 4382 aSign = extractFloat32Sign( a ); 4383 bSign = extractFloat32Sign( b ); 4384 av = float32_val(a); 4385 bv = float32_val(b); 4386 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4387 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4388 4389 } 4390 4391 /*---------------------------------------------------------------------------- 4392 | Returns 1 if the single-precision floating-point value `a' is less than 4393 | the corresponding value `b', and 0 otherwise. The invalid exception is 4394 | raised if either operand is a NaN. The comparison is performed according 4395 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4396 *----------------------------------------------------------------------------*/ 4397 4398 int float32_lt(float32 a, float32 b, float_status *status) 4399 { 4400 flag aSign, bSign; 4401 uint32_t av, bv; 4402 a = float32_squash_input_denormal(a, status); 4403 b = float32_squash_input_denormal(b, status); 4404 4405 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4406 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4407 ) { 4408 float_raise(float_flag_invalid, status); 4409 return 0; 4410 } 4411 aSign = extractFloat32Sign( a ); 4412 bSign = extractFloat32Sign( b ); 4413 av = float32_val(a); 4414 bv = float32_val(b); 4415 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 4416 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4417 4418 } 4419 4420 /*---------------------------------------------------------------------------- 4421 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 4422 | be compared, and 0 otherwise. The invalid exception is raised if either 4423 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4424 | Standard for Binary Floating-Point Arithmetic. 4425 *----------------------------------------------------------------------------*/ 4426 4427 int float32_unordered(float32 a, float32 b, float_status *status) 4428 { 4429 a = float32_squash_input_denormal(a, status); 4430 b = float32_squash_input_denormal(b, status); 4431 4432 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4433 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4434 ) { 4435 float_raise(float_flag_invalid, status); 4436 return 1; 4437 } 4438 return 0; 4439 } 4440 4441 /*---------------------------------------------------------------------------- 4442 | Returns 1 if the single-precision floating-point value `a' is equal to 4443 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4444 | exception. 
The comparison is performed according to the IEC/IEEE Standard 4445 | for Binary Floating-Point Arithmetic. 4446 *----------------------------------------------------------------------------*/ 4447 4448 int float32_eq_quiet(float32 a, float32 b, float_status *status) 4449 { 4450 a = float32_squash_input_denormal(a, status); 4451 b = float32_squash_input_denormal(b, status); 4452 4453 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4454 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4455 ) { 4456 if (float32_is_signaling_nan(a, status) 4457 || float32_is_signaling_nan(b, status)) { 4458 float_raise(float_flag_invalid, status); 4459 } 4460 return 0; 4461 } 4462 return ( float32_val(a) == float32_val(b) ) || 4463 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 4464 } 4465 4466 /*---------------------------------------------------------------------------- 4467 | Returns 1 if the single-precision floating-point value `a' is less than or 4468 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4469 | cause an exception. Otherwise, the comparison is performed according to the 4470 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4471 *----------------------------------------------------------------------------*/ 4472 4473 int float32_le_quiet(float32 a, float32 b, float_status *status) 4474 { 4475 flag aSign, bSign; 4476 uint32_t av, bv; 4477 a = float32_squash_input_denormal(a, status); 4478 b = float32_squash_input_denormal(b, status); 4479 4480 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4481 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4482 ) { 4483 if (float32_is_signaling_nan(a, status) 4484 || float32_is_signaling_nan(b, status)) { 4485 float_raise(float_flag_invalid, status); 4486 } 4487 return 0; 4488 } 4489 aSign = extractFloat32Sign( a ); 4490 bSign = extractFloat32Sign( b ); 4491 av = float32_val(a); 4492 bv = float32_val(b); 4493 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4494 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4495 4496 } 4497 4498 /*---------------------------------------------------------------------------- 4499 | Returns 1 if the single-precision floating-point value `a' is less than 4500 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4501 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4502 | Standard for Binary Floating-Point Arithmetic. 
4503 *----------------------------------------------------------------------------*/ 4504 4505 int float32_lt_quiet(float32 a, float32 b, float_status *status) 4506 { 4507 flag aSign, bSign; 4508 uint32_t av, bv; 4509 a = float32_squash_input_denormal(a, status); 4510 b = float32_squash_input_denormal(b, status); 4511 4512 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4513 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4514 ) { 4515 if (float32_is_signaling_nan(a, status) 4516 || float32_is_signaling_nan(b, status)) { 4517 float_raise(float_flag_invalid, status); 4518 } 4519 return 0; 4520 } 4521 aSign = extractFloat32Sign( a ); 4522 bSign = extractFloat32Sign( b ); 4523 av = float32_val(a); 4524 bv = float32_val(b); 4525 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 4526 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4527 4528 } 4529 4530 /*---------------------------------------------------------------------------- 4531 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 4532 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4533 | comparison is performed according to the IEC/IEEE Standard for Binary 4534 | Floating-Point Arithmetic. 
4535 *----------------------------------------------------------------------------*/ 4536 4537 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 4538 { 4539 a = float32_squash_input_denormal(a, status); 4540 b = float32_squash_input_denormal(b, status); 4541 4542 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4543 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4544 ) { 4545 if (float32_is_signaling_nan(a, status) 4546 || float32_is_signaling_nan(b, status)) { 4547 float_raise(float_flag_invalid, status); 4548 } 4549 return 1; 4550 } 4551 return 0; 4552 } 4553 4554 /*---------------------------------------------------------------------------- 4555 | If `a' is denormal and we are in flush-to-zero mode then set the 4556 | input-denormal exception and return zero. Otherwise just return the value. 4557 *----------------------------------------------------------------------------*/ 4558 float16 float16_squash_input_denormal(float16 a, float_status *status) 4559 { 4560 if (status->flush_inputs_to_zero) { 4561 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) { 4562 float_raise(float_flag_input_denormal, status); 4563 return make_float16(float16_val(a) & 0x8000); 4564 } 4565 } 4566 return a; 4567 } 4568 4569 /*---------------------------------------------------------------------------- 4570 | Returns the result of converting the double-precision floating-point value 4571 | `a' to the extended double-precision floating-point format. The conversion 4572 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4573 | Arithmetic. 
4574 *----------------------------------------------------------------------------*/ 4575 4576 floatx80 float64_to_floatx80(float64 a, float_status *status) 4577 { 4578 flag aSign; 4579 int aExp; 4580 uint64_t aSig; 4581 4582 a = float64_squash_input_denormal(a, status); 4583 aSig = extractFloat64Frac( a ); 4584 aExp = extractFloat64Exp( a ); 4585 aSign = extractFloat64Sign( a ); 4586 if ( aExp == 0x7FF ) { 4587 if (aSig) { 4588 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 4589 } 4590 return packFloatx80(aSign, 4591 floatx80_infinity_high, 4592 floatx80_infinity_low); 4593 } 4594 if ( aExp == 0 ) { 4595 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4596 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4597 } 4598 return 4599 packFloatx80( 4600 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 4601 4602 } 4603 4604 /*---------------------------------------------------------------------------- 4605 | Returns the result of converting the double-precision floating-point value 4606 | `a' to the quadruple-precision floating-point format. The conversion is 4607 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4608 | Arithmetic. 
4609 *----------------------------------------------------------------------------*/ 4610 4611 float128 float64_to_float128(float64 a, float_status *status) 4612 { 4613 flag aSign; 4614 int aExp; 4615 uint64_t aSig, zSig0, zSig1; 4616 4617 a = float64_squash_input_denormal(a, status); 4618 aSig = extractFloat64Frac( a ); 4619 aExp = extractFloat64Exp( a ); 4620 aSign = extractFloat64Sign( a ); 4621 if ( aExp == 0x7FF ) { 4622 if (aSig) { 4623 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 4624 } 4625 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4626 } 4627 if ( aExp == 0 ) { 4628 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4629 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4630 --aExp; 4631 } 4632 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 4633 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 4634 4635 } 4636 4637 4638 /*---------------------------------------------------------------------------- 4639 | Returns the remainder of the double-precision floating-point value `a' 4640 | with respect to the corresponding value `b'. The operation is performed 4641 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4642 *----------------------------------------------------------------------------*/ 4643 4644 float64 float64_rem(float64 a, float64 b, float_status *status) 4645 { 4646 flag aSign, zSign; 4647 int aExp, bExp, expDiff; 4648 uint64_t aSig, bSig; 4649 uint64_t q, alternateASig; 4650 int64_t sigMean; 4651 4652 a = float64_squash_input_denormal(a, status); 4653 b = float64_squash_input_denormal(b, status); 4654 aSig = extractFloat64Frac( a ); 4655 aExp = extractFloat64Exp( a ); 4656 aSign = extractFloat64Sign( a ); 4657 bSig = extractFloat64Frac( b ); 4658 bExp = extractFloat64Exp( b ); 4659 if ( aExp == 0x7FF ) { 4660 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4661 return propagateFloat64NaN(a, b, status); 4662 } 4663 float_raise(float_flag_invalid, status); 4664 return float64_default_nan(status); 4665 } 4666 if ( bExp == 0x7FF ) { 4667 if (bSig) { 4668 return propagateFloat64NaN(a, b, status); 4669 } 4670 return a; 4671 } 4672 if ( bExp == 0 ) { 4673 if ( bSig == 0 ) { 4674 float_raise(float_flag_invalid, status); 4675 return float64_default_nan(status); 4676 } 4677 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4678 } 4679 if ( aExp == 0 ) { 4680 if ( aSig == 0 ) return a; 4681 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4682 } 4683 expDiff = aExp - bExp; 4684 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 4685 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4686 if ( expDiff < 0 ) { 4687 if ( expDiff < -1 ) return a; 4688 aSig >>= 1; 4689 } 4690 q = ( bSig <= aSig ); 4691 if ( q ) aSig -= bSig; 4692 expDiff -= 64; 4693 while ( 0 < expDiff ) { 4694 q = estimateDiv128To64( aSig, 0, bSig ); 4695 q = ( 2 < q ) ? q - 2 : 0; 4696 aSig = - ( ( bSig>>2 ) * q ); 4697 expDiff -= 62; 4698 } 4699 expDiff += 64; 4700 if ( 0 < expDiff ) { 4701 q = estimateDiv128To64( aSig, 0, bSig ); 4702 q = ( 2 < q ) ? 
q - 2 : 0; 4703 q >>= 64 - expDiff; 4704 bSig >>= 2; 4705 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4706 } 4707 else { 4708 aSig >>= 2; 4709 bSig >>= 2; 4710 } 4711 do { 4712 alternateASig = aSig; 4713 ++q; 4714 aSig -= bSig; 4715 } while ( 0 <= (int64_t) aSig ); 4716 sigMean = aSig + alternateASig; 4717 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4718 aSig = alternateASig; 4719 } 4720 zSign = ( (int64_t) aSig < 0 ); 4721 if ( zSign ) aSig = - aSig; 4722 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 4723 4724 } 4725 4726 /*---------------------------------------------------------------------------- 4727 | Returns the binary log of the double-precision floating-point value `a'. 4728 | The operation is performed according to the IEC/IEEE Standard for Binary 4729 | Floating-Point Arithmetic. 4730 *----------------------------------------------------------------------------*/ 4731 float64 float64_log2(float64 a, float_status *status) 4732 { 4733 flag aSign, zSign; 4734 int aExp; 4735 uint64_t aSig, aSig0, aSig1, zSig, i; 4736 a = float64_squash_input_denormal(a, status); 4737 4738 aSig = extractFloat64Frac( a ); 4739 aExp = extractFloat64Exp( a ); 4740 aSign = extractFloat64Sign( a ); 4741 4742 if ( aExp == 0 ) { 4743 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4744 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4745 } 4746 if ( aSign ) { 4747 float_raise(float_flag_invalid, status); 4748 return float64_default_nan(status); 4749 } 4750 if ( aExp == 0x7FF ) { 4751 if (aSig) { 4752 return propagateFloat64NaN(a, float64_zero, status); 4753 } 4754 return a; 4755 } 4756 4757 aExp -= 0x3FF; 4758 aSig |= LIT64( 0x0010000000000000 ); 4759 zSign = aExp < 0; 4760 zSig = (uint64_t)aExp << 52; 4761 for (i = 1LL << 51; i > 0; i >>= 1) { 4762 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4763 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4764 if ( aSig & LIT64( 0x0020000000000000 ) ) { 4765 aSig >>= 1; 4766 zSig |= i; 4767 } 
4768 } 4769 4770 if ( zSign ) 4771 zSig = -zSig; 4772 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 4773 } 4774 4775 /*---------------------------------------------------------------------------- 4776 | Returns 1 if the double-precision floating-point value `a' is equal to the 4777 | corresponding value `b', and 0 otherwise. The invalid exception is raised 4778 | if either operand is a NaN. Otherwise, the comparison is performed 4779 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4780 *----------------------------------------------------------------------------*/ 4781 4782 int float64_eq(float64 a, float64 b, float_status *status) 4783 { 4784 uint64_t av, bv; 4785 a = float64_squash_input_denormal(a, status); 4786 b = float64_squash_input_denormal(b, status); 4787 4788 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4789 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4790 ) { 4791 float_raise(float_flag_invalid, status); 4792 return 0; 4793 } 4794 av = float64_val(a); 4795 bv = float64_val(b); 4796 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4797 4798 } 4799 4800 /*---------------------------------------------------------------------------- 4801 | Returns 1 if the double-precision floating-point value `a' is less than or 4802 | equal to the corresponding value `b', and 0 otherwise. The invalid 4803 | exception is raised if either operand is a NaN. The comparison is performed 4804 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4805 *----------------------------------------------------------------------------*/ 4806 4807 int float64_le(float64 a, float64 b, float_status *status) 4808 { 4809 flag aSign, bSign; 4810 uint64_t av, bv; 4811 a = float64_squash_input_denormal(a, status); 4812 b = float64_squash_input_denormal(b, status); 4813 4814 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4815 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4816 ) { 4817 float_raise(float_flag_invalid, status); 4818 return 0; 4819 } 4820 aSign = extractFloat64Sign( a ); 4821 bSign = extractFloat64Sign( b ); 4822 av = float64_val(a); 4823 bv = float64_val(b); 4824 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4825 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4826 4827 } 4828 4829 /*---------------------------------------------------------------------------- 4830 | Returns 1 if the double-precision floating-point value `a' is less than 4831 | the corresponding value `b', and 0 otherwise. The invalid exception is 4832 | raised if either operand is a NaN. The comparison is performed according 4833 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4834 *----------------------------------------------------------------------------*/ 4835 4836 int float64_lt(float64 a, float64 b, float_status *status) 4837 { 4838 flag aSign, bSign; 4839 uint64_t av, bv; 4840 4841 a = float64_squash_input_denormal(a, status); 4842 b = float64_squash_input_denormal(b, status); 4843 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4844 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4845 ) { 4846 float_raise(float_flag_invalid, status); 4847 return 0; 4848 } 4849 aSign = extractFloat64Sign( a ); 4850 bSign = extractFloat64Sign( b ); 4851 av = float64_val(a); 4852 bv = float64_val(b); 4853 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4854 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4855 4856 } 4857 4858 /*---------------------------------------------------------------------------- 4859 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4860 | be compared, and 0 otherwise. The invalid exception is raised if either 4861 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4862 | Standard for Binary Floating-Point Arithmetic. 4863 *----------------------------------------------------------------------------*/ 4864 4865 int float64_unordered(float64 a, float64 b, float_status *status) 4866 { 4867 a = float64_squash_input_denormal(a, status); 4868 b = float64_squash_input_denormal(b, status); 4869 4870 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4871 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4872 ) { 4873 float_raise(float_flag_invalid, status); 4874 return 1; 4875 } 4876 return 0; 4877 } 4878 4879 /*---------------------------------------------------------------------------- 4880 | Returns 1 if the double-precision floating-point value `a' is equal to the 4881 | corresponding value `b', and 0 otherwise. 
Quiet NaNs do not cause an 4882 | exception.The comparison is performed according to the IEC/IEEE Standard 4883 | for Binary Floating-Point Arithmetic. 4884 *----------------------------------------------------------------------------*/ 4885 4886 int float64_eq_quiet(float64 a, float64 b, float_status *status) 4887 { 4888 uint64_t av, bv; 4889 a = float64_squash_input_denormal(a, status); 4890 b = float64_squash_input_denormal(b, status); 4891 4892 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4893 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4894 ) { 4895 if (float64_is_signaling_nan(a, status) 4896 || float64_is_signaling_nan(b, status)) { 4897 float_raise(float_flag_invalid, status); 4898 } 4899 return 0; 4900 } 4901 av = float64_val(a); 4902 bv = float64_val(b); 4903 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4904 4905 } 4906 4907 /*---------------------------------------------------------------------------- 4908 | Returns 1 if the double-precision floating-point value `a' is less than or 4909 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4910 | cause an exception. Otherwise, the comparison is performed according to the 4911 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4912 *----------------------------------------------------------------------------*/ 4913 4914 int float64_le_quiet(float64 a, float64 b, float_status *status) 4915 { 4916 flag aSign, bSign; 4917 uint64_t av, bv; 4918 a = float64_squash_input_denormal(a, status); 4919 b = float64_squash_input_denormal(b, status); 4920 4921 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4922 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4923 ) { 4924 if (float64_is_signaling_nan(a, status) 4925 || float64_is_signaling_nan(b, status)) { 4926 float_raise(float_flag_invalid, status); 4927 } 4928 return 0; 4929 } 4930 aSign = extractFloat64Sign( a ); 4931 bSign = extractFloat64Sign( b ); 4932 av = float64_val(a); 4933 bv = float64_val(b); 4934 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4935 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4936 4937 } 4938 4939 /*---------------------------------------------------------------------------- 4940 | Returns 1 if the double-precision floating-point value `a' is less than 4941 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4942 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4943 | Standard for Binary Floating-Point Arithmetic. 
4944 *----------------------------------------------------------------------------*/ 4945 4946 int float64_lt_quiet(float64 a, float64 b, float_status *status) 4947 { 4948 flag aSign, bSign; 4949 uint64_t av, bv; 4950 a = float64_squash_input_denormal(a, status); 4951 b = float64_squash_input_denormal(b, status); 4952 4953 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4954 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4955 ) { 4956 if (float64_is_signaling_nan(a, status) 4957 || float64_is_signaling_nan(b, status)) { 4958 float_raise(float_flag_invalid, status); 4959 } 4960 return 0; 4961 } 4962 aSign = extractFloat64Sign( a ); 4963 bSign = extractFloat64Sign( b ); 4964 av = float64_val(a); 4965 bv = float64_val(b); 4966 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4967 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4968 4969 } 4970 4971 /*---------------------------------------------------------------------------- 4972 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4973 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4974 | comparison is performed according to the IEC/IEEE Standard for Binary 4975 | Floating-Point Arithmetic. 
4976 *----------------------------------------------------------------------------*/ 4977 4978 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 4979 { 4980 a = float64_squash_input_denormal(a, status); 4981 b = float64_squash_input_denormal(b, status); 4982 4983 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4984 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4985 ) { 4986 if (float64_is_signaling_nan(a, status) 4987 || float64_is_signaling_nan(b, status)) { 4988 float_raise(float_flag_invalid, status); 4989 } 4990 return 1; 4991 } 4992 return 0; 4993 } 4994 4995 /*---------------------------------------------------------------------------- 4996 | Returns the result of converting the extended double-precision floating- 4997 | point value `a' to the 32-bit two's complement integer format. The 4998 | conversion is performed according to the IEC/IEEE Standard for Binary 4999 | Floating-Point Arithmetic---which means in particular that the conversion 5000 | is rounded according to the current rounding mode. If `a' is a NaN, the 5001 | largest positive integer is returned. Otherwise, if the conversion 5002 | overflows, the largest integer with the same sign as `a' is returned. 
5003 *----------------------------------------------------------------------------*/ 5004 5005 int32_t floatx80_to_int32(floatx80 a, float_status *status) 5006 { 5007 flag aSign; 5008 int32_t aExp, shiftCount; 5009 uint64_t aSig; 5010 5011 if (floatx80_invalid_encoding(a)) { 5012 float_raise(float_flag_invalid, status); 5013 return 1 << 31; 5014 } 5015 aSig = extractFloatx80Frac( a ); 5016 aExp = extractFloatx80Exp( a ); 5017 aSign = extractFloatx80Sign( a ); 5018 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5019 shiftCount = 0x4037 - aExp; 5020 if ( shiftCount <= 0 ) shiftCount = 1; 5021 shift64RightJamming( aSig, shiftCount, &aSig ); 5022 return roundAndPackInt32(aSign, aSig, status); 5023 5024 } 5025 5026 /*---------------------------------------------------------------------------- 5027 | Returns the result of converting the extended double-precision floating- 5028 | point value `a' to the 32-bit two's complement integer format. The 5029 | conversion is performed according to the IEC/IEEE Standard for Binary 5030 | Floating-Point Arithmetic, except that the conversion is always rounded 5031 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5032 | Otherwise, if the conversion overflows, the largest integer with the same 5033 | sign as `a' is returned. 
5034 *----------------------------------------------------------------------------*/ 5035 5036 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 5037 { 5038 flag aSign; 5039 int32_t aExp, shiftCount; 5040 uint64_t aSig, savedASig; 5041 int32_t z; 5042 5043 if (floatx80_invalid_encoding(a)) { 5044 float_raise(float_flag_invalid, status); 5045 return 1 << 31; 5046 } 5047 aSig = extractFloatx80Frac( a ); 5048 aExp = extractFloatx80Exp( a ); 5049 aSign = extractFloatx80Sign( a ); 5050 if ( 0x401E < aExp ) { 5051 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5052 goto invalid; 5053 } 5054 else if ( aExp < 0x3FFF ) { 5055 if (aExp || aSig) { 5056 status->float_exception_flags |= float_flag_inexact; 5057 } 5058 return 0; 5059 } 5060 shiftCount = 0x403E - aExp; 5061 savedASig = aSig; 5062 aSig >>= shiftCount; 5063 z = aSig; 5064 if ( aSign ) z = - z; 5065 if ( ( z < 0 ) ^ aSign ) { 5066 invalid: 5067 float_raise(float_flag_invalid, status); 5068 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5069 } 5070 if ( ( aSig<<shiftCount ) != savedASig ) { 5071 status->float_exception_flags |= float_flag_inexact; 5072 } 5073 return z; 5074 5075 } 5076 5077 /*---------------------------------------------------------------------------- 5078 | Returns the result of converting the extended double-precision floating- 5079 | point value `a' to the 64-bit two's complement integer format. The 5080 | conversion is performed according to the IEC/IEEE Standard for Binary 5081 | Floating-Point Arithmetic---which means in particular that the conversion 5082 | is rounded according to the current rounding mode. If `a' is a NaN, 5083 | the largest positive integer is returned. Otherwise, if the conversion 5084 | overflows, the largest integer with the same sign as `a' is returned. 
5085 *----------------------------------------------------------------------------*/ 5086 5087 int64_t floatx80_to_int64(floatx80 a, float_status *status) 5088 { 5089 flag aSign; 5090 int32_t aExp, shiftCount; 5091 uint64_t aSig, aSigExtra; 5092 5093 if (floatx80_invalid_encoding(a)) { 5094 float_raise(float_flag_invalid, status); 5095 return 1ULL << 63; 5096 } 5097 aSig = extractFloatx80Frac( a ); 5098 aExp = extractFloatx80Exp( a ); 5099 aSign = extractFloatx80Sign( a ); 5100 shiftCount = 0x403E - aExp; 5101 if ( shiftCount <= 0 ) { 5102 if ( shiftCount ) { 5103 float_raise(float_flag_invalid, status); 5104 if (!aSign || floatx80_is_any_nan(a)) { 5105 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5106 } 5107 return (int64_t) LIT64( 0x8000000000000000 ); 5108 } 5109 aSigExtra = 0; 5110 } 5111 else { 5112 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 5113 } 5114 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 5115 5116 } 5117 5118 /*---------------------------------------------------------------------------- 5119 | Returns the result of converting the extended double-precision floating- 5120 | point value `a' to the 64-bit two's complement integer format. The 5121 | conversion is performed according to the IEC/IEEE Standard for Binary 5122 | Floating-Point Arithmetic, except that the conversion is always rounded 5123 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5124 | Otherwise, if the conversion overflows, the largest integer with the same 5125 | sign as `a' is returned. 
5126 *----------------------------------------------------------------------------*/ 5127 5128 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 5129 { 5130 flag aSign; 5131 int32_t aExp, shiftCount; 5132 uint64_t aSig; 5133 int64_t z; 5134 5135 if (floatx80_invalid_encoding(a)) { 5136 float_raise(float_flag_invalid, status); 5137 return 1ULL << 63; 5138 } 5139 aSig = extractFloatx80Frac( a ); 5140 aExp = extractFloatx80Exp( a ); 5141 aSign = extractFloatx80Sign( a ); 5142 shiftCount = aExp - 0x403E; 5143 if ( 0 <= shiftCount ) { 5144 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 5145 if ( ( a.high != 0xC03E ) || aSig ) { 5146 float_raise(float_flag_invalid, status); 5147 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 5148 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5149 } 5150 } 5151 return (int64_t) LIT64( 0x8000000000000000 ); 5152 } 5153 else if ( aExp < 0x3FFF ) { 5154 if (aExp | aSig) { 5155 status->float_exception_flags |= float_flag_inexact; 5156 } 5157 return 0; 5158 } 5159 z = aSig>>( - shiftCount ); 5160 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 5161 status->float_exception_flags |= float_flag_inexact; 5162 } 5163 if ( aSign ) z = - z; 5164 return z; 5165 5166 } 5167 5168 /*---------------------------------------------------------------------------- 5169 | Returns the result of converting the extended double-precision floating- 5170 | point value `a' to the single-precision floating-point format. The 5171 | conversion is performed according to the IEC/IEEE Standard for Binary 5172 | Floating-Point Arithmetic. 
5173 *----------------------------------------------------------------------------*/ 5174 5175 float32 floatx80_to_float32(floatx80 a, float_status *status) 5176 { 5177 flag aSign; 5178 int32_t aExp; 5179 uint64_t aSig; 5180 5181 if (floatx80_invalid_encoding(a)) { 5182 float_raise(float_flag_invalid, status); 5183 return float32_default_nan(status); 5184 } 5185 aSig = extractFloatx80Frac( a ); 5186 aExp = extractFloatx80Exp( a ); 5187 aSign = extractFloatx80Sign( a ); 5188 if ( aExp == 0x7FFF ) { 5189 if ( (uint64_t) ( aSig<<1 ) ) { 5190 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 5191 } 5192 return packFloat32( aSign, 0xFF, 0 ); 5193 } 5194 shift64RightJamming( aSig, 33, &aSig ); 5195 if ( aExp || aSig ) aExp -= 0x3F81; 5196 return roundAndPackFloat32(aSign, aExp, aSig, status); 5197 5198 } 5199 5200 /*---------------------------------------------------------------------------- 5201 | Returns the result of converting the extended double-precision floating- 5202 | point value `a' to the double-precision floating-point format. The 5203 | conversion is performed according to the IEC/IEEE Standard for Binary 5204 | Floating-Point Arithmetic. 
5205 *----------------------------------------------------------------------------*/ 5206 5207 float64 floatx80_to_float64(floatx80 a, float_status *status) 5208 { 5209 flag aSign; 5210 int32_t aExp; 5211 uint64_t aSig, zSig; 5212 5213 if (floatx80_invalid_encoding(a)) { 5214 float_raise(float_flag_invalid, status); 5215 return float64_default_nan(status); 5216 } 5217 aSig = extractFloatx80Frac( a ); 5218 aExp = extractFloatx80Exp( a ); 5219 aSign = extractFloatx80Sign( a ); 5220 if ( aExp == 0x7FFF ) { 5221 if ( (uint64_t) ( aSig<<1 ) ) { 5222 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 5223 } 5224 return packFloat64( aSign, 0x7FF, 0 ); 5225 } 5226 shift64RightJamming( aSig, 1, &zSig ); 5227 if ( aExp || aSig ) aExp -= 0x3C01; 5228 return roundAndPackFloat64(aSign, aExp, zSig, status); 5229 5230 } 5231 5232 /*---------------------------------------------------------------------------- 5233 | Returns the result of converting the extended double-precision floating- 5234 | point value `a' to the quadruple-precision floating-point format. The 5235 | conversion is performed according to the IEC/IEEE Standard for Binary 5236 | Floating-Point Arithmetic. 
5237 *----------------------------------------------------------------------------*/ 5238 5239 float128 floatx80_to_float128(floatx80 a, float_status *status) 5240 { 5241 flag aSign; 5242 int aExp; 5243 uint64_t aSig, zSig0, zSig1; 5244 5245 if (floatx80_invalid_encoding(a)) { 5246 float_raise(float_flag_invalid, status); 5247 return float128_default_nan(status); 5248 } 5249 aSig = extractFloatx80Frac( a ); 5250 aExp = extractFloatx80Exp( a ); 5251 aSign = extractFloatx80Sign( a ); 5252 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5253 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 5254 } 5255 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5256 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5257 5258 } 5259 5260 /*---------------------------------------------------------------------------- 5261 | Rounds the extended double-precision floating-point value `a' 5262 | to the precision provided by floatx80_rounding_precision and returns the 5263 | result as an extended double-precision floating-point value. 5264 | The operation is performed according to the IEC/IEEE Standard for Binary 5265 | Floating-Point Arithmetic. 5266 *----------------------------------------------------------------------------*/ 5267 5268 floatx80 floatx80_round(floatx80 a, float_status *status) 5269 { 5270 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5271 extractFloatx80Sign(a), 5272 extractFloatx80Exp(a), 5273 extractFloatx80Frac(a), 0, status); 5274 } 5275 5276 /*---------------------------------------------------------------------------- 5277 | Rounds the extended double-precision floating-point value `a' to an integer, 5278 | and returns the result as an extended quadruple-precision floating-point 5279 | value. The operation is performed according to the IEC/IEEE Standard for 5280 | Binary Floating-Point Arithmetic. 
5281 *----------------------------------------------------------------------------*/ 5282 5283 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 5284 { 5285 flag aSign; 5286 int32_t aExp; 5287 uint64_t lastBitMask, roundBitsMask; 5288 floatx80 z; 5289 5290 if (floatx80_invalid_encoding(a)) { 5291 float_raise(float_flag_invalid, status); 5292 return floatx80_default_nan(status); 5293 } 5294 aExp = extractFloatx80Exp( a ); 5295 if ( 0x403E <= aExp ) { 5296 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 5297 return propagateFloatx80NaN(a, a, status); 5298 } 5299 return a; 5300 } 5301 if ( aExp < 0x3FFF ) { 5302 if ( ( aExp == 0 ) 5303 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 5304 return a; 5305 } 5306 status->float_exception_flags |= float_flag_inexact; 5307 aSign = extractFloatx80Sign( a ); 5308 switch (status->float_rounding_mode) { 5309 case float_round_nearest_even: 5310 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 5311 ) { 5312 return 5313 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5314 } 5315 break; 5316 case float_round_ties_away: 5317 if (aExp == 0x3FFE) { 5318 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 5319 } 5320 break; 5321 case float_round_down: 5322 return 5323 aSign ? 5324 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 5325 : packFloatx80( 0, 0, 0 ); 5326 case float_round_up: 5327 return 5328 aSign ? 
packFloatx80( 1, 0, 0 ) 5329 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5330 } 5331 return packFloatx80( aSign, 0, 0 ); 5332 } 5333 lastBitMask = 1; 5334 lastBitMask <<= 0x403E - aExp; 5335 roundBitsMask = lastBitMask - 1; 5336 z = a; 5337 switch (status->float_rounding_mode) { 5338 case float_round_nearest_even: 5339 z.low += lastBitMask>>1; 5340 if ((z.low & roundBitsMask) == 0) { 5341 z.low &= ~lastBitMask; 5342 } 5343 break; 5344 case float_round_ties_away: 5345 z.low += lastBitMask >> 1; 5346 break; 5347 case float_round_to_zero: 5348 break; 5349 case float_round_up: 5350 if (!extractFloatx80Sign(z)) { 5351 z.low += roundBitsMask; 5352 } 5353 break; 5354 case float_round_down: 5355 if (extractFloatx80Sign(z)) { 5356 z.low += roundBitsMask; 5357 } 5358 break; 5359 default: 5360 abort(); 5361 } 5362 z.low &= ~ roundBitsMask; 5363 if ( z.low == 0 ) { 5364 ++z.high; 5365 z.low = LIT64( 0x8000000000000000 ); 5366 } 5367 if (z.low != a.low) { 5368 status->float_exception_flags |= float_flag_inexact; 5369 } 5370 return z; 5371 5372 } 5373 5374 /*---------------------------------------------------------------------------- 5375 | Returns the result of adding the absolute values of the extended double- 5376 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 5377 | negated before being returned. `zSign' is ignored if the result is a NaN. 5378 | The addition is performed according to the IEC/IEEE Standard for Binary 5379 | Floating-Point Arithmetic. 
5380 *----------------------------------------------------------------------------*/ 5381 5382 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5383 float_status *status) 5384 { 5385 int32_t aExp, bExp, zExp; 5386 uint64_t aSig, bSig, zSig0, zSig1; 5387 int32_t expDiff; 5388 5389 aSig = extractFloatx80Frac( a ); 5390 aExp = extractFloatx80Exp( a ); 5391 bSig = extractFloatx80Frac( b ); 5392 bExp = extractFloatx80Exp( b ); 5393 expDiff = aExp - bExp; 5394 if ( 0 < expDiff ) { 5395 if ( aExp == 0x7FFF ) { 5396 if ((uint64_t)(aSig << 1)) { 5397 return propagateFloatx80NaN(a, b, status); 5398 } 5399 return a; 5400 } 5401 if ( bExp == 0 ) --expDiff; 5402 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5403 zExp = aExp; 5404 } 5405 else if ( expDiff < 0 ) { 5406 if ( bExp == 0x7FFF ) { 5407 if ((uint64_t)(bSig << 1)) { 5408 return propagateFloatx80NaN(a, b, status); 5409 } 5410 return packFloatx80(zSign, 5411 floatx80_infinity_high, 5412 floatx80_infinity_low); 5413 } 5414 if ( aExp == 0 ) ++expDiff; 5415 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5416 zExp = bExp; 5417 } 5418 else { 5419 if ( aExp == 0x7FFF ) { 5420 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5421 return propagateFloatx80NaN(a, b, status); 5422 } 5423 return a; 5424 } 5425 zSig1 = 0; 5426 zSig0 = aSig + bSig; 5427 if ( aExp == 0 ) { 5428 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 5429 goto roundAndPack; 5430 } 5431 zExp = aExp; 5432 goto shiftRight1; 5433 } 5434 zSig0 = aSig + bSig; 5435 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 5436 shiftRight1: 5437 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5438 zSig0 |= LIT64( 0x8000000000000000 ); 5439 ++zExp; 5440 roundAndPack: 5441 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5442 zSign, zExp, zSig0, zSig1, status); 5443 } 5444 5445 /*---------------------------------------------------------------------------- 5446 | Returns the result of subtracting the absolute 
values of the extended 5447 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 5448 | difference is negated before being returned. `zSign' is ignored if the 5449 | result is a NaN. The subtraction is performed according to the IEC/IEEE 5450 | Standard for Binary Floating-Point Arithmetic. 5451 *----------------------------------------------------------------------------*/ 5452 5453 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5454 float_status *status) 5455 { 5456 int32_t aExp, bExp, zExp; 5457 uint64_t aSig, bSig, zSig0, zSig1; 5458 int32_t expDiff; 5459 5460 aSig = extractFloatx80Frac( a ); 5461 aExp = extractFloatx80Exp( a ); 5462 bSig = extractFloatx80Frac( b ); 5463 bExp = extractFloatx80Exp( b ); 5464 expDiff = aExp - bExp; 5465 if ( 0 < expDiff ) goto aExpBigger; 5466 if ( expDiff < 0 ) goto bExpBigger; 5467 if ( aExp == 0x7FFF ) { 5468 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5469 return propagateFloatx80NaN(a, b, status); 5470 } 5471 float_raise(float_flag_invalid, status); 5472 return floatx80_default_nan(status); 5473 } 5474 if ( aExp == 0 ) { 5475 aExp = 1; 5476 bExp = 1; 5477 } 5478 zSig1 = 0; 5479 if ( bSig < aSig ) goto aBigger; 5480 if ( aSig < bSig ) goto bBigger; 5481 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 5482 bExpBigger: 5483 if ( bExp == 0x7FFF ) { 5484 if ((uint64_t)(bSig << 1)) { 5485 return propagateFloatx80NaN(a, b, status); 5486 } 5487 return packFloatx80(zSign ^ 1, floatx80_infinity_high, 5488 floatx80_infinity_low); 5489 } 5490 if ( aExp == 0 ) ++expDiff; 5491 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5492 bBigger: 5493 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 5494 zExp = bExp; 5495 zSign ^= 1; 5496 goto normalizeRoundAndPack; 5497 aExpBigger: 5498 if ( aExp == 0x7FFF ) { 5499 if ((uint64_t)(aSig << 1)) { 5500 return propagateFloatx80NaN(a, b, status); 5501 } 5502 return a; 5503 } 5504 if ( bExp == 0 ) --expDiff; 5505 
shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5506 aBigger: 5507 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 5508 zExp = aExp; 5509 normalizeRoundAndPack: 5510 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 5511 zSign, zExp, zSig0, zSig1, status); 5512 } 5513 5514 /*---------------------------------------------------------------------------- 5515 | Returns the result of adding the extended double-precision floating-point 5516 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5517 | Standard for Binary Floating-Point Arithmetic. 5518 *----------------------------------------------------------------------------*/ 5519 5520 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 5521 { 5522 flag aSign, bSign; 5523 5524 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5525 float_raise(float_flag_invalid, status); 5526 return floatx80_default_nan(status); 5527 } 5528 aSign = extractFloatx80Sign( a ); 5529 bSign = extractFloatx80Sign( b ); 5530 if ( aSign == bSign ) { 5531 return addFloatx80Sigs(a, b, aSign, status); 5532 } 5533 else { 5534 return subFloatx80Sigs(a, b, aSign, status); 5535 } 5536 5537 } 5538 5539 /*---------------------------------------------------------------------------- 5540 | Returns the result of subtracting the extended double-precision floating- 5541 | point values `a' and `b'. The operation is performed according to the 5542 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5543 *----------------------------------------------------------------------------*/ 5544 5545 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5546 { 5547 flag aSign, bSign; 5548 5549 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5550 float_raise(float_flag_invalid, status); 5551 return floatx80_default_nan(status); 5552 } 5553 aSign = extractFloatx80Sign( a ); 5554 bSign = extractFloatx80Sign( b ); 5555 if ( aSign == bSign ) { 5556 return subFloatx80Sigs(a, b, aSign, status); 5557 } 5558 else { 5559 return addFloatx80Sigs(a, b, aSign, status); 5560 } 5561 5562 } 5563 5564 /*---------------------------------------------------------------------------- 5565 | Returns the result of multiplying the extended double-precision floating- 5566 | point values `a' and `b'. The operation is performed according to the 5567 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5568 *----------------------------------------------------------------------------*/ 5569 5570 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5571 { 5572 flag aSign, bSign, zSign; 5573 int32_t aExp, bExp, zExp; 5574 uint64_t aSig, bSig, zSig0, zSig1; 5575 5576 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5577 float_raise(float_flag_invalid, status); 5578 return floatx80_default_nan(status); 5579 } 5580 aSig = extractFloatx80Frac( a ); 5581 aExp = extractFloatx80Exp( a ); 5582 aSign = extractFloatx80Sign( a ); 5583 bSig = extractFloatx80Frac( b ); 5584 bExp = extractFloatx80Exp( b ); 5585 bSign = extractFloatx80Sign( b ); 5586 zSign = aSign ^ bSign; 5587 if ( aExp == 0x7FFF ) { 5588 if ( (uint64_t) ( aSig<<1 ) 5589 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5590 return propagateFloatx80NaN(a, b, status); 5591 } 5592 if ( ( bExp | bSig ) == 0 ) goto invalid; 5593 return packFloatx80(zSign, floatx80_infinity_high, 5594 floatx80_infinity_low); 5595 } 5596 if ( bExp == 0x7FFF ) { 5597 if ((uint64_t)(bSig << 
1)) { 5598 return propagateFloatx80NaN(a, b, status); 5599 } 5600 if ( ( aExp | aSig ) == 0 ) { 5601 invalid: 5602 float_raise(float_flag_invalid, status); 5603 return floatx80_default_nan(status); 5604 } 5605 return packFloatx80(zSign, floatx80_infinity_high, 5606 floatx80_infinity_low); 5607 } 5608 if ( aExp == 0 ) { 5609 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5610 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5611 } 5612 if ( bExp == 0 ) { 5613 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5614 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5615 } 5616 zExp = aExp + bExp - 0x3FFE; 5617 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 5618 if ( 0 < (int64_t) zSig0 ) { 5619 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5620 --zExp; 5621 } 5622 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5623 zSign, zExp, zSig0, zSig1, status); 5624 } 5625 5626 /*---------------------------------------------------------------------------- 5627 | Returns the result of dividing the extended double-precision floating-point 5628 | value `a' by the corresponding value `b'. The operation is performed 5629 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5630 *----------------------------------------------------------------------------*/ 5631 5632 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 5633 { 5634 flag aSign, bSign, zSign; 5635 int32_t aExp, bExp, zExp; 5636 uint64_t aSig, bSig, zSig0, zSig1; 5637 uint64_t rem0, rem1, rem2, term0, term1, term2; 5638 5639 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5640 float_raise(float_flag_invalid, status); 5641 return floatx80_default_nan(status); 5642 } 5643 aSig = extractFloatx80Frac( a ); 5644 aExp = extractFloatx80Exp( a ); 5645 aSign = extractFloatx80Sign( a ); 5646 bSig = extractFloatx80Frac( b ); 5647 bExp = extractFloatx80Exp( b ); 5648 bSign = extractFloatx80Sign( b ); 5649 zSign = aSign ^ bSign; 5650 if ( aExp == 0x7FFF ) { 5651 if ((uint64_t)(aSig << 1)) { 5652 return propagateFloatx80NaN(a, b, status); 5653 } 5654 if ( bExp == 0x7FFF ) { 5655 if ((uint64_t)(bSig << 1)) { 5656 return propagateFloatx80NaN(a, b, status); 5657 } 5658 goto invalid; 5659 } 5660 return packFloatx80(zSign, floatx80_infinity_high, 5661 floatx80_infinity_low); 5662 } 5663 if ( bExp == 0x7FFF ) { 5664 if ((uint64_t)(bSig << 1)) { 5665 return propagateFloatx80NaN(a, b, status); 5666 } 5667 return packFloatx80( zSign, 0, 0 ); 5668 } 5669 if ( bExp == 0 ) { 5670 if ( bSig == 0 ) { 5671 if ( ( aExp | aSig ) == 0 ) { 5672 invalid: 5673 float_raise(float_flag_invalid, status); 5674 return floatx80_default_nan(status); 5675 } 5676 float_raise(float_flag_divbyzero, status); 5677 return packFloatx80(zSign, floatx80_infinity_high, 5678 floatx80_infinity_low); 5679 } 5680 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5681 } 5682 if ( aExp == 0 ) { 5683 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5684 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5685 } 5686 zExp = aExp - bExp + 0x3FFE; 5687 rem1 = 0; 5688 if ( bSig <= aSig ) { 5689 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 5690 ++zExp; 5691 } 5692 zSig0 = estimateDiv128To64( aSig, 
rem1, bSig ); 5693 mul64To128( bSig, zSig0, &term0, &term1 ); 5694 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 5695 while ( (int64_t) rem0 < 0 ) { 5696 --zSig0; 5697 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 5698 } 5699 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 5700 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 5701 mul64To128( bSig, zSig1, &term1, &term2 ); 5702 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5703 while ( (int64_t) rem1 < 0 ) { 5704 --zSig1; 5705 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5706 } 5707 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5708 } 5709 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5710 zSign, zExp, zSig0, zSig1, status); 5711 } 5712 5713 /*---------------------------------------------------------------------------- 5714 | Returns the remainder of the extended double-precision floating-point value 5715 | `a' with respect to the corresponding value `b'. The operation is performed 5716 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5717 *----------------------------------------------------------------------------*/ 5718 5719 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 5720 { 5721 flag aSign, zSign; 5722 int32_t aExp, bExp, expDiff; 5723 uint64_t aSig0, aSig1, bSig; 5724 uint64_t q, term0, term1, alternateASig0, alternateASig1; 5725 5726 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5727 float_raise(float_flag_invalid, status); 5728 return floatx80_default_nan(status); 5729 } 5730 aSig0 = extractFloatx80Frac( a ); 5731 aExp = extractFloatx80Exp( a ); 5732 aSign = extractFloatx80Sign( a ); 5733 bSig = extractFloatx80Frac( b ); 5734 bExp = extractFloatx80Exp( b ); 5735 if ( aExp == 0x7FFF ) { 5736 if ( (uint64_t) ( aSig0<<1 ) 5737 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5738 return propagateFloatx80NaN(a, b, status); 5739 } 5740 goto invalid; 5741 } 5742 if ( bExp == 0x7FFF ) { 5743 if ((uint64_t)(bSig << 1)) { 5744 return propagateFloatx80NaN(a, b, status); 5745 } 5746 return a; 5747 } 5748 if ( bExp == 0 ) { 5749 if ( bSig == 0 ) { 5750 invalid: 5751 float_raise(float_flag_invalid, status); 5752 return floatx80_default_nan(status); 5753 } 5754 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5755 } 5756 if ( aExp == 0 ) { 5757 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5758 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5759 } 5760 bSig |= LIT64( 0x8000000000000000 ); 5761 zSign = aSign; 5762 expDiff = aExp - bExp; 5763 aSig1 = 0; 5764 if ( expDiff < 0 ) { 5765 if ( expDiff < -1 ) return a; 5766 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 5767 expDiff = 0; 5768 } 5769 q = ( bSig <= aSig0 ); 5770 if ( q ) aSig0 -= bSig; 5771 expDiff -= 64; 5772 while ( 0 < expDiff ) { 5773 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5774 q = ( 2 < q ) ? 
q - 2 : 0; 5775 mul64To128( bSig, q, &term0, &term1 ); 5776 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5777 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 5778 expDiff -= 62; 5779 } 5780 expDiff += 64; 5781 if ( 0 < expDiff ) { 5782 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5783 q = ( 2 < q ) ? q - 2 : 0; 5784 q >>= 64 - expDiff; 5785 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 5786 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5787 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 5788 while ( le128( term0, term1, aSig0, aSig1 ) ) { 5789 ++q; 5790 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5791 } 5792 } 5793 else { 5794 term1 = 0; 5795 term0 = bSig; 5796 } 5797 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 5798 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5799 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5800 && ( q & 1 ) ) 5801 ) { 5802 aSig0 = alternateASig0; 5803 aSig1 = alternateASig1; 5804 zSign = ! zSign; 5805 } 5806 return 5807 normalizeRoundAndPackFloatx80( 5808 80, zSign, bExp + expDiff, aSig0, aSig1, status); 5809 5810 } 5811 5812 /*---------------------------------------------------------------------------- 5813 | Returns the square root of the extended double-precision floating-point 5814 | value `a'. The operation is performed according to the IEC/IEEE Standard 5815 | for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

floatx80 floatx80_sqrt(floatx80 a, float_status *status)
{
    flag aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    /* An operand flagged by floatx80_invalid_encoding() raises invalid. */
    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        /* NaN input: propagate it. */
        if ((uint64_t)(aSig0 << 1)) {
            return propagateFloatx80NaN(a, a, status);
        }
        /* +infinity is returned unchanged; -infinity is invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* Negative zero is returned unchanged; other negatives are invalid. */
        if ( ( aExp | aSig0 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    /* Halve the unbiased exponent and re-apply the bias. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
    /* Initial estimate from the upper 32 significand bits. */
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
    /* Shift right by 2, or by 3 when the biased exponent is odd. */
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
    /* Refine the estimate to 64 bits using one division estimate. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* rem = aSig - zSig0^2; lower zSig0 until the remainder is non-negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Estimate the low word of the root from the remaining remainder. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    /*
     * The exact correction below runs only when the low 62 bits of zSig1
     * are at most 5 -- presumably the cases where the estimate's error
     * could change the rounded result.
     */
    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        /* Lower zSig1 until the 192-bit remainder is non-negative. */
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Fold any non-zero remainder into the lowest bit as a sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Assemble the 128-bit significand; the result sign is always 0. */
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= doubleZSig0;
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                0, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is equal
| to the corresponding value `b', and 0 otherwise.  The invalid exception is
| raised, and 0 is returned, if either operand is a NaN or is rejected by
| floatx80_invalid_encoding().  Otherwise, the comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
{

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
        || (extractFloatx80Exp(a) == 0x7FFF
            && (uint64_t) (extractFloatx80Frac(a) << 1))
        || (extractFloatx80Exp(b) == 0x7FFF
            && (uint64_t) (extractFloatx80Frac(b) << 1))
       ) {
        float_raise(float_flag_invalid, status);
        return 0;
    }
    /*
     * Equal when both words match, or when both significands are zero and
     * the exponent fields are zero too (the shift discards the sign bit,
     * so +0 and -0 compare equal).
     */
    return
           ( a.low == b.low )
        && (    ( a.high == b.high )
             || (    ( a.low == 0 )
                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
           );

}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is
| less than or equal to the corresponding value `b', and 0 otherwise. The
| invalid exception is raised if either operand is a NaN. The comparison is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
5917 *----------------------------------------------------------------------------*/ 5918 5919 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 5920 { 5921 flag aSign, bSign; 5922 5923 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5924 || (extractFloatx80Exp(a) == 0x7FFF 5925 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5926 || (extractFloatx80Exp(b) == 0x7FFF 5927 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5928 ) { 5929 float_raise(float_flag_invalid, status); 5930 return 0; 5931 } 5932 aSign = extractFloatx80Sign( a ); 5933 bSign = extractFloatx80Sign( b ); 5934 if ( aSign != bSign ) { 5935 return 5936 aSign 5937 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5938 == 0 ); 5939 } 5940 return 5941 aSign ? le128( b.high, b.low, a.high, a.low ) 5942 : le128( a.high, a.low, b.high, b.low ); 5943 5944 } 5945 5946 /*---------------------------------------------------------------------------- 5947 | Returns 1 if the extended double-precision floating-point value `a' is 5948 | less than the corresponding value `b', and 0 otherwise. The invalid 5949 | exception is raised if either operand is a NaN. The comparison is performed 5950 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5951 *----------------------------------------------------------------------------*/ 5952 5953 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 5954 { 5955 flag aSign, bSign; 5956 5957 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5958 || (extractFloatx80Exp(a) == 0x7FFF 5959 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5960 || (extractFloatx80Exp(b) == 0x7FFF 5961 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5962 ) { 5963 float_raise(float_flag_invalid, status); 5964 return 0; 5965 } 5966 aSign = extractFloatx80Sign( a ); 5967 bSign = extractFloatx80Sign( b ); 5968 if ( aSign != bSign ) { 5969 return 5970 aSign 5971 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5972 != 0 ); 5973 } 5974 return 5975 aSign ? lt128( b.high, b.low, a.high, a.low ) 5976 : lt128( a.high, a.low, b.high, b.low ); 5977 5978 } 5979 5980 /*---------------------------------------------------------------------------- 5981 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5982 | cannot be compared, and 0 otherwise. The invalid exception is raised if 5983 | either operand is a NaN. The comparison is performed according to the 5984 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5985 *----------------------------------------------------------------------------*/ 5986 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 5987 { 5988 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5989 || (extractFloatx80Exp(a) == 0x7FFF 5990 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5991 || (extractFloatx80Exp(b) == 0x7FFF 5992 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5993 ) { 5994 float_raise(float_flag_invalid, status); 5995 return 1; 5996 } 5997 return 0; 5998 } 5999 6000 /*---------------------------------------------------------------------------- 6001 | Returns 1 if the extended double-precision floating-point value `a' is 6002 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 6003 | cause an exception. The comparison is performed according to the IEC/IEEE 6004 | Standard for Binary Floating-Point Arithmetic. 6005 *----------------------------------------------------------------------------*/ 6006 6007 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 6008 { 6009 6010 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6011 float_raise(float_flag_invalid, status); 6012 return 0; 6013 } 6014 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6015 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6016 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6017 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6018 ) { 6019 if (floatx80_is_signaling_nan(a, status) 6020 || floatx80_is_signaling_nan(b, status)) { 6021 float_raise(float_flag_invalid, status); 6022 } 6023 return 0; 6024 } 6025 return 6026 ( a.low == b.low ) 6027 && ( ( a.high == b.high ) 6028 || ( ( a.low == 0 ) 6029 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6030 ); 6031 6032 } 6033 6034 /*---------------------------------------------------------------------------- 6035 | Returns 1 if the extended double-precision floating-point value `a' is less 6036 | than or equal to the corresponding value `b', and 0 
otherwise. Quiet NaNs 6037 | do not cause an exception. Otherwise, the comparison is performed according 6038 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6039 *----------------------------------------------------------------------------*/ 6040 6041 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status) 6042 { 6043 flag aSign, bSign; 6044 6045 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6046 float_raise(float_flag_invalid, status); 6047 return 0; 6048 } 6049 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6050 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6051 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6052 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6053 ) { 6054 if (floatx80_is_signaling_nan(a, status) 6055 || floatx80_is_signaling_nan(b, status)) { 6056 float_raise(float_flag_invalid, status); 6057 } 6058 return 0; 6059 } 6060 aSign = extractFloatx80Sign( a ); 6061 bSign = extractFloatx80Sign( b ); 6062 if ( aSign != bSign ) { 6063 return 6064 aSign 6065 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6066 == 0 ); 6067 } 6068 return 6069 aSign ? le128( b.high, b.low, a.high, a.low ) 6070 : le128( a.high, a.low, b.high, b.low ); 6071 6072 } 6073 6074 /*---------------------------------------------------------------------------- 6075 | Returns 1 if the extended double-precision floating-point value `a' is less 6076 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 6077 | an exception. Otherwise, the comparison is performed according to the 6078 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6079 *----------------------------------------------------------------------------*/ 6080 6081 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 6082 { 6083 flag aSign, bSign; 6084 6085 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6086 float_raise(float_flag_invalid, status); 6087 return 0; 6088 } 6089 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6090 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6091 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6092 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6093 ) { 6094 if (floatx80_is_signaling_nan(a, status) 6095 || floatx80_is_signaling_nan(b, status)) { 6096 float_raise(float_flag_invalid, status); 6097 } 6098 return 0; 6099 } 6100 aSign = extractFloatx80Sign( a ); 6101 bSign = extractFloatx80Sign( b ); 6102 if ( aSign != bSign ) { 6103 return 6104 aSign 6105 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6106 != 0 ); 6107 } 6108 return 6109 aSign ? lt128( b.high, b.low, a.high, a.low ) 6110 : lt128( a.high, a.low, b.high, b.low ); 6111 6112 } 6113 6114 /*---------------------------------------------------------------------------- 6115 | Returns 1 if the extended double-precision floating-point values `a' and `b' 6116 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 6117 | The comparison is performed according to the IEC/IEEE Standard for Binary 6118 | Floating-Point Arithmetic. 
6119 *----------------------------------------------------------------------------*/ 6120 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 6121 { 6122 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6123 float_raise(float_flag_invalid, status); 6124 return 1; 6125 } 6126 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6127 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6128 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6129 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6130 ) { 6131 if (floatx80_is_signaling_nan(a, status) 6132 || floatx80_is_signaling_nan(b, status)) { 6133 float_raise(float_flag_invalid, status); 6134 } 6135 return 1; 6136 } 6137 return 0; 6138 } 6139 6140 /*---------------------------------------------------------------------------- 6141 | Returns the result of converting the quadruple-precision floating-point 6142 | value `a' to the 32-bit two's complement integer format. The conversion 6143 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6144 | Arithmetic---which means in particular that the conversion is rounded 6145 | according to the current rounding mode. If `a' is a NaN, the largest 6146 | positive integer is returned. Otherwise, if the conversion overflows, the 6147 | largest integer with the same sign as `a' is returned. 
6148 *----------------------------------------------------------------------------*/ 6149 6150 int32_t float128_to_int32(float128 a, float_status *status) 6151 { 6152 flag aSign; 6153 int32_t aExp, shiftCount; 6154 uint64_t aSig0, aSig1; 6155 6156 aSig1 = extractFloat128Frac1( a ); 6157 aSig0 = extractFloat128Frac0( a ); 6158 aExp = extractFloat128Exp( a ); 6159 aSign = extractFloat128Sign( a ); 6160 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 6161 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6162 aSig0 |= ( aSig1 != 0 ); 6163 shiftCount = 0x4028 - aExp; 6164 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 6165 return roundAndPackInt32(aSign, aSig0, status); 6166 6167 } 6168 6169 /*---------------------------------------------------------------------------- 6170 | Returns the result of converting the quadruple-precision floating-point 6171 | value `a' to the 32-bit two's complement integer format. The conversion 6172 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6173 | Arithmetic, except that the conversion is always rounded toward zero. If 6174 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 6175 | conversion overflows, the largest integer with the same sign as `a' is 6176 | returned. 
6177 *----------------------------------------------------------------------------*/ 6178 6179 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 6180 { 6181 flag aSign; 6182 int32_t aExp, shiftCount; 6183 uint64_t aSig0, aSig1, savedASig; 6184 int32_t z; 6185 6186 aSig1 = extractFloat128Frac1( a ); 6187 aSig0 = extractFloat128Frac0( a ); 6188 aExp = extractFloat128Exp( a ); 6189 aSign = extractFloat128Sign( a ); 6190 aSig0 |= ( aSig1 != 0 ); 6191 if ( 0x401E < aExp ) { 6192 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 6193 goto invalid; 6194 } 6195 else if ( aExp < 0x3FFF ) { 6196 if (aExp || aSig0) { 6197 status->float_exception_flags |= float_flag_inexact; 6198 } 6199 return 0; 6200 } 6201 aSig0 |= LIT64( 0x0001000000000000 ); 6202 shiftCount = 0x402F - aExp; 6203 savedASig = aSig0; 6204 aSig0 >>= shiftCount; 6205 z = aSig0; 6206 if ( aSign ) z = - z; 6207 if ( ( z < 0 ) ^ aSign ) { 6208 invalid: 6209 float_raise(float_flag_invalid, status); 6210 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 6211 } 6212 if ( ( aSig0<<shiftCount ) != savedASig ) { 6213 status->float_exception_flags |= float_flag_inexact; 6214 } 6215 return z; 6216 6217 } 6218 6219 /*---------------------------------------------------------------------------- 6220 | Returns the result of converting the quadruple-precision floating-point 6221 | value `a' to the 64-bit two's complement integer format. The conversion 6222 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6223 | Arithmetic---which means in particular that the conversion is rounded 6224 | according to the current rounding mode. If `a' is a NaN, the largest 6225 | positive integer is returned. Otherwise, if the conversion overflows, the 6226 | largest integer with the same sign as `a' is returned. 
6227 *----------------------------------------------------------------------------*/ 6228 6229 int64_t float128_to_int64(float128 a, float_status *status) 6230 { 6231 flag aSign; 6232 int32_t aExp, shiftCount; 6233 uint64_t aSig0, aSig1; 6234 6235 aSig1 = extractFloat128Frac1( a ); 6236 aSig0 = extractFloat128Frac0( a ); 6237 aExp = extractFloat128Exp( a ); 6238 aSign = extractFloat128Sign( a ); 6239 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6240 shiftCount = 0x402F - aExp; 6241 if ( shiftCount <= 0 ) { 6242 if ( 0x403E < aExp ) { 6243 float_raise(float_flag_invalid, status); 6244 if ( ! aSign 6245 || ( ( aExp == 0x7FFF ) 6246 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 6247 ) 6248 ) { 6249 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6250 } 6251 return (int64_t) LIT64( 0x8000000000000000 ); 6252 } 6253 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 6254 } 6255 else { 6256 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 6257 } 6258 return roundAndPackInt64(aSign, aSig0, aSig1, status); 6259 6260 } 6261 6262 /*---------------------------------------------------------------------------- 6263 | Returns the result of converting the quadruple-precision floating-point 6264 | value `a' to the 64-bit two's complement integer format. The conversion 6265 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6266 | Arithmetic, except that the conversion is always rounded toward zero. 6267 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 6268 | the conversion overflows, the largest integer with the same sign as `a' is 6269 | returned. 
6270 *----------------------------------------------------------------------------*/ 6271 6272 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 6273 { 6274 flag aSign; 6275 int32_t aExp, shiftCount; 6276 uint64_t aSig0, aSig1; 6277 int64_t z; 6278 6279 aSig1 = extractFloat128Frac1( a ); 6280 aSig0 = extractFloat128Frac0( a ); 6281 aExp = extractFloat128Exp( a ); 6282 aSign = extractFloat128Sign( a ); 6283 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6284 shiftCount = aExp - 0x402F; 6285 if ( 0 < shiftCount ) { 6286 if ( 0x403E <= aExp ) { 6287 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 6288 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 6289 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 6290 if (aSig1) { 6291 status->float_exception_flags |= float_flag_inexact; 6292 } 6293 } 6294 else { 6295 float_raise(float_flag_invalid, status); 6296 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 6297 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6298 } 6299 } 6300 return (int64_t) LIT64( 0x8000000000000000 ); 6301 } 6302 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 6303 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 6304 status->float_exception_flags |= float_flag_inexact; 6305 } 6306 } 6307 else { 6308 if ( aExp < 0x3FFF ) { 6309 if ( aExp | aSig0 | aSig1 ) { 6310 status->float_exception_flags |= float_flag_inexact; 6311 } 6312 return 0; 6313 } 6314 z = aSig0>>( - shiftCount ); 6315 if ( aSig1 6316 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 6317 status->float_exception_flags |= float_flag_inexact; 6318 } 6319 } 6320 if ( aSign ) z = - z; 6321 return z; 6322 6323 } 6324 6325 /*---------------------------------------------------------------------------- 6326 | Returns the result of converting the quadruple-precision floating-point value 6327 | `a' to the 64-bit unsigned integer format. 
The conversion is 6328 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6329 | Arithmetic---which means in particular that the conversion is rounded 6330 | according to the current rounding mode. If `a' is a NaN, the largest 6331 | positive integer is returned. If the conversion overflows, the 6332 | largest unsigned integer is returned. If 'a' is negative, the value is 6333 | rounded and zero is returned; negative values that do not round to zero 6334 | will raise the inexact exception. 6335 *----------------------------------------------------------------------------*/ 6336 6337 uint64_t float128_to_uint64(float128 a, float_status *status) 6338 { 6339 flag aSign; 6340 int aExp; 6341 int shiftCount; 6342 uint64_t aSig0, aSig1; 6343 6344 aSig0 = extractFloat128Frac0(a); 6345 aSig1 = extractFloat128Frac1(a); 6346 aExp = extractFloat128Exp(a); 6347 aSign = extractFloat128Sign(a); 6348 if (aSign && (aExp > 0x3FFE)) { 6349 float_raise(float_flag_invalid, status); 6350 if (float128_is_any_nan(a)) { 6351 return LIT64(0xFFFFFFFFFFFFFFFF); 6352 } else { 6353 return 0; 6354 } 6355 } 6356 if (aExp) { 6357 aSig0 |= LIT64(0x0001000000000000); 6358 } 6359 shiftCount = 0x402F - aExp; 6360 if (shiftCount <= 0) { 6361 if (0x403E < aExp) { 6362 float_raise(float_flag_invalid, status); 6363 return LIT64(0xFFFFFFFFFFFFFFFF); 6364 } 6365 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 6366 } else { 6367 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 6368 } 6369 return roundAndPackUint64(aSign, aSig0, aSig1, status); 6370 } 6371 6372 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 6373 { 6374 uint64_t v; 6375 signed char current_rounding_mode = status->float_rounding_mode; 6376 6377 set_float_rounding_mode(float_round_to_zero, status); 6378 v = float128_to_uint64(a, status); 6379 set_float_rounding_mode(current_rounding_mode, status); 6380 6381 return v; 6382 } 6383 6384 
/*---------------------------------------------------------------------------- 6385 | Returns the result of converting the quadruple-precision floating-point 6386 | value `a' to the 32-bit unsigned integer format. The conversion 6387 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6388 | Arithmetic except that the conversion is always rounded toward zero. 6389 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6390 | if the conversion overflows, the largest unsigned integer is returned. 6391 | If 'a' is negative, the value is rounded and zero is returned; negative 6392 | values that do not round to zero will raise the inexact exception. 6393 *----------------------------------------------------------------------------*/ 6394 6395 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6396 { 6397 uint64_t v; 6398 uint32_t res; 6399 int old_exc_flags = get_float_exception_flags(status); 6400 6401 v = float128_to_uint64_round_to_zero(a, status); 6402 if (v > 0xffffffff) { 6403 res = 0xffffffff; 6404 } else { 6405 return v; 6406 } 6407 set_float_exception_flags(old_exc_flags, status); 6408 float_raise(float_flag_invalid, status); 6409 return res; 6410 } 6411 6412 /*---------------------------------------------------------------------------- 6413 | Returns the result of converting the quadruple-precision floating-point 6414 | value `a' to the single-precision floating-point format. The conversion 6415 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6416 | Arithmetic. 
6417 *----------------------------------------------------------------------------*/ 6418 6419 float32 float128_to_float32(float128 a, float_status *status) 6420 { 6421 flag aSign; 6422 int32_t aExp; 6423 uint64_t aSig0, aSig1; 6424 uint32_t zSig; 6425 6426 aSig1 = extractFloat128Frac1( a ); 6427 aSig0 = extractFloat128Frac0( a ); 6428 aExp = extractFloat128Exp( a ); 6429 aSign = extractFloat128Sign( a ); 6430 if ( aExp == 0x7FFF ) { 6431 if ( aSig0 | aSig1 ) { 6432 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6433 } 6434 return packFloat32( aSign, 0xFF, 0 ); 6435 } 6436 aSig0 |= ( aSig1 != 0 ); 6437 shift64RightJamming( aSig0, 18, &aSig0 ); 6438 zSig = aSig0; 6439 if ( aExp || zSig ) { 6440 zSig |= 0x40000000; 6441 aExp -= 0x3F81; 6442 } 6443 return roundAndPackFloat32(aSign, aExp, zSig, status); 6444 6445 } 6446 6447 /*---------------------------------------------------------------------------- 6448 | Returns the result of converting the quadruple-precision floating-point 6449 | value `a' to the double-precision floating-point format. The conversion 6450 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6451 | Arithmetic. 
6452 *----------------------------------------------------------------------------*/ 6453 6454 float64 float128_to_float64(float128 a, float_status *status) 6455 { 6456 flag aSign; 6457 int32_t aExp; 6458 uint64_t aSig0, aSig1; 6459 6460 aSig1 = extractFloat128Frac1( a ); 6461 aSig0 = extractFloat128Frac0( a ); 6462 aExp = extractFloat128Exp( a ); 6463 aSign = extractFloat128Sign( a ); 6464 if ( aExp == 0x7FFF ) { 6465 if ( aSig0 | aSig1 ) { 6466 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6467 } 6468 return packFloat64( aSign, 0x7FF, 0 ); 6469 } 6470 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6471 aSig0 |= ( aSig1 != 0 ); 6472 if ( aExp || aSig0 ) { 6473 aSig0 |= LIT64( 0x4000000000000000 ); 6474 aExp -= 0x3C01; 6475 } 6476 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6477 6478 } 6479 6480 /*---------------------------------------------------------------------------- 6481 | Returns the result of converting the quadruple-precision floating-point 6482 | value `a' to the extended double-precision floating-point format. The 6483 | conversion is performed according to the IEC/IEEE Standard for Binary 6484 | Floating-Point Arithmetic. 
6485 *----------------------------------------------------------------------------*/ 6486 6487 floatx80 float128_to_floatx80(float128 a, float_status *status) 6488 { 6489 flag aSign; 6490 int32_t aExp; 6491 uint64_t aSig0, aSig1; 6492 6493 aSig1 = extractFloat128Frac1( a ); 6494 aSig0 = extractFloat128Frac0( a ); 6495 aExp = extractFloat128Exp( a ); 6496 aSign = extractFloat128Sign( a ); 6497 if ( aExp == 0x7FFF ) { 6498 if ( aSig0 | aSig1 ) { 6499 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 6500 } 6501 return packFloatx80(aSign, floatx80_infinity_high, 6502 floatx80_infinity_low); 6503 } 6504 if ( aExp == 0 ) { 6505 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6506 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6507 } 6508 else { 6509 aSig0 |= LIT64( 0x0001000000000000 ); 6510 } 6511 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6512 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6513 6514 } 6515 6516 /*---------------------------------------------------------------------------- 6517 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6518 | returns the result as a quadruple-precision floating-point value. The 6519 | operation is performed according to the IEC/IEEE Standard for Binary 6520 | Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float128 float128_round_to_int(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        /* Rounding position lies in the low word (or at its top edge). */
        if ( 0x406F <= aExp ) {
            /* Returned unchanged, except that a NaN is propagated. */
            if (    ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            return a;
        }
        /*
         * lastBitMask selects the last bit kept in z.low.  The two-step
         * shift makes it 0 when aExp == 0x402F (a total shift of 64), in
         * which case the last kept bit is bit 0 of z.high.
         */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                /* Add half a unit; on an exact tie clear the last bit. */
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /* Top bit of z.low is the half bit; the rest decide ties. */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            /* Only non-negative values are bumped away from zero. */
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            /* Only negative values are bumped away from zero. */
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        /* Discard the bits below the rounding position. */
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Magnitude below one: zeros are returned unchanged. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            status->float_exception_flags |= float_flag_inexact;
            aSign = extractFloat128Sign( a );
            /*
             * This switch has no default: any mode not listed (including
             * float_round_to_zero) falls through to the signed zero below.
             */
            switch (status->float_rounding_mode) {
            case float_round_nearest_even:
                /* Exponent 0x3FFE with a non-zero fraction rounds to one. */
                if (    ( aExp == 0x3FFE )
                     && (   extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                /* Exponent 0x3FFE (one half or more) rounds to one. */
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
            case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
            case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );
            }
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* Rounding position lies in the high word; z.low becomes zero. */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            /* Exact tie (no lower bits in either word): clear the last bit. */
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                /* A non-zero low word counts as a sticky bit. */
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                /* A non-zero low word counts as a sticky bit. */
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        /* Discard the bits below the rounding position. */
        z.high &= ~ roundBitsMask;
    }
    /* Any change to the value means the result is inexact. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the quadruple-precision
| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
| before being returned. `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
6660 *----------------------------------------------------------------------------*/ 6661 6662 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign, 6663 float_status *status) 6664 { 6665 int32_t aExp, bExp, zExp; 6666 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6667 int32_t expDiff; 6668 6669 aSig1 = extractFloat128Frac1( a ); 6670 aSig0 = extractFloat128Frac0( a ); 6671 aExp = extractFloat128Exp( a ); 6672 bSig1 = extractFloat128Frac1( b ); 6673 bSig0 = extractFloat128Frac0( b ); 6674 bExp = extractFloat128Exp( b ); 6675 expDiff = aExp - bExp; 6676 if ( 0 < expDiff ) { 6677 if ( aExp == 0x7FFF ) { 6678 if (aSig0 | aSig1) { 6679 return propagateFloat128NaN(a, b, status); 6680 } 6681 return a; 6682 } 6683 if ( bExp == 0 ) { 6684 --expDiff; 6685 } 6686 else { 6687 bSig0 |= LIT64( 0x0001000000000000 ); 6688 } 6689 shift128ExtraRightJamming( 6690 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 6691 zExp = aExp; 6692 } 6693 else if ( expDiff < 0 ) { 6694 if ( bExp == 0x7FFF ) { 6695 if (bSig0 | bSig1) { 6696 return propagateFloat128NaN(a, b, status); 6697 } 6698 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6699 } 6700 if ( aExp == 0 ) { 6701 ++expDiff; 6702 } 6703 else { 6704 aSig0 |= LIT64( 0x0001000000000000 ); 6705 } 6706 shift128ExtraRightJamming( 6707 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 6708 zExp = bExp; 6709 } 6710 else { 6711 if ( aExp == 0x7FFF ) { 6712 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6713 return propagateFloat128NaN(a, b, status); 6714 } 6715 return a; 6716 } 6717 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6718 if ( aExp == 0 ) { 6719 if (status->flush_to_zero) { 6720 if (zSig0 | zSig1) { 6721 float_raise(float_flag_output_denormal, status); 6722 } 6723 return packFloat128(zSign, 0, 0, 0); 6724 } 6725 return packFloat128( zSign, 0, zSig0, zSig1 ); 6726 } 6727 zSig2 = 0; 6728 zSig0 |= LIT64( 0x0002000000000000 ); 6729 zExp = aExp; 6730 goto shiftRight1; 6731 } 6732 aSig0 |= LIT64( 
0x0001000000000000 ); 6733 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6734 --zExp; 6735 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 6736 ++zExp; 6737 shiftRight1: 6738 shift128ExtraRightJamming( 6739 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6740 roundAndPack: 6741 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6742 6743 } 6744 6745 /*---------------------------------------------------------------------------- 6746 | Returns the result of subtracting the absolute values of the quadruple- 6747 | precision floating-point values `a' and `b'. If `zSign' is 1, the 6748 | difference is negated before being returned. `zSign' is ignored if the 6749 | result is a NaN. The subtraction is performed according to the IEC/IEEE 6750 | Standard for Binary Floating-Point Arithmetic. 6751 *----------------------------------------------------------------------------*/ 6752 6753 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign, 6754 float_status *status) 6755 { 6756 int32_t aExp, bExp, zExp; 6757 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 6758 int32_t expDiff; 6759 6760 aSig1 = extractFloat128Frac1( a ); 6761 aSig0 = extractFloat128Frac0( a ); 6762 aExp = extractFloat128Exp( a ); 6763 bSig1 = extractFloat128Frac1( b ); 6764 bSig0 = extractFloat128Frac0( b ); 6765 bExp = extractFloat128Exp( b ); 6766 expDiff = aExp - bExp; 6767 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6768 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 6769 if ( 0 < expDiff ) goto aExpBigger; 6770 if ( expDiff < 0 ) goto bExpBigger; 6771 if ( aExp == 0x7FFF ) { 6772 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6773 return propagateFloat128NaN(a, b, status); 6774 } 6775 float_raise(float_flag_invalid, status); 6776 return float128_default_nan(status); 6777 } 6778 if ( aExp == 0 ) { 6779 aExp = 1; 6780 bExp = 1; 6781 } 6782 if ( bSig0 < aSig0 ) goto aBigger; 6783 if ( aSig0 < bSig0 ) goto bBigger; 6784 if ( bSig1 < 
aSig1 ) goto aBigger; 6785 if ( aSig1 < bSig1 ) goto bBigger; 6786 return packFloat128(status->float_rounding_mode == float_round_down, 6787 0, 0, 0); 6788 bExpBigger: 6789 if ( bExp == 0x7FFF ) { 6790 if (bSig0 | bSig1) { 6791 return propagateFloat128NaN(a, b, status); 6792 } 6793 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 6794 } 6795 if ( aExp == 0 ) { 6796 ++expDiff; 6797 } 6798 else { 6799 aSig0 |= LIT64( 0x4000000000000000 ); 6800 } 6801 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6802 bSig0 |= LIT64( 0x4000000000000000 ); 6803 bBigger: 6804 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6805 zExp = bExp; 6806 zSign ^= 1; 6807 goto normalizeRoundAndPack; 6808 aExpBigger: 6809 if ( aExp == 0x7FFF ) { 6810 if (aSig0 | aSig1) { 6811 return propagateFloat128NaN(a, b, status); 6812 } 6813 return a; 6814 } 6815 if ( bExp == 0 ) { 6816 --expDiff; 6817 } 6818 else { 6819 bSig0 |= LIT64( 0x4000000000000000 ); 6820 } 6821 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 6822 aSig0 |= LIT64( 0x4000000000000000 ); 6823 aBigger: 6824 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6825 zExp = aExp; 6826 normalizeRoundAndPack: 6827 --zExp; 6828 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, 6829 status); 6830 6831 } 6832 6833 /*---------------------------------------------------------------------------- 6834 | Returns the result of adding the quadruple-precision floating-point values 6835 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 6836 | for Binary Floating-Point Arithmetic. 
6837 *----------------------------------------------------------------------------*/ 6838 6839 float128 float128_add(float128 a, float128 b, float_status *status) 6840 { 6841 flag aSign, bSign; 6842 6843 aSign = extractFloat128Sign( a ); 6844 bSign = extractFloat128Sign( b ); 6845 if ( aSign == bSign ) { 6846 return addFloat128Sigs(a, b, aSign, status); 6847 } 6848 else { 6849 return subFloat128Sigs(a, b, aSign, status); 6850 } 6851 6852 } 6853 6854 /*---------------------------------------------------------------------------- 6855 | Returns the result of subtracting the quadruple-precision floating-point 6856 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6857 | Standard for Binary Floating-Point Arithmetic. 6858 *----------------------------------------------------------------------------*/ 6859 6860 float128 float128_sub(float128 a, float128 b, float_status *status) 6861 { 6862 flag aSign, bSign; 6863 6864 aSign = extractFloat128Sign( a ); 6865 bSign = extractFloat128Sign( b ); 6866 if ( aSign == bSign ) { 6867 return subFloat128Sigs(a, b, aSign, status); 6868 } 6869 else { 6870 return addFloat128Sigs(a, b, aSign, status); 6871 } 6872 6873 } 6874 6875 /*---------------------------------------------------------------------------- 6876 | Returns the result of multiplying the quadruple-precision floating-point 6877 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6878 | Standard for Binary Floating-Point Arithmetic. 
6879 *----------------------------------------------------------------------------*/ 6880 6881 float128 float128_mul(float128 a, float128 b, float_status *status) 6882 { 6883 flag aSign, bSign, zSign; 6884 int32_t aExp, bExp, zExp; 6885 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 6886 6887 aSig1 = extractFloat128Frac1( a ); 6888 aSig0 = extractFloat128Frac0( a ); 6889 aExp = extractFloat128Exp( a ); 6890 aSign = extractFloat128Sign( a ); 6891 bSig1 = extractFloat128Frac1( b ); 6892 bSig0 = extractFloat128Frac0( b ); 6893 bExp = extractFloat128Exp( b ); 6894 bSign = extractFloat128Sign( b ); 6895 zSign = aSign ^ bSign; 6896 if ( aExp == 0x7FFF ) { 6897 if ( ( aSig0 | aSig1 ) 6898 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6899 return propagateFloat128NaN(a, b, status); 6900 } 6901 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 6902 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6903 } 6904 if ( bExp == 0x7FFF ) { 6905 if (bSig0 | bSig1) { 6906 return propagateFloat128NaN(a, b, status); 6907 } 6908 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6909 invalid: 6910 float_raise(float_flag_invalid, status); 6911 return float128_default_nan(status); 6912 } 6913 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6914 } 6915 if ( aExp == 0 ) { 6916 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6917 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6918 } 6919 if ( bExp == 0 ) { 6920 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6921 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6922 } 6923 zExp = aExp + bExp - 0x4000; 6924 aSig0 |= LIT64( 0x0001000000000000 ); 6925 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 6926 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 6927 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6928 zSig2 |= ( zSig3 != 0 ); 6929 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 6930 shift128ExtraRightJamming( 6931 
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6932 ++zExp; 6933 } 6934 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6935 6936 } 6937 6938 /*---------------------------------------------------------------------------- 6939 | Returns the result of dividing the quadruple-precision floating-point value 6940 | `a' by the corresponding value `b'. The operation is performed according to 6941 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6942 *----------------------------------------------------------------------------*/ 6943 6944 float128 float128_div(float128 a, float128 b, float_status *status) 6945 { 6946 flag aSign, bSign, zSign; 6947 int32_t aExp, bExp, zExp; 6948 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6949 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6950 6951 aSig1 = extractFloat128Frac1( a ); 6952 aSig0 = extractFloat128Frac0( a ); 6953 aExp = extractFloat128Exp( a ); 6954 aSign = extractFloat128Sign( a ); 6955 bSig1 = extractFloat128Frac1( b ); 6956 bSig0 = extractFloat128Frac0( b ); 6957 bExp = extractFloat128Exp( b ); 6958 bSign = extractFloat128Sign( b ); 6959 zSign = aSign ^ bSign; 6960 if ( aExp == 0x7FFF ) { 6961 if (aSig0 | aSig1) { 6962 return propagateFloat128NaN(a, b, status); 6963 } 6964 if ( bExp == 0x7FFF ) { 6965 if (bSig0 | bSig1) { 6966 return propagateFloat128NaN(a, b, status); 6967 } 6968 goto invalid; 6969 } 6970 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6971 } 6972 if ( bExp == 0x7FFF ) { 6973 if (bSig0 | bSig1) { 6974 return propagateFloat128NaN(a, b, status); 6975 } 6976 return packFloat128( zSign, 0, 0, 0 ); 6977 } 6978 if ( bExp == 0 ) { 6979 if ( ( bSig0 | bSig1 ) == 0 ) { 6980 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6981 invalid: 6982 float_raise(float_flag_invalid, status); 6983 return float128_default_nan(status); 6984 } 6985 float_raise(float_flag_divbyzero, status); 6986 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6987 } 6988 normalizeFloat128Subnormal( 
bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6989 } 6990 if ( aExp == 0 ) { 6991 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6992 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6993 } 6994 zExp = aExp - bExp + 0x3FFD; 6995 shortShift128Left( 6996 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 6997 shortShift128Left( 6998 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6999 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 7000 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 7001 ++zExp; 7002 } 7003 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7004 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 7005 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 7006 while ( (int64_t) rem0 < 0 ) { 7007 --zSig0; 7008 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 7009 } 7010 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 7011 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 7012 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 7013 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 7014 while ( (int64_t) rem1 < 0 ) { 7015 --zSig1; 7016 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 7017 } 7018 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 7019 } 7020 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 7021 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 7022 7023 } 7024 7025 /*---------------------------------------------------------------------------- 7026 | Returns the remainder of the quadruple-precision floating-point value `a' 7027 | with respect to the corresponding value `b'. The operation is performed 7028 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7029 *----------------------------------------------------------------------------*/ 7030 7031 float128 float128_rem(float128 a, float128 b, float_status *status) 7032 { 7033 flag aSign, zSign; 7034 int32_t aExp, bExp, expDiff; 7035 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 7036 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 7037 int64_t sigMean0; 7038 7039 aSig1 = extractFloat128Frac1( a ); 7040 aSig0 = extractFloat128Frac0( a ); 7041 aExp = extractFloat128Exp( a ); 7042 aSign = extractFloat128Sign( a ); 7043 bSig1 = extractFloat128Frac1( b ); 7044 bSig0 = extractFloat128Frac0( b ); 7045 bExp = extractFloat128Exp( b ); 7046 if ( aExp == 0x7FFF ) { 7047 if ( ( aSig0 | aSig1 ) 7048 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 7049 return propagateFloat128NaN(a, b, status); 7050 } 7051 goto invalid; 7052 } 7053 if ( bExp == 0x7FFF ) { 7054 if (bSig0 | bSig1) { 7055 return propagateFloat128NaN(a, b, status); 7056 } 7057 return a; 7058 } 7059 if ( bExp == 0 ) { 7060 if ( ( bSig0 | bSig1 ) == 0 ) { 7061 invalid: 7062 float_raise(float_flag_invalid, status); 7063 return float128_default_nan(status); 7064 } 7065 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 7066 } 7067 if ( aExp == 0 ) { 7068 if ( ( aSig0 | aSig1 ) == 0 ) return a; 7069 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7070 } 7071 expDiff = aExp - bExp; 7072 if ( expDiff < -1 ) return a; 7073 shortShift128Left( 7074 aSig0 | LIT64( 0x0001000000000000 ), 7075 aSig1, 7076 15 - ( expDiff < 0 ), 7077 &aSig0, 7078 &aSig1 7079 ); 7080 shortShift128Left( 7081 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 7082 q = le128( bSig0, bSig1, aSig0, aSig1 ); 7083 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 7084 expDiff -= 64; 7085 while ( 0 < expDiff ) { 7086 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7087 q = ( 4 < q ) ? 
q - 4 : 0; 7088 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 7089 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 7090 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 7091 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 7092 expDiff -= 61; 7093 } 7094 if ( -64 < expDiff ) { 7095 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7096 q = ( 4 < q ) ? q - 4 : 0; 7097 q >>= - expDiff; 7098 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 7099 expDiff += 52; 7100 if ( expDiff < 0 ) { 7101 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 7102 } 7103 else { 7104 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 7105 } 7106 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 7107 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 7108 } 7109 else { 7110 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 7111 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 7112 } 7113 do { 7114 alternateASig0 = aSig0; 7115 alternateASig1 = aSig1; 7116 ++q; 7117 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 7118 } while ( 0 <= (int64_t) aSig0 ); 7119 add128( 7120 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 7121 if ( ( sigMean0 < 0 ) 7122 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 7123 aSig0 = alternateASig0; 7124 aSig1 = alternateASig1; 7125 } 7126 zSign = ( (int64_t) aSig0 < 0 ); 7127 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 7128 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 7129 status); 7130 } 7131 7132 /*---------------------------------------------------------------------------- 7133 | Returns the square root of the quadruple-precision floating-point value `a'. 7134 | The operation is performed according to the IEC/IEEE Standard for Binary 7135 | Floating-Point Arithmetic. 
7136 *----------------------------------------------------------------------------*/ 7137 7138 float128 float128_sqrt(float128 a, float_status *status) 7139 { 7140 flag aSign; 7141 int32_t aExp, zExp; 7142 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 7143 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 7144 7145 aSig1 = extractFloat128Frac1( a ); 7146 aSig0 = extractFloat128Frac0( a ); 7147 aExp = extractFloat128Exp( a ); 7148 aSign = extractFloat128Sign( a ); 7149 if ( aExp == 0x7FFF ) { 7150 if (aSig0 | aSig1) { 7151 return propagateFloat128NaN(a, a, status); 7152 } 7153 if ( ! aSign ) return a; 7154 goto invalid; 7155 } 7156 if ( aSign ) { 7157 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 7158 invalid: 7159 float_raise(float_flag_invalid, status); 7160 return float128_default_nan(status); 7161 } 7162 if ( aExp == 0 ) { 7163 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 7164 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7165 } 7166 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 7167 aSig0 |= LIT64( 0x0001000000000000 ); 7168 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 7169 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 7170 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 7171 doubleZSig0 = zSig0<<1; 7172 mul64To128( zSig0, zSig0, &term0, &term1 ); 7173 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 7174 while ( (int64_t) rem0 < 0 ) { 7175 --zSig0; 7176 doubleZSig0 -= 2; 7177 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 7178 } 7179 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 7180 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 7181 if ( zSig1 == 0 ) zSig1 = 1; 7182 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 7183 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 7184 mul64To128( zSig1, zSig1, &term2, &term3 ); 7185 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 7186 while ( (int64_t) rem1 < 0 ) { 7187 --zSig1; 7188 
shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 7189 term3 |= 1; 7190 term2 |= doubleZSig0; 7191 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 7192 } 7193 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 7194 } 7195 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 7196 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 7197 7198 } 7199 7200 /*---------------------------------------------------------------------------- 7201 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 7202 | the corresponding value `b', and 0 otherwise. The invalid exception is 7203 | raised if either operand is a NaN. Otherwise, the comparison is performed 7204 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7205 *----------------------------------------------------------------------------*/ 7206 7207 int float128_eq(float128 a, float128 b, float_status *status) 7208 { 7209 7210 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7211 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7212 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7213 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7214 ) { 7215 float_raise(float_flag_invalid, status); 7216 return 0; 7217 } 7218 return 7219 ( a.low == b.low ) 7220 && ( ( a.high == b.high ) 7221 || ( ( a.low == 0 ) 7222 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 7223 ); 7224 7225 } 7226 7227 /*---------------------------------------------------------------------------- 7228 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7229 | or equal to the corresponding value `b', and 0 otherwise. The invalid 7230 | exception is raised if either operand is a NaN. The comparison is performed 7231 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7232 *----------------------------------------------------------------------------*/ 7233 7234 int float128_le(float128 a, float128 b, float_status *status) 7235 { 7236 flag aSign, bSign; 7237 7238 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7239 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7240 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7241 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7242 ) { 7243 float_raise(float_flag_invalid, status); 7244 return 0; 7245 } 7246 aSign = extractFloat128Sign( a ); 7247 bSign = extractFloat128Sign( b ); 7248 if ( aSign != bSign ) { 7249 return 7250 aSign 7251 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7252 == 0 ); 7253 } 7254 return 7255 aSign ? le128( b.high, b.low, a.high, a.low ) 7256 : le128( a.high, a.low, b.high, b.low ); 7257 7258 } 7259 7260 /*---------------------------------------------------------------------------- 7261 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7262 | the corresponding value `b', and 0 otherwise. The invalid exception is 7263 | raised if either operand is a NaN. The comparison is performed according 7264 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7265 *----------------------------------------------------------------------------*/ 7266 7267 int float128_lt(float128 a, float128 b, float_status *status) 7268 { 7269 flag aSign, bSign; 7270 7271 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7272 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7273 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7274 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7275 ) { 7276 float_raise(float_flag_invalid, status); 7277 return 0; 7278 } 7279 aSign = extractFloat128Sign( a ); 7280 bSign = extractFloat128Sign( b ); 7281 if ( aSign != bSign ) { 7282 return 7283 aSign 7284 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7285 != 0 ); 7286 } 7287 return 7288 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 7289 : lt128( a.high, a.low, b.high, b.low ); 7290 7291 } 7292 7293 /*---------------------------------------------------------------------------- 7294 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7295 | be compared, and 0 otherwise. The invalid exception is raised if either 7296 | operand is a NaN. The comparison is performed according to the IEC/IEEE 7297 | Standard for Binary Floating-Point Arithmetic. 7298 *----------------------------------------------------------------------------*/ 7299 7300 int float128_unordered(float128 a, float128 b, float_status *status) 7301 { 7302 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7303 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7304 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7305 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7306 ) { 7307 float_raise(float_flag_invalid, status); 7308 return 1; 7309 } 7310 return 0; 7311 } 7312 7313 /*---------------------------------------------------------------------------- 7314 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 7315 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7316 | exception. The comparison is performed according to the IEC/IEEE Standard 7317 | for Binary Floating-Point Arithmetic. 
7318 *----------------------------------------------------------------------------*/ 7319 7320 int float128_eq_quiet(float128 a, float128 b, float_status *status) 7321 { 7322 7323 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7324 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7325 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7326 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7327 ) { 7328 if (float128_is_signaling_nan(a, status) 7329 || float128_is_signaling_nan(b, status)) { 7330 float_raise(float_flag_invalid, status); 7331 } 7332 return 0; 7333 } 7334 return 7335 ( a.low == b.low ) 7336 && ( ( a.high == b.high ) 7337 || ( ( a.low == 0 ) 7338 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 7339 ); 7340 7341 } 7342 7343 /*---------------------------------------------------------------------------- 7344 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7345 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 7346 | cause an exception. Otherwise, the comparison is performed according to the 7347 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7348 *----------------------------------------------------------------------------*/ 7349 7350 int float128_le_quiet(float128 a, float128 b, float_status *status) 7351 { 7352 flag aSign, bSign; 7353 7354 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7355 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7356 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7357 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7358 ) { 7359 if (float128_is_signaling_nan(a, status) 7360 || float128_is_signaling_nan(b, status)) { 7361 float_raise(float_flag_invalid, status); 7362 } 7363 return 0; 7364 } 7365 aSign = extractFloat128Sign( a ); 7366 bSign = extractFloat128Sign( b ); 7367 if ( aSign != bSign ) { 7368 return 7369 aSign 7370 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7371 == 0 ); 7372 } 7373 return 7374 aSign ? le128( b.high, b.low, a.high, a.low ) 7375 : le128( a.high, a.low, b.high, b.low ); 7376 7377 } 7378 7379 /*---------------------------------------------------------------------------- 7380 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7381 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7382 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 7383 | Standard for Binary Floating-Point Arithmetic. 
7384 *----------------------------------------------------------------------------*/ 7385 7386 int float128_lt_quiet(float128 a, float128 b, float_status *status) 7387 { 7388 flag aSign, bSign; 7389 7390 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7391 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7392 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7393 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7394 ) { 7395 if (float128_is_signaling_nan(a, status) 7396 || float128_is_signaling_nan(b, status)) { 7397 float_raise(float_flag_invalid, status); 7398 } 7399 return 0; 7400 } 7401 aSign = extractFloat128Sign( a ); 7402 bSign = extractFloat128Sign( b ); 7403 if ( aSign != bSign ) { 7404 return 7405 aSign 7406 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7407 != 0 ); 7408 } 7409 return 7410 aSign ? lt128( b.high, b.low, a.high, a.low ) 7411 : lt128( a.high, a.low, b.high, b.low ); 7412 7413 } 7414 7415 /*---------------------------------------------------------------------------- 7416 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7417 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 7418 | comparison is performed according to the IEC/IEEE Standard for Binary 7419 | Floating-Point Arithmetic. 
7420 *----------------------------------------------------------------------------*/ 7421 7422 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 7423 { 7424 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7425 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7426 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7427 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7428 ) { 7429 if (float128_is_signaling_nan(a, status) 7430 || float128_is_signaling_nan(b, status)) { 7431 float_raise(float_flag_invalid, status); 7432 } 7433 return 1; 7434 } 7435 return 0; 7436 } 7437 7438 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 7439 int is_quiet, float_status *status) 7440 { 7441 flag aSign, bSign; 7442 7443 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 7444 float_raise(float_flag_invalid, status); 7445 return float_relation_unordered; 7446 } 7447 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 7448 ( extractFloatx80Frac( a )<<1 ) ) || 7449 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 7450 ( extractFloatx80Frac( b )<<1 ) )) { 7451 if (!is_quiet || 7452 floatx80_is_signaling_nan(a, status) || 7453 floatx80_is_signaling_nan(b, status)) { 7454 float_raise(float_flag_invalid, status); 7455 } 7456 return float_relation_unordered; 7457 } 7458 aSign = extractFloatx80Sign( a ); 7459 bSign = extractFloatx80Sign( b ); 7460 if ( aSign != bSign ) { 7461 7462 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 7463 ( ( a.low | b.low ) == 0 ) ) { 7464 /* zero case */ 7465 return float_relation_equal; 7466 } else { 7467 return 1 - (2 * aSign); 7468 } 7469 } else { 7470 if (a.low == b.low && a.high == b.high) { 7471 return float_relation_equal; 7472 } else { 7473 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7474 } 7475 } 7476 } 7477 7478 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7479 { 7480 return floatx80_compare_internal(a, b, 0, status); 7481 } 7482 
7483 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 7484 { 7485 return floatx80_compare_internal(a, b, 1, status); 7486 } 7487 7488 static inline int float128_compare_internal(float128 a, float128 b, 7489 int is_quiet, float_status *status) 7490 { 7491 flag aSign, bSign; 7492 7493 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7494 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7495 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7496 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7497 if (!is_quiet || 7498 float128_is_signaling_nan(a, status) || 7499 float128_is_signaling_nan(b, status)) { 7500 float_raise(float_flag_invalid, status); 7501 } 7502 return float_relation_unordered; 7503 } 7504 aSign = extractFloat128Sign( a ); 7505 bSign = extractFloat128Sign( b ); 7506 if ( aSign != bSign ) { 7507 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7508 /* zero case */ 7509 return float_relation_equal; 7510 } else { 7511 return 1 - (2 * aSign); 7512 } 7513 } else { 7514 if (a.low == b.low && a.high == b.high) { 7515 return float_relation_equal; 7516 } else { 7517 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7518 } 7519 } 7520 } 7521 7522 int float128_compare(float128 a, float128 b, float_status *status) 7523 { 7524 return float128_compare_internal(a, b, 0, status); 7525 } 7526 7527 int float128_compare_quiet(float128 a, float128 b, float_status *status) 7528 { 7529 return float128_compare_internal(a, b, 1, status); 7530 } 7531 7532 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7533 { 7534 flag aSign; 7535 int32_t aExp; 7536 uint64_t aSig; 7537 7538 if (floatx80_invalid_encoding(a)) { 7539 float_raise(float_flag_invalid, status); 7540 return floatx80_default_nan(status); 7541 } 7542 aSig = extractFloatx80Frac( a ); 7543 aExp = extractFloatx80Exp( a ); 7544 aSign = extractFloatx80Sign( a ); 7545 7546 if ( aExp == 0x7FFF ) { 7547 if ( aSig<<1 ) { 7548 return 
propagateFloatx80NaN(a, a, status); 7549 } 7550 return a; 7551 } 7552 7553 if (aExp == 0) { 7554 if (aSig == 0) { 7555 return a; 7556 } 7557 aExp++; 7558 } 7559 7560 if (n > 0x10000) { 7561 n = 0x10000; 7562 } else if (n < -0x10000) { 7563 n = -0x10000; 7564 } 7565 7566 aExp += n; 7567 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7568 aSign, aExp, aSig, 0, status); 7569 } 7570 7571 float128 float128_scalbn(float128 a, int n, float_status *status) 7572 { 7573 flag aSign; 7574 int32_t aExp; 7575 uint64_t aSig0, aSig1; 7576 7577 aSig1 = extractFloat128Frac1( a ); 7578 aSig0 = extractFloat128Frac0( a ); 7579 aExp = extractFloat128Exp( a ); 7580 aSign = extractFloat128Sign( a ); 7581 if ( aExp == 0x7FFF ) { 7582 if ( aSig0 | aSig1 ) { 7583 return propagateFloat128NaN(a, a, status); 7584 } 7585 return a; 7586 } 7587 if (aExp != 0) { 7588 aSig0 |= LIT64( 0x0001000000000000 ); 7589 } else if (aSig0 == 0 && aSig1 == 0) { 7590 return a; 7591 } else { 7592 aExp++; 7593 } 7594 7595 if (n > 0x10000) { 7596 n = 0x10000; 7597 } else if (n < -0x10000) { 7598 n = -0x10000; 7599 } 7600 7601 aExp += n - 1; 7602 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7603 , status); 7604 7605 } 7606