1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include <math.h> 87 #include "qemu/bitops.h" 88 #include "fpu/softfloat.h" 89 90 /* We only need stdlib for abort() */ 91 92 /*---------------------------------------------------------------------------- 93 | Primitive arithmetic functions, including multi-word arithmetic, and 94 | division and square root approximations. (Can be specialized to target if 95 | desired.) 96 *----------------------------------------------------------------------------*/ 97 #include "fpu/softfloat-macros.h" 98 99 /* 100 * Hardfloat 101 * 102 * Fast emulation of guest FP instructions is challenging for two reasons. 103 * First, FP instruction semantics are similar but not identical, particularly 104 * when handling NaNs. Second, emulating at reasonable speed the guest FP 105 * exception flags is not trivial: reading the host's flags register with a 106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp], 107 * and trapping on every FP exception is not fast nor pleasant to work with. 108 * 109 * We address these challenges by leveraging the host FPU for a subset of the 110 * operations. 
To do this we expand on the idea presented in this paper: 111 * 112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a 113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615. 114 * 115 * The idea is thus to leverage the host FPU to (1) compute FP operations 116 * and (2) identify whether FP exceptions occurred while avoiding 117 * expensive exception flag register accesses. 118 * 119 * An important optimization shown in the paper is that given that exception 120 * flags are rarely cleared by the guest, we can avoid recomputing some flags. 121 * This is particularly useful for the inexact flag, which is very frequently 122 * raised in floating-point workloads. 123 * 124 * We optimize the code further by deferring to soft-fp whenever FP exception 125 * detection might get hairy. Two examples: (1) when at least one operand is 126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result 127 * and the result is < the minimum normal. 
128 */ 129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \ 130 static inline void name(soft_t *a, float_status *s) \ 131 { \ 132 if (unlikely(soft_t ## _is_denormal(*a))) { \ 133 *a = soft_t ## _set_sign(soft_t ## _zero, \ 134 soft_t ## _is_neg(*a)); \ 135 s->float_exception_flags |= float_flag_input_denormal; \ 136 } \ 137 } 138 139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32) 140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64) 141 #undef GEN_INPUT_FLUSH__NOCHECK 142 143 #define GEN_INPUT_FLUSH1(name, soft_t) \ 144 static inline void name(soft_t *a, float_status *s) \ 145 { \ 146 if (likely(!s->flush_inputs_to_zero)) { \ 147 return; \ 148 } \ 149 soft_t ## _input_flush__nocheck(a, s); \ 150 } 151 152 GEN_INPUT_FLUSH1(float32_input_flush1, float32) 153 GEN_INPUT_FLUSH1(float64_input_flush1, float64) 154 #undef GEN_INPUT_FLUSH1 155 156 #define GEN_INPUT_FLUSH2(name, soft_t) \ 157 static inline void name(soft_t *a, soft_t *b, float_status *s) \ 158 { \ 159 if (likely(!s->flush_inputs_to_zero)) { \ 160 return; \ 161 } \ 162 soft_t ## _input_flush__nocheck(a, s); \ 163 soft_t ## _input_flush__nocheck(b, s); \ 164 } 165 166 GEN_INPUT_FLUSH2(float32_input_flush2, float32) 167 GEN_INPUT_FLUSH2(float64_input_flush2, float64) 168 #undef GEN_INPUT_FLUSH2 169 170 #define GEN_INPUT_FLUSH3(name, soft_t) \ 171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \ 172 { \ 173 if (likely(!s->flush_inputs_to_zero)) { \ 174 return; \ 175 } \ 176 soft_t ## _input_flush__nocheck(a, s); \ 177 soft_t ## _input_flush__nocheck(b, s); \ 178 soft_t ## _input_flush__nocheck(c, s); \ 179 } 180 181 GEN_INPUT_FLUSH3(float32_input_flush3, float32) 182 GEN_INPUT_FLUSH3(float64_input_flush3, float64) 183 #undef GEN_INPUT_FLUSH3 184 185 /* 186 * Choose whether to use fpclassify or float32/64_* primitives in the generated 187 * hardfloat functions. Each combination of number of inputs and float size 188 * gets its own value. 
189 */ 190 #if defined(__x86_64__) 191 # define QEMU_HARDFLOAT_1F32_USE_FP 0 192 # define QEMU_HARDFLOAT_1F64_USE_FP 1 193 # define QEMU_HARDFLOAT_2F32_USE_FP 0 194 # define QEMU_HARDFLOAT_2F64_USE_FP 1 195 # define QEMU_HARDFLOAT_3F32_USE_FP 0 196 # define QEMU_HARDFLOAT_3F64_USE_FP 1 197 #else 198 # define QEMU_HARDFLOAT_1F32_USE_FP 0 199 # define QEMU_HARDFLOAT_1F64_USE_FP 0 200 # define QEMU_HARDFLOAT_2F32_USE_FP 0 201 # define QEMU_HARDFLOAT_2F64_USE_FP 0 202 # define QEMU_HARDFLOAT_3F32_USE_FP 0 203 # define QEMU_HARDFLOAT_3F64_USE_FP 0 204 #endif 205 206 /* 207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over 208 * float{32,64}_is_infinity when !USE_FP. 209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup. 210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%. 211 */ 212 #if defined(__x86_64__) || defined(__aarch64__) 213 # define QEMU_HARDFLOAT_USE_ISINF 1 214 #else 215 # define QEMU_HARDFLOAT_USE_ISINF 0 216 #endif 217 218 /* 219 * Some targets clear the FP flags before most FP operations. This prevents 220 * the use of hardfloat, since hardfloat relies on the inexact flag being 221 * already set. 222 */ 223 #if defined(TARGET_PPC) || defined(__FAST_MATH__) 224 # if defined(__FAST_MATH__) 225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \ 226 IEEE implementation 227 # endif 228 # define QEMU_NO_HARDFLOAT 1 229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN 230 #else 231 # define QEMU_NO_HARDFLOAT 0 232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline)) 233 #endif 234 235 static inline bool can_use_fpu(const float_status *s) 236 { 237 if (QEMU_NO_HARDFLOAT) { 238 return false; 239 } 240 return likely(s->float_exception_flags & float_flag_inexact && 241 s->float_rounding_mode == float_round_nearest_even); 242 } 243 244 /* 245 * Hardfloat generation functions. Each operation can have two flavors: 246 * either using softfloat primitives (e.g. 
float32_is_zero_or_normal) for 247 * most condition checks, or native ones (e.g. fpclassify). 248 * 249 * The flavor is chosen by the callers. Instead of using macros, we rely on the 250 * compiler to propagate constants and inline everything into the callers. 251 * 252 * We only generate functions for operations with two inputs, since only 253 * these are common enough to justify consolidating them into common code. 254 */ 255 256 typedef union { 257 float32 s; 258 float h; 259 } union_float32; 260 261 typedef union { 262 float64 s; 263 double h; 264 } union_float64; 265 266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b); 267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b); 268 269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s); 270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s); 271 typedef float (*hard_f32_op2_fn)(float a, float b); 272 typedef double (*hard_f64_op2_fn)(double a, double b); 273 274 /* 2-input is-zero-or-normal */ 275 static inline bool f32_is_zon2(union_float32 a, union_float32 b) 276 { 277 if (QEMU_HARDFLOAT_2F32_USE_FP) { 278 /* 279 * Not using a temp variable for consecutive fpclassify calls ends up 280 * generating faster code. 
281 */ 282 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 283 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO); 284 } 285 return float32_is_zero_or_normal(a.s) && 286 float32_is_zero_or_normal(b.s); 287 } 288 289 static inline bool f64_is_zon2(union_float64 a, union_float64 b) 290 { 291 if (QEMU_HARDFLOAT_2F64_USE_FP) { 292 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 293 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO); 294 } 295 return float64_is_zero_or_normal(a.s) && 296 float64_is_zero_or_normal(b.s); 297 } 298 299 /* 3-input is-zero-or-normal */ 300 static inline 301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c) 302 { 303 if (QEMU_HARDFLOAT_3F32_USE_FP) { 304 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 305 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) && 306 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO); 307 } 308 return float32_is_zero_or_normal(a.s) && 309 float32_is_zero_or_normal(b.s) && 310 float32_is_zero_or_normal(c.s); 311 } 312 313 static inline 314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c) 315 { 316 if (QEMU_HARDFLOAT_3F64_USE_FP) { 317 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 318 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) && 319 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO); 320 } 321 return float64_is_zero_or_normal(a.s) && 322 float64_is_zero_or_normal(b.s) && 323 float64_is_zero_or_normal(c.s); 324 } 325 326 static inline bool f32_is_inf(union_float32 a) 327 { 328 if (QEMU_HARDFLOAT_USE_ISINF) { 329 return isinf(a.h); 330 } 331 return float32_is_infinity(a.s); 332 } 333 334 static inline bool f64_is_inf(union_float64 a) 335 { 336 if (QEMU_HARDFLOAT_USE_ISINF) { 337 return isinf(a.h); 338 } 339 return float64_is_infinity(a.s); 340 } 341 342 /* Note: @fast_test and @post can be NULL */ 343 static inline 
float32 344 float32_gen2(float32 xa, float32 xb, float_status *s, 345 hard_f32_op2_fn hard, soft_f32_op2_fn soft, 346 f32_check_fn pre, f32_check_fn post, 347 f32_check_fn fast_test, soft_f32_op2_fn fast_op) 348 { 349 union_float32 ua, ub, ur; 350 351 ua.s = xa; 352 ub.s = xb; 353 354 if (unlikely(!can_use_fpu(s))) { 355 goto soft; 356 } 357 358 float32_input_flush2(&ua.s, &ub.s, s); 359 if (unlikely(!pre(ua, ub))) { 360 goto soft; 361 } 362 if (fast_test && fast_test(ua, ub)) { 363 return fast_op(ua.s, ub.s, s); 364 } 365 366 ur.h = hard(ua.h, ub.h); 367 if (unlikely(f32_is_inf(ur))) { 368 s->float_exception_flags |= float_flag_overflow; 369 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) { 370 if (post == NULL || post(ua, ub)) { 371 goto soft; 372 } 373 } 374 return ur.s; 375 376 soft: 377 return soft(ua.s, ub.s, s); 378 } 379 380 static inline float64 381 float64_gen2(float64 xa, float64 xb, float_status *s, 382 hard_f64_op2_fn hard, soft_f64_op2_fn soft, 383 f64_check_fn pre, f64_check_fn post, 384 f64_check_fn fast_test, soft_f64_op2_fn fast_op) 385 { 386 union_float64 ua, ub, ur; 387 388 ua.s = xa; 389 ub.s = xb; 390 391 if (unlikely(!can_use_fpu(s))) { 392 goto soft; 393 } 394 395 float64_input_flush2(&ua.s, &ub.s, s); 396 if (unlikely(!pre(ua, ub))) { 397 goto soft; 398 } 399 if (fast_test && fast_test(ua, ub)) { 400 return fast_op(ua.s, ub.s, s); 401 } 402 403 ur.h = hard(ua.h, ub.h); 404 if (unlikely(f64_is_inf(ur))) { 405 s->float_exception_flags |= float_flag_overflow; 406 } else if (unlikely(fabs(ur.h) <= DBL_MIN)) { 407 if (post == NULL || post(ua, ub)) { 408 goto soft; 409 } 410 } 411 return ur.s; 412 413 soft: 414 return soft(ua.s, ub.s, s); 415 } 416 417 /*---------------------------------------------------------------------------- 418 | Returns the fraction bits of the half-precision floating-point value `a'. 
419 *----------------------------------------------------------------------------*/ 420 421 static inline uint32_t extractFloat16Frac(float16 a) 422 { 423 return float16_val(a) & 0x3ff; 424 } 425 426 /*---------------------------------------------------------------------------- 427 | Returns the exponent bits of the half-precision floating-point value `a'. 428 *----------------------------------------------------------------------------*/ 429 430 static inline int extractFloat16Exp(float16 a) 431 { 432 return (float16_val(a) >> 10) & 0x1f; 433 } 434 435 /*---------------------------------------------------------------------------- 436 | Returns the fraction bits of the single-precision floating-point value `a'. 437 *----------------------------------------------------------------------------*/ 438 439 static inline uint32_t extractFloat32Frac(float32 a) 440 { 441 return float32_val(a) & 0x007FFFFF; 442 } 443 444 /*---------------------------------------------------------------------------- 445 | Returns the exponent bits of the single-precision floating-point value `a'. 446 *----------------------------------------------------------------------------*/ 447 448 static inline int extractFloat32Exp(float32 a) 449 { 450 return (float32_val(a) >> 23) & 0xFF; 451 } 452 453 /*---------------------------------------------------------------------------- 454 | Returns the sign bit of the single-precision floating-point value `a'. 455 *----------------------------------------------------------------------------*/ 456 457 static inline flag extractFloat32Sign(float32 a) 458 { 459 return float32_val(a) >> 31; 460 } 461 462 /*---------------------------------------------------------------------------- 463 | Returns the fraction bits of the double-precision floating-point value `a'. 
464 *----------------------------------------------------------------------------*/ 465 466 static inline uint64_t extractFloat64Frac(float64 a) 467 { 468 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF); 469 } 470 471 /*---------------------------------------------------------------------------- 472 | Returns the exponent bits of the double-precision floating-point value `a'. 473 *----------------------------------------------------------------------------*/ 474 475 static inline int extractFloat64Exp(float64 a) 476 { 477 return (float64_val(a) >> 52) & 0x7FF; 478 } 479 480 /*---------------------------------------------------------------------------- 481 | Returns the sign bit of the double-precision floating-point value `a'. 482 *----------------------------------------------------------------------------*/ 483 484 static inline flag extractFloat64Sign(float64 a) 485 { 486 return float64_val(a) >> 63; 487 } 488 489 /* 490 * Classify a floating point number. Everything above float_class_qnan 491 * is a NaN so cls >= float_class_qnan is any NaN. 492 */ 493 494 typedef enum __attribute__ ((__packed__)) { 495 float_class_unclassified, 496 float_class_zero, 497 float_class_normal, 498 float_class_inf, 499 float_class_qnan, /* all NaNs from here */ 500 float_class_snan, 501 } FloatClass; 502 503 /* Simple helpers for checking if, or what kind of, NaN we have */ 504 static inline __attribute__((unused)) bool is_nan(FloatClass c) 505 { 506 return unlikely(c >= float_class_qnan); 507 } 508 509 static inline __attribute__((unused)) bool is_snan(FloatClass c) 510 { 511 return c == float_class_snan; 512 } 513 514 static inline __attribute__((unused)) bool is_qnan(FloatClass c) 515 { 516 return c == float_class_qnan; 517 } 518 519 /* 520 * Structure holding all of the decomposed parts of a float. The 521 * exponent is unbiased and the fraction is normalized. All 522 * calculations are done with a 64 bit fraction and then rounded as 523 * appropriate for the final format. 
524 * 525 * Thanks to the packed FloatClass a decent compiler should be able to 526 * fit the whole structure into registers and avoid using the stack 527 * for parameter passing. 528 */ 529 530 typedef struct { 531 uint64_t frac; 532 int32_t exp; 533 FloatClass cls; 534 bool sign; 535 } FloatParts; 536 537 #define DECOMPOSED_BINARY_POINT (64 - 2) 538 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) 539 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1) 540 541 /* Structure holding all of the relevant parameters for a format. 542 * exp_size: the size of the exponent field 543 * exp_bias: the offset applied to the exponent field 544 * exp_max: the maximum normalised exponent 545 * frac_size: the size of the fraction field 546 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 547 * The following are computed based the size of fraction 548 * frac_lsb: least significant bit of fraction 549 * frac_lsbm1: the bit below the least significant bit (for rounding) 550 * round_mask/roundeven_mask: masks used for rounding 551 * The following optional modifiers are available: 552 * arm_althp: handle ARM Alternative Half Precision 553 */ 554 typedef struct { 555 int exp_size; 556 int exp_bias; 557 int exp_max; 558 int frac_size; 559 int frac_shift; 560 uint64_t frac_lsb; 561 uint64_t frac_lsbm1; 562 uint64_t round_mask; 563 uint64_t roundeven_mask; 564 bool arm_althp; 565 } FloatFmt; 566 567 /* Expand fields based on the size of exponent and fraction */ 568 #define FLOAT_PARAMS(E, F) \ 569 .exp_size = E, \ 570 .exp_bias = ((1 << E) - 1) >> 1, \ 571 .exp_max = (1 << E) - 1, \ 572 .frac_size = F, \ 573 .frac_shift = DECOMPOSED_BINARY_POINT - F, \ 574 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \ 575 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \ 576 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \ 577 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1 578 579 static const FloatFmt 
float16_params = { 580 FLOAT_PARAMS(5, 10) 581 }; 582 583 static const FloatFmt float16_params_ahp = { 584 FLOAT_PARAMS(5, 10), 585 .arm_althp = true 586 }; 587 588 static const FloatFmt float32_params = { 589 FLOAT_PARAMS(8, 23) 590 }; 591 592 static const FloatFmt float64_params = { 593 FLOAT_PARAMS(11, 52) 594 }; 595 596 /* Unpack a float to parts, but do not canonicalize. */ 597 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) 598 { 599 const int sign_pos = fmt.frac_size + fmt.exp_size; 600 601 return (FloatParts) { 602 .cls = float_class_unclassified, 603 .sign = extract64(raw, sign_pos, 1), 604 .exp = extract64(raw, fmt.frac_size, fmt.exp_size), 605 .frac = extract64(raw, 0, fmt.frac_size), 606 }; 607 } 608 609 static inline FloatParts float16_unpack_raw(float16 f) 610 { 611 return unpack_raw(float16_params, f); 612 } 613 614 static inline FloatParts float32_unpack_raw(float32 f) 615 { 616 return unpack_raw(float32_params, f); 617 } 618 619 static inline FloatParts float64_unpack_raw(float64 f) 620 { 621 return unpack_raw(float64_params, f); 622 } 623 624 /* Pack a float from parts, but do not canonicalize. 
*/ 625 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p) 626 { 627 const int sign_pos = fmt.frac_size + fmt.exp_size; 628 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp); 629 return deposit64(ret, sign_pos, 1, p.sign); 630 } 631 632 static inline float16 float16_pack_raw(FloatParts p) 633 { 634 return make_float16(pack_raw(float16_params, p)); 635 } 636 637 static inline float32 float32_pack_raw(FloatParts p) 638 { 639 return make_float32(pack_raw(float32_params, p)); 640 } 641 642 static inline float64 float64_pack_raw(FloatParts p) 643 { 644 return make_float64(pack_raw(float64_params, p)); 645 } 646 647 /*---------------------------------------------------------------------------- 648 | Functions and definitions to determine: (1) whether tininess for underflow 649 | is detected before or after rounding by default, (2) what (if anything) 650 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 651 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 652 | are propagated from function inputs to output. These details are target- 653 | specific. 654 *----------------------------------------------------------------------------*/ 655 #include "softfloat-specialize.h" 656 657 /* Canonicalize EXP and FRAC, setting CLS. */ 658 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm, 659 float_status *status) 660 { 661 if (part.exp == parm->exp_max && !parm->arm_althp) { 662 if (part.frac == 0) { 663 part.cls = float_class_inf; 664 } else { 665 part.frac <<= parm->frac_shift; 666 part.cls = (parts_is_snan_frac(part.frac, status) 667 ? 
float_class_snan : float_class_qnan); 668 } 669 } else if (part.exp == 0) { 670 if (likely(part.frac == 0)) { 671 part.cls = float_class_zero; 672 } else if (status->flush_inputs_to_zero) { 673 float_raise(float_flag_input_denormal, status); 674 part.cls = float_class_zero; 675 part.frac = 0; 676 } else { 677 int shift = clz64(part.frac) - 1; 678 part.cls = float_class_normal; 679 part.exp = parm->frac_shift - parm->exp_bias - shift + 1; 680 part.frac <<= shift; 681 } 682 } else { 683 part.cls = float_class_normal; 684 part.exp -= parm->exp_bias; 685 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift); 686 } 687 return part; 688 } 689 690 /* Round and uncanonicalize a floating-point number by parts. There 691 * are FRAC_SHIFT bits that may require rounding at the bottom of the 692 * fraction; these bits will be removed. The exponent will be biased 693 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0]. 694 */ 695 696 static FloatParts round_canonical(FloatParts p, float_status *s, 697 const FloatFmt *parm) 698 { 699 const uint64_t frac_lsbm1 = parm->frac_lsbm1; 700 const uint64_t round_mask = parm->round_mask; 701 const uint64_t roundeven_mask = parm->roundeven_mask; 702 const int exp_max = parm->exp_max; 703 const int frac_shift = parm->frac_shift; 704 uint64_t frac, inc; 705 int exp, flags = 0; 706 bool overflow_norm; 707 708 frac = p.frac; 709 exp = p.exp; 710 711 switch (p.cls) { 712 case float_class_normal: 713 switch (s->float_rounding_mode) { 714 case float_round_nearest_even: 715 overflow_norm = false; 716 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 717 break; 718 case float_round_ties_away: 719 overflow_norm = false; 720 inc = frac_lsbm1; 721 break; 722 case float_round_to_zero: 723 overflow_norm = true; 724 inc = 0; 725 break; 726 case float_round_up: 727 inc = p.sign ? 0 : round_mask; 728 overflow_norm = p.sign; 729 break; 730 case float_round_down: 731 inc = p.sign ? 
round_mask : 0; 732 overflow_norm = !p.sign; 733 break; 734 default: 735 g_assert_not_reached(); 736 } 737 738 exp += parm->exp_bias; 739 if (likely(exp > 0)) { 740 if (frac & round_mask) { 741 flags |= float_flag_inexact; 742 frac += inc; 743 if (frac & DECOMPOSED_OVERFLOW_BIT) { 744 frac >>= 1; 745 exp++; 746 } 747 } 748 frac >>= frac_shift; 749 750 if (parm->arm_althp) { 751 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */ 752 if (unlikely(exp > exp_max)) { 753 /* Overflow. Return the maximum normal. */ 754 flags = float_flag_invalid; 755 exp = exp_max; 756 frac = -1; 757 } 758 } else if (unlikely(exp >= exp_max)) { 759 flags |= float_flag_overflow | float_flag_inexact; 760 if (overflow_norm) { 761 exp = exp_max - 1; 762 frac = -1; 763 } else { 764 p.cls = float_class_inf; 765 goto do_inf; 766 } 767 } 768 } else if (s->flush_to_zero) { 769 flags |= float_flag_output_denormal; 770 p.cls = float_class_zero; 771 goto do_zero; 772 } else { 773 bool is_tiny = (s->float_detect_tininess 774 == float_tininess_before_rounding) 775 || (exp < 0) 776 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT); 777 778 shift64RightJamming(frac, 1 - exp, &frac); 779 if (frac & round_mask) { 780 /* Need to recompute round-to-even. */ 781 if (s->float_rounding_mode == float_round_nearest_even) { 782 inc = ((frac & roundeven_mask) != frac_lsbm1 783 ? frac_lsbm1 : 0); 784 } 785 flags |= float_flag_inexact; 786 frac += inc; 787 } 788 789 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 
1 : 0); 790 frac >>= frac_shift; 791 792 if (is_tiny && (flags & float_flag_inexact)) { 793 flags |= float_flag_underflow; 794 } 795 if (exp == 0 && frac == 0) { 796 p.cls = float_class_zero; 797 } 798 } 799 break; 800 801 case float_class_zero: 802 do_zero: 803 exp = 0; 804 frac = 0; 805 break; 806 807 case float_class_inf: 808 do_inf: 809 assert(!parm->arm_althp); 810 exp = exp_max; 811 frac = 0; 812 break; 813 814 case float_class_qnan: 815 case float_class_snan: 816 assert(!parm->arm_althp); 817 exp = exp_max; 818 frac >>= parm->frac_shift; 819 break; 820 821 default: 822 g_assert_not_reached(); 823 } 824 825 float_raise(flags, s); 826 p.exp = exp; 827 p.frac = frac; 828 return p; 829 } 830 831 /* Explicit FloatFmt version */ 832 static FloatParts float16a_unpack_canonical(float16 f, float_status *s, 833 const FloatFmt *params) 834 { 835 return sf_canonicalize(float16_unpack_raw(f), params, s); 836 } 837 838 static FloatParts float16_unpack_canonical(float16 f, float_status *s) 839 { 840 return float16a_unpack_canonical(f, s, &float16_params); 841 } 842 843 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s, 844 const FloatFmt *params) 845 { 846 return float16_pack_raw(round_canonical(p, s, params)); 847 } 848 849 static float16 float16_round_pack_canonical(FloatParts p, float_status *s) 850 { 851 return float16a_round_pack_canonical(p, s, &float16_params); 852 } 853 854 static FloatParts float32_unpack_canonical(float32 f, float_status *s) 855 { 856 return sf_canonicalize(float32_unpack_raw(f), &float32_params, s); 857 } 858 859 static float32 float32_round_pack_canonical(FloatParts p, float_status *s) 860 { 861 return float32_pack_raw(round_canonical(p, s, &float32_params)); 862 } 863 864 static FloatParts float64_unpack_canonical(float64 f, float_status *s) 865 { 866 return sf_canonicalize(float64_unpack_raw(f), &float64_params, s); 867 } 868 869 static float64 float64_round_pack_canonical(FloatParts p, float_status *s) 870 { 871 
return float64_pack_raw(round_canonical(p, s, &float64_params)); 872 } 873 874 static FloatParts return_nan(FloatParts a, float_status *s) 875 { 876 switch (a.cls) { 877 case float_class_snan: 878 s->float_exception_flags |= float_flag_invalid; 879 a = parts_silence_nan(a, s); 880 /* fall through */ 881 case float_class_qnan: 882 if (s->default_nan_mode) { 883 return parts_default_nan(s); 884 } 885 break; 886 887 default: 888 g_assert_not_reached(); 889 } 890 return a; 891 } 892 893 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s) 894 { 895 if (is_snan(a.cls) || is_snan(b.cls)) { 896 s->float_exception_flags |= float_flag_invalid; 897 } 898 899 if (s->default_nan_mode) { 900 return parts_default_nan(s); 901 } else { 902 if (pickNaN(a.cls, b.cls, 903 a.frac > b.frac || 904 (a.frac == b.frac && a.sign < b.sign))) { 905 a = b; 906 } 907 if (is_snan(a.cls)) { 908 return parts_silence_nan(a, s); 909 } 910 } 911 return a; 912 } 913 914 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c, 915 bool inf_zero, float_status *s) 916 { 917 int which; 918 919 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) { 920 s->float_exception_flags |= float_flag_invalid; 921 } 922 923 which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s); 924 925 if (s->default_nan_mode) { 926 /* Note that this check is after pickNaNMulAdd so that function 927 * has an opportunity to set the Invalid flag. 928 */ 929 which = 3; 930 } 931 932 switch (which) { 933 case 0: 934 break; 935 case 1: 936 a = b; 937 break; 938 case 2: 939 a = c; 940 break; 941 case 3: 942 return parts_default_nan(s); 943 default: 944 g_assert_not_reached(); 945 } 946 947 if (is_snan(a.cls)) { 948 return parts_silence_nan(a, s); 949 } 950 return a; 951 } 952 953 /* 954 * Returns the result of adding or subtracting the values of the 955 * floating-point values `a' and `b'. 
The operation is performed 956 * according to the IEC/IEEE Standard for Binary Floating-Point 957 * Arithmetic. 958 */ 959 960 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract, 961 float_status *s) 962 { 963 bool a_sign = a.sign; 964 bool b_sign = b.sign ^ subtract; 965 966 if (a_sign != b_sign) { 967 /* Subtraction */ 968 969 if (a.cls == float_class_normal && b.cls == float_class_normal) { 970 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) { 971 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 972 a.frac = a.frac - b.frac; 973 } else { 974 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 975 a.frac = b.frac - a.frac; 976 a.exp = b.exp; 977 a_sign ^= 1; 978 } 979 980 if (a.frac == 0) { 981 a.cls = float_class_zero; 982 a.sign = s->float_rounding_mode == float_round_down; 983 } else { 984 int shift = clz64(a.frac) - 1; 985 a.frac = a.frac << shift; 986 a.exp = a.exp - shift; 987 a.sign = a_sign; 988 } 989 return a; 990 } 991 if (is_nan(a.cls) || is_nan(b.cls)) { 992 return pick_nan(a, b, s); 993 } 994 if (a.cls == float_class_inf) { 995 if (b.cls == float_class_inf) { 996 float_raise(float_flag_invalid, s); 997 return parts_default_nan(s); 998 } 999 return a; 1000 } 1001 if (a.cls == float_class_zero && b.cls == float_class_zero) { 1002 a.sign = s->float_rounding_mode == float_round_down; 1003 return a; 1004 } 1005 if (a.cls == float_class_zero || b.cls == float_class_inf) { 1006 b.sign = a_sign ^ 1; 1007 return b; 1008 } 1009 if (b.cls == float_class_zero) { 1010 return a; 1011 } 1012 } else { 1013 /* Addition */ 1014 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1015 if (a.exp > b.exp) { 1016 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 1017 } else if (a.exp < b.exp) { 1018 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 1019 a.exp = b.exp; 1020 } 1021 a.frac += b.frac; 1022 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 1023 shift64RightJamming(a.frac, 1, &a.frac); 1024 a.exp += 1; 1025 } 
1026 return a; 1027 } 1028 if (is_nan(a.cls) || is_nan(b.cls)) { 1029 return pick_nan(a, b, s); 1030 } 1031 if (a.cls == float_class_inf || b.cls == float_class_zero) { 1032 return a; 1033 } 1034 if (b.cls == float_class_inf || a.cls == float_class_zero) { 1035 b.sign = b_sign; 1036 return b; 1037 } 1038 } 1039 g_assert_not_reached(); 1040 } 1041 1042 /* 1043 * Returns the result of adding or subtracting the floating-point 1044 * values `a' and `b'. The operation is performed according to the 1045 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1046 */ 1047 1048 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status) 1049 { 1050 FloatParts pa = float16_unpack_canonical(a, status); 1051 FloatParts pb = float16_unpack_canonical(b, status); 1052 FloatParts pr = addsub_floats(pa, pb, false, status); 1053 1054 return float16_round_pack_canonical(pr, status); 1055 } 1056 1057 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status) 1058 { 1059 FloatParts pa = float16_unpack_canonical(a, status); 1060 FloatParts pb = float16_unpack_canonical(b, status); 1061 FloatParts pr = addsub_floats(pa, pb, true, status); 1062 1063 return float16_round_pack_canonical(pr, status); 1064 } 1065 1066 static float32 QEMU_SOFTFLOAT_ATTR 1067 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status) 1068 { 1069 FloatParts pa = float32_unpack_canonical(a, status); 1070 FloatParts pb = float32_unpack_canonical(b, status); 1071 FloatParts pr = addsub_floats(pa, pb, subtract, status); 1072 1073 return float32_round_pack_canonical(pr, status); 1074 } 1075 1076 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status) 1077 { 1078 return soft_f32_addsub(a, b, false, status); 1079 } 1080 1081 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status) 1082 { 1083 return soft_f32_addsub(a, b, true, status); 1084 } 1085 1086 static float64 QEMU_SOFTFLOAT_ATTR 1087 soft_f64_addsub(float64 a, 
float64 b, bool subtract, float_status *status) 1088 { 1089 FloatParts pa = float64_unpack_canonical(a, status); 1090 FloatParts pb = float64_unpack_canonical(b, status); 1091 FloatParts pr = addsub_floats(pa, pb, subtract, status); 1092 1093 return float64_round_pack_canonical(pr, status); 1094 } 1095 1096 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status) 1097 { 1098 return soft_f64_addsub(a, b, false, status); 1099 } 1100 1101 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status) 1102 { 1103 return soft_f64_addsub(a, b, true, status); 1104 } 1105 1106 static float hard_f32_add(float a, float b) 1107 { 1108 return a + b; 1109 } 1110 1111 static float hard_f32_sub(float a, float b) 1112 { 1113 return a - b; 1114 } 1115 1116 static double hard_f64_add(double a, double b) 1117 { 1118 return a + b; 1119 } 1120 1121 static double hard_f64_sub(double a, double b) 1122 { 1123 return a - b; 1124 } 1125 1126 static bool f32_addsub_post(union_float32 a, union_float32 b) 1127 { 1128 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1129 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1130 } 1131 return !(float32_is_zero(a.s) && float32_is_zero(b.s)); 1132 } 1133 1134 static bool f64_addsub_post(union_float64 a, union_float64 b) 1135 { 1136 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1137 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1138 } else { 1139 return !(float64_is_zero(a.s) && float64_is_zero(b.s)); 1140 } 1141 } 1142 1143 static float32 float32_addsub(float32 a, float32 b, float_status *s, 1144 hard_f32_op2_fn hard, soft_f32_op2_fn soft) 1145 { 1146 return float32_gen2(a, b, s, hard, soft, 1147 f32_is_zon2, f32_addsub_post, NULL, NULL); 1148 } 1149 1150 static float64 float64_addsub(float64 a, float64 b, float_status *s, 1151 hard_f64_op2_fn hard, soft_f64_op2_fn soft) 1152 { 1153 return float64_gen2(a, b, s, hard, soft, 1154 f64_is_zon2, f64_addsub_post, NULL, NULL); 1155 } 1156 1157 float32 
QEMU_FLATTEN 1158 float32_add(float32 a, float32 b, float_status *s) 1159 { 1160 return float32_addsub(a, b, s, hard_f32_add, soft_f32_add); 1161 } 1162 1163 float32 QEMU_FLATTEN 1164 float32_sub(float32 a, float32 b, float_status *s) 1165 { 1166 return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub); 1167 } 1168 1169 float64 QEMU_FLATTEN 1170 float64_add(float64 a, float64 b, float_status *s) 1171 { 1172 return float64_addsub(a, b, s, hard_f64_add, soft_f64_add); 1173 } 1174 1175 float64 QEMU_FLATTEN 1176 float64_sub(float64 a, float64 b, float_status *s) 1177 { 1178 return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub); 1179 } 1180 1181 /* 1182 * Returns the result of multiplying the floating-point values `a' and 1183 * `b'. The operation is performed according to the IEC/IEEE Standard 1184 * for Binary Floating-Point Arithmetic. 1185 */ 1186 1187 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s) 1188 { 1189 bool sign = a.sign ^ b.sign; 1190 1191 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1192 uint64_t hi, lo; 1193 int exp = a.exp + b.exp; 1194 1195 mul64To128(a.frac, b.frac, &hi, &lo); 1196 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1197 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1198 shift64RightJamming(lo, 1, &lo); 1199 exp += 1; 1200 } 1201 1202 /* Re-use a */ 1203 a.exp = exp; 1204 a.sign = sign; 1205 a.frac = lo; 1206 return a; 1207 } 1208 /* handle all the NaN cases */ 1209 if (is_nan(a.cls) || is_nan(b.cls)) { 1210 return pick_nan(a, b, s); 1211 } 1212 /* Inf * Zero == NaN */ 1213 if ((a.cls == float_class_inf && b.cls == float_class_zero) || 1214 (a.cls == float_class_zero && b.cls == float_class_inf)) { 1215 s->float_exception_flags |= float_flag_invalid; 1216 return parts_default_nan(s); 1217 } 1218 /* Multiply by 0 or Inf */ 1219 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1220 a.sign = sign; 1221 return a; 1222 } 1223 if (b.cls == float_class_inf || b.cls == 
float_class_zero) { 1224 b.sign = sign; 1225 return b; 1226 } 1227 g_assert_not_reached(); 1228 } 1229 1230 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status) 1231 { 1232 FloatParts pa = float16_unpack_canonical(a, status); 1233 FloatParts pb = float16_unpack_canonical(b, status); 1234 FloatParts pr = mul_floats(pa, pb, status); 1235 1236 return float16_round_pack_canonical(pr, status); 1237 } 1238 1239 static float32 QEMU_SOFTFLOAT_ATTR 1240 soft_f32_mul(float32 a, float32 b, float_status *status) 1241 { 1242 FloatParts pa = float32_unpack_canonical(a, status); 1243 FloatParts pb = float32_unpack_canonical(b, status); 1244 FloatParts pr = mul_floats(pa, pb, status); 1245 1246 return float32_round_pack_canonical(pr, status); 1247 } 1248 1249 static float64 QEMU_SOFTFLOAT_ATTR 1250 soft_f64_mul(float64 a, float64 b, float_status *status) 1251 { 1252 FloatParts pa = float64_unpack_canonical(a, status); 1253 FloatParts pb = float64_unpack_canonical(b, status); 1254 FloatParts pr = mul_floats(pa, pb, status); 1255 1256 return float64_round_pack_canonical(pr, status); 1257 } 1258 1259 static float hard_f32_mul(float a, float b) 1260 { 1261 return a * b; 1262 } 1263 1264 static double hard_f64_mul(double a, double b) 1265 { 1266 return a * b; 1267 } 1268 1269 static bool f32_mul_fast_test(union_float32 a, union_float32 b) 1270 { 1271 return float32_is_zero(a.s) || float32_is_zero(b.s); 1272 } 1273 1274 static bool f64_mul_fast_test(union_float64 a, union_float64 b) 1275 { 1276 return float64_is_zero(a.s) || float64_is_zero(b.s); 1277 } 1278 1279 static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s) 1280 { 1281 bool signbit = float32_is_neg(a) ^ float32_is_neg(b); 1282 1283 return float32_set_sign(float32_zero, signbit); 1284 } 1285 1286 static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s) 1287 { 1288 bool signbit = float64_is_neg(a) ^ float64_is_neg(b); 1289 1290 return float64_set_sign(float64_zero, signbit); 
1291 } 1292 1293 float32 QEMU_FLATTEN 1294 float32_mul(float32 a, float32 b, float_status *s) 1295 { 1296 return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul, 1297 f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op); 1298 } 1299 1300 float64 QEMU_FLATTEN 1301 float64_mul(float64 a, float64 b, float_status *s) 1302 { 1303 return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul, 1304 f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op); 1305 } 1306 1307 /* 1308 * Returns the result of multiplying the floating-point values `a' and 1309 * `b' then adding 'c', with no intermediate rounding step after the 1310 * multiplication. The operation is performed according to the 1311 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008. 1312 * The flags argument allows the caller to select negation of the 1313 * addend, the intermediate product, or the final result. (The 1314 * difference between this and having the caller do a separate 1315 * negation is that negating externally will flip the sign bit on 1316 * NaNs.) 1317 */ 1318 1319 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c, 1320 int flags, float_status *s) 1321 { 1322 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) == 1323 ((1 << float_class_inf) | (1 << float_class_zero)); 1324 bool p_sign; 1325 bool sign_flip = flags & float_muladd_negate_result; 1326 FloatClass p_class; 1327 uint64_t hi, lo; 1328 int p_exp; 1329 1330 /* It is implementation-defined whether the cases of (0,inf,qnan) 1331 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 1332 * they return if they do), so we have to hand this information 1333 * off to the target-specific pick-a-NaN routine. 
1334 */ 1335 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) { 1336 return pick_nan_muladd(a, b, c, inf_zero, s); 1337 } 1338 1339 if (inf_zero) { 1340 s->float_exception_flags |= float_flag_invalid; 1341 return parts_default_nan(s); 1342 } 1343 1344 if (flags & float_muladd_negate_c) { 1345 c.sign ^= 1; 1346 } 1347 1348 p_sign = a.sign ^ b.sign; 1349 1350 if (flags & float_muladd_negate_product) { 1351 p_sign ^= 1; 1352 } 1353 1354 if (a.cls == float_class_inf || b.cls == float_class_inf) { 1355 p_class = float_class_inf; 1356 } else if (a.cls == float_class_zero || b.cls == float_class_zero) { 1357 p_class = float_class_zero; 1358 } else { 1359 p_class = float_class_normal; 1360 } 1361 1362 if (c.cls == float_class_inf) { 1363 if (p_class == float_class_inf && p_sign != c.sign) { 1364 s->float_exception_flags |= float_flag_invalid; 1365 return parts_default_nan(s); 1366 } else { 1367 a.cls = float_class_inf; 1368 a.sign = c.sign ^ sign_flip; 1369 return a; 1370 } 1371 } 1372 1373 if (p_class == float_class_inf) { 1374 a.cls = float_class_inf; 1375 a.sign = p_sign ^ sign_flip; 1376 return a; 1377 } 1378 1379 if (p_class == float_class_zero) { 1380 if (c.cls == float_class_zero) { 1381 if (p_sign != c.sign) { 1382 p_sign = s->float_rounding_mode == float_round_down; 1383 } 1384 c.sign = p_sign; 1385 } else if (flags & float_muladd_halve_result) { 1386 c.exp -= 1; 1387 } 1388 c.sign ^= sign_flip; 1389 return c; 1390 } 1391 1392 /* a & b should be normals now... */ 1393 assert(a.cls == float_class_normal && 1394 b.cls == float_class_normal); 1395 1396 p_exp = a.exp + b.exp; 1397 1398 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit 1399 * result. 
1400 */ 1401 mul64To128(a.frac, b.frac, &hi, &lo); 1402 /* binary point now at bit 124 */ 1403 1404 /* check for overflow */ 1405 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) { 1406 shift128RightJamming(hi, lo, 1, &hi, &lo); 1407 p_exp += 1; 1408 } 1409 1410 /* + add/sub */ 1411 if (c.cls == float_class_zero) { 1412 /* move binary point back to 62 */ 1413 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1414 } else { 1415 int exp_diff = p_exp - c.exp; 1416 if (p_sign == c.sign) { 1417 /* Addition */ 1418 if (exp_diff <= 0) { 1419 shift128RightJamming(hi, lo, 1420 DECOMPOSED_BINARY_POINT - exp_diff, 1421 &hi, &lo); 1422 lo += c.frac; 1423 p_exp = c.exp; 1424 } else { 1425 uint64_t c_hi, c_lo; 1426 /* shift c to the same binary point as the product (124) */ 1427 c_hi = c.frac >> 2; 1428 c_lo = 0; 1429 shift128RightJamming(c_hi, c_lo, 1430 exp_diff, 1431 &c_hi, &c_lo); 1432 add128(hi, lo, c_hi, c_lo, &hi, &lo); 1433 /* move binary point back to 62 */ 1434 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1435 } 1436 1437 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1438 shift64RightJamming(lo, 1, &lo); 1439 p_exp += 1; 1440 } 1441 1442 } else { 1443 /* Subtraction */ 1444 uint64_t c_hi, c_lo; 1445 /* make C binary point match product at bit 124 */ 1446 c_hi = c.frac >> 2; 1447 c_lo = 0; 1448 1449 if (exp_diff <= 0) { 1450 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo); 1451 if (exp_diff == 0 1452 && 1453 (hi > c_hi || (hi == c_hi && lo >= c_lo))) { 1454 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1455 } else { 1456 sub128(c_hi, c_lo, hi, lo, &hi, &lo); 1457 p_sign ^= 1; 1458 p_exp = c.exp; 1459 } 1460 } else { 1461 shift128RightJamming(c_hi, c_lo, 1462 exp_diff, 1463 &c_hi, &c_lo); 1464 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1465 } 1466 1467 if (hi == 0 && lo == 0) { 1468 a.cls = float_class_zero; 1469 a.sign = s->float_rounding_mode == float_round_down; 1470 a.sign ^= sign_flip; 1471 return a; 1472 } else { 1473 int shift; 1474 if (hi 
!= 0) { 1475 shift = clz64(hi); 1476 } else { 1477 shift = clz64(lo) + 64; 1478 } 1479 /* Normalizing to a binary point of 124 is the 1480 correct adjust for the exponent. However since we're 1481 shifting, we might as well put the binary point back 1482 at 62 where we really want it. Therefore shift as 1483 if we're leaving 1 bit at the top of the word, but 1484 adjust the exponent as if we're leaving 3 bits. */ 1485 shift -= 1; 1486 if (shift >= 64) { 1487 lo = lo << (shift - 64); 1488 } else { 1489 hi = (hi << shift) | (lo >> (64 - shift)); 1490 lo = hi | ((lo << shift) != 0); 1491 } 1492 p_exp -= shift - 2; 1493 } 1494 } 1495 } 1496 1497 if (flags & float_muladd_halve_result) { 1498 p_exp -= 1; 1499 } 1500 1501 /* finally prepare our result */ 1502 a.cls = float_class_normal; 1503 a.sign = p_sign ^ sign_flip; 1504 a.exp = p_exp; 1505 a.frac = lo; 1506 1507 return a; 1508 } 1509 1510 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c, 1511 int flags, float_status *status) 1512 { 1513 FloatParts pa = float16_unpack_canonical(a, status); 1514 FloatParts pb = float16_unpack_canonical(b, status); 1515 FloatParts pc = float16_unpack_canonical(c, status); 1516 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1517 1518 return float16_round_pack_canonical(pr, status); 1519 } 1520 1521 static float32 QEMU_SOFTFLOAT_ATTR 1522 soft_f32_muladd(float32 a, float32 b, float32 c, int flags, 1523 float_status *status) 1524 { 1525 FloatParts pa = float32_unpack_canonical(a, status); 1526 FloatParts pb = float32_unpack_canonical(b, status); 1527 FloatParts pc = float32_unpack_canonical(c, status); 1528 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1529 1530 return float32_round_pack_canonical(pr, status); 1531 } 1532 1533 static float64 QEMU_SOFTFLOAT_ATTR 1534 soft_f64_muladd(float64 a, float64 b, float64 c, int flags, 1535 float_status *status) 1536 { 1537 FloatParts pa = float64_unpack_canonical(a, status); 1538 FloatParts pb = 
float64_unpack_canonical(b, status); 1539 FloatParts pc = float64_unpack_canonical(c, status); 1540 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1541 1542 return float64_round_pack_canonical(pr, status); 1543 } 1544 1545 float32 QEMU_FLATTEN 1546 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s) 1547 { 1548 union_float32 ua, ub, uc, ur; 1549 1550 ua.s = xa; 1551 ub.s = xb; 1552 uc.s = xc; 1553 1554 if (unlikely(!can_use_fpu(s))) { 1555 goto soft; 1556 } 1557 if (unlikely(flags & float_muladd_halve_result)) { 1558 goto soft; 1559 } 1560 1561 float32_input_flush3(&ua.s, &ub.s, &uc.s, s); 1562 if (unlikely(!f32_is_zon3(ua, ub, uc))) { 1563 goto soft; 1564 } 1565 /* 1566 * When (a || b) == 0, there's no need to check for under/over flow, 1567 * since we know the addend is (normal || 0) and the product is 0. 1568 */ 1569 if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) { 1570 union_float32 up; 1571 bool prod_sign; 1572 1573 prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s); 1574 prod_sign ^= !!(flags & float_muladd_negate_product); 1575 up.s = float32_set_sign(float32_zero, prod_sign); 1576 1577 if (flags & float_muladd_negate_c) { 1578 uc.h = -uc.h; 1579 } 1580 ur.h = up.h + uc.h; 1581 } else { 1582 if (flags & float_muladd_negate_product) { 1583 ua.h = -ua.h; 1584 } 1585 if (flags & float_muladd_negate_c) { 1586 uc.h = -uc.h; 1587 } 1588 1589 ur.h = fmaf(ua.h, ub.h, uc.h); 1590 1591 if (unlikely(f32_is_inf(ur))) { 1592 s->float_exception_flags |= float_flag_overflow; 1593 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) { 1594 goto soft; 1595 } 1596 } 1597 if (flags & float_muladd_negate_result) { 1598 return float32_chs(ur.s); 1599 } 1600 return ur.s; 1601 1602 soft: 1603 return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s); 1604 } 1605 1606 float64 QEMU_FLATTEN 1607 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s) 1608 { 1609 union_float64 ua, ub, uc, ur; 1610 1611 ua.s = xa; 1612 ub.s = xb; 
1613 uc.s = xc; 1614 1615 if (unlikely(!can_use_fpu(s))) { 1616 goto soft; 1617 } 1618 if (unlikely(flags & float_muladd_halve_result)) { 1619 goto soft; 1620 } 1621 1622 float64_input_flush3(&ua.s, &ub.s, &uc.s, s); 1623 if (unlikely(!f64_is_zon3(ua, ub, uc))) { 1624 goto soft; 1625 } 1626 /* 1627 * When (a || b) == 0, there's no need to check for under/over flow, 1628 * since we know the addend is (normal || 0) and the product is 0. 1629 */ 1630 if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) { 1631 union_float64 up; 1632 bool prod_sign; 1633 1634 prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s); 1635 prod_sign ^= !!(flags & float_muladd_negate_product); 1636 up.s = float64_set_sign(float64_zero, prod_sign); 1637 1638 if (flags & float_muladd_negate_c) { 1639 uc.h = -uc.h; 1640 } 1641 ur.h = up.h + uc.h; 1642 } else { 1643 if (flags & float_muladd_negate_product) { 1644 ua.h = -ua.h; 1645 } 1646 if (flags & float_muladd_negate_c) { 1647 uc.h = -uc.h; 1648 } 1649 1650 ur.h = fma(ua.h, ub.h, uc.h); 1651 1652 if (unlikely(f64_is_inf(ur))) { 1653 s->float_exception_flags |= float_flag_overflow; 1654 } else if (unlikely(fabs(ur.h) <= FLT_MIN)) { 1655 goto soft; 1656 } 1657 } 1658 if (flags & float_muladd_negate_result) { 1659 return float64_chs(ur.s); 1660 } 1661 return ur.s; 1662 1663 soft: 1664 return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s); 1665 } 1666 1667 /* 1668 * Returns the result of dividing the floating-point value `a' by the 1669 * corresponding value `b'. The operation is performed according to 1670 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1671 */ 1672 1673 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s) 1674 { 1675 bool sign = a.sign ^ b.sign; 1676 1677 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1678 uint64_t n0, n1, q, r; 1679 int exp = a.exp - b.exp; 1680 1681 /* 1682 * We want a 2*N / N-bit division to produce exactly an N-bit 1683 * result, so that we do not lose any precision and so that we 1684 * do not have to renormalize afterward. If A.frac < B.frac, 1685 * then division would produce an (N-1)-bit result; shift A left 1686 * by one to produce the an N-bit result, and decrement the 1687 * exponent to match. 1688 * 1689 * The udiv_qrnnd algorithm that we're using requires normalization, 1690 * i.e. the msb of the denominator must be set. Since we know that 1691 * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left 1692 * by one (more), and the remainder must be shifted right by one. 1693 */ 1694 if (a.frac < b.frac) { 1695 exp -= 1; 1696 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0); 1697 } else { 1698 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0); 1699 } 1700 q = udiv_qrnnd(&r, n1, n0, b.frac << 1); 1701 1702 /* 1703 * Set lsb if there is a remainder, to set inexact. 1704 * As mentioned above, to find the actual value of the remainder we 1705 * would need to shift right, but (1) we are only concerned about 1706 * non-zero-ness, and (2) the remainder will always be even because 1707 * both inputs to the division primitive are even. 
1708 */ 1709 a.frac = q | (r != 0); 1710 a.sign = sign; 1711 a.exp = exp; 1712 return a; 1713 } 1714 /* handle all the NaN cases */ 1715 if (is_nan(a.cls) || is_nan(b.cls)) { 1716 return pick_nan(a, b, s); 1717 } 1718 /* 0/0 or Inf/Inf */ 1719 if (a.cls == b.cls 1720 && 1721 (a.cls == float_class_inf || a.cls == float_class_zero)) { 1722 s->float_exception_flags |= float_flag_invalid; 1723 return parts_default_nan(s); 1724 } 1725 /* Inf / x or 0 / x */ 1726 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1727 a.sign = sign; 1728 return a; 1729 } 1730 /* Div 0 => Inf */ 1731 if (b.cls == float_class_zero) { 1732 s->float_exception_flags |= float_flag_divbyzero; 1733 a.cls = float_class_inf; 1734 a.sign = sign; 1735 return a; 1736 } 1737 /* Div by Inf */ 1738 if (b.cls == float_class_inf) { 1739 a.cls = float_class_zero; 1740 a.sign = sign; 1741 return a; 1742 } 1743 g_assert_not_reached(); 1744 } 1745 1746 float16 float16_div(float16 a, float16 b, float_status *status) 1747 { 1748 FloatParts pa = float16_unpack_canonical(a, status); 1749 FloatParts pb = float16_unpack_canonical(b, status); 1750 FloatParts pr = div_floats(pa, pb, status); 1751 1752 return float16_round_pack_canonical(pr, status); 1753 } 1754 1755 static float32 QEMU_SOFTFLOAT_ATTR 1756 soft_f32_div(float32 a, float32 b, float_status *status) 1757 { 1758 FloatParts pa = float32_unpack_canonical(a, status); 1759 FloatParts pb = float32_unpack_canonical(b, status); 1760 FloatParts pr = div_floats(pa, pb, status); 1761 1762 return float32_round_pack_canonical(pr, status); 1763 } 1764 1765 static float64 QEMU_SOFTFLOAT_ATTR 1766 soft_f64_div(float64 a, float64 b, float_status *status) 1767 { 1768 FloatParts pa = float64_unpack_canonical(a, status); 1769 FloatParts pb = float64_unpack_canonical(b, status); 1770 FloatParts pr = div_floats(pa, pb, status); 1771 1772 return float64_round_pack_canonical(pr, status); 1773 } 1774 1775 static float hard_f32_div(float a, float b) 1776 { 1777 return a 
/ b; 1778 } 1779 1780 static double hard_f64_div(double a, double b) 1781 { 1782 return a / b; 1783 } 1784 1785 static bool f32_div_pre(union_float32 a, union_float32 b) 1786 { 1787 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1788 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 1789 fpclassify(b.h) == FP_NORMAL; 1790 } 1791 return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s); 1792 } 1793 1794 static bool f64_div_pre(union_float64 a, union_float64 b) 1795 { 1796 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1797 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 1798 fpclassify(b.h) == FP_NORMAL; 1799 } 1800 return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s); 1801 } 1802 1803 static bool f32_div_post(union_float32 a, union_float32 b) 1804 { 1805 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1806 return fpclassify(a.h) != FP_ZERO; 1807 } 1808 return !float32_is_zero(a.s); 1809 } 1810 1811 static bool f64_div_post(union_float64 a, union_float64 b) 1812 { 1813 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1814 return fpclassify(a.h) != FP_ZERO; 1815 } 1816 return !float64_is_zero(a.s); 1817 } 1818 1819 float32 QEMU_FLATTEN 1820 float32_div(float32 a, float32 b, float_status *s) 1821 { 1822 return float32_gen2(a, b, s, hard_f32_div, soft_f32_div, 1823 f32_div_pre, f32_div_post, NULL, NULL); 1824 } 1825 1826 float64 QEMU_FLATTEN 1827 float64_div(float64 a, float64 b, float_status *s) 1828 { 1829 return float64_gen2(a, b, s, hard_f64_div, soft_f64_div, 1830 f64_div_pre, f64_div_post, NULL, NULL); 1831 } 1832 1833 /* 1834 * Float to Float conversions 1835 * 1836 * Returns the result of converting one float format to another. The 1837 * conversion is performed according to the IEC/IEEE Standard for 1838 * Binary Floating-Point Arithmetic. 1839 * 1840 * The float_to_float helper only needs to take care of raising 1841 * invalid exceptions and handling the conversion on NaNs. 
1842 */ 1843 1844 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf, 1845 float_status *s) 1846 { 1847 if (dstf->arm_althp) { 1848 switch (a.cls) { 1849 case float_class_qnan: 1850 case float_class_snan: 1851 /* There is no NaN in the destination format. Raise Invalid 1852 * and return a zero with the sign of the input NaN. 1853 */ 1854 s->float_exception_flags |= float_flag_invalid; 1855 a.cls = float_class_zero; 1856 a.frac = 0; 1857 a.exp = 0; 1858 break; 1859 1860 case float_class_inf: 1861 /* There is no Inf in the destination format. Raise Invalid 1862 * and return the maximum normal with the correct sign. 1863 */ 1864 s->float_exception_flags |= float_flag_invalid; 1865 a.cls = float_class_normal; 1866 a.exp = dstf->exp_max; 1867 a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift; 1868 break; 1869 1870 default: 1871 break; 1872 } 1873 } else if (is_nan(a.cls)) { 1874 if (is_snan(a.cls)) { 1875 s->float_exception_flags |= float_flag_invalid; 1876 a = parts_silence_nan(a, s); 1877 } 1878 if (s->default_nan_mode) { 1879 return parts_default_nan(s); 1880 } 1881 } 1882 return a; 1883 } 1884 1885 float32 float16_to_float32(float16 a, bool ieee, float_status *s) 1886 { 1887 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1888 FloatParts p = float16a_unpack_canonical(a, s, fmt16); 1889 FloatParts pr = float_to_float(p, &float32_params, s); 1890 return float32_round_pack_canonical(pr, s); 1891 } 1892 1893 float64 float16_to_float64(float16 a, bool ieee, float_status *s) 1894 { 1895 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1896 FloatParts p = float16a_unpack_canonical(a, s, fmt16); 1897 FloatParts pr = float_to_float(p, &float64_params, s); 1898 return float64_round_pack_canonical(pr, s); 1899 } 1900 1901 float16 float32_to_float16(float32 a, bool ieee, float_status *s) 1902 { 1903 const FloatFmt *fmt16 = ieee ? 
&float16_params : &float16_params_ahp; 1904 FloatParts p = float32_unpack_canonical(a, s); 1905 FloatParts pr = float_to_float(p, fmt16, s); 1906 return float16a_round_pack_canonical(pr, s, fmt16); 1907 } 1908 1909 float64 float32_to_float64(float32 a, float_status *s) 1910 { 1911 FloatParts p = float32_unpack_canonical(a, s); 1912 FloatParts pr = float_to_float(p, &float64_params, s); 1913 return float64_round_pack_canonical(pr, s); 1914 } 1915 1916 float16 float64_to_float16(float64 a, bool ieee, float_status *s) 1917 { 1918 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1919 FloatParts p = float64_unpack_canonical(a, s); 1920 FloatParts pr = float_to_float(p, fmt16, s); 1921 return float16a_round_pack_canonical(pr, s, fmt16); 1922 } 1923 1924 float32 float64_to_float32(float64 a, float_status *s) 1925 { 1926 FloatParts p = float64_unpack_canonical(a, s); 1927 FloatParts pr = float_to_float(p, &float32_params, s); 1928 return float32_round_pack_canonical(pr, s); 1929 } 1930 1931 /* 1932 * Rounds the floating-point value `a' to an integer, and returns the 1933 * result as a floating-point value. The operation is performed 1934 * according to the IEC/IEEE Standard for Binary Floating-Point 1935 * Arithmetic. 
1936 */ 1937 1938 static FloatParts round_to_int(FloatParts a, int rmode, 1939 int scale, float_status *s) 1940 { 1941 switch (a.cls) { 1942 case float_class_qnan: 1943 case float_class_snan: 1944 return return_nan(a, s); 1945 1946 case float_class_zero: 1947 case float_class_inf: 1948 /* already "integral" */ 1949 break; 1950 1951 case float_class_normal: 1952 scale = MIN(MAX(scale, -0x10000), 0x10000); 1953 a.exp += scale; 1954 1955 if (a.exp >= DECOMPOSED_BINARY_POINT) { 1956 /* already integral */ 1957 break; 1958 } 1959 if (a.exp < 0) { 1960 bool one; 1961 /* all fractional */ 1962 s->float_exception_flags |= float_flag_inexact; 1963 switch (rmode) { 1964 case float_round_nearest_even: 1965 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT; 1966 break; 1967 case float_round_ties_away: 1968 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT; 1969 break; 1970 case float_round_to_zero: 1971 one = false; 1972 break; 1973 case float_round_up: 1974 one = !a.sign; 1975 break; 1976 case float_round_down: 1977 one = a.sign; 1978 break; 1979 default: 1980 g_assert_not_reached(); 1981 } 1982 1983 if (one) { 1984 a.frac = DECOMPOSED_IMPLICIT_BIT; 1985 a.exp = 0; 1986 } else { 1987 a.cls = float_class_zero; 1988 } 1989 } else { 1990 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp; 1991 uint64_t frac_lsbm1 = frac_lsb >> 1; 1992 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb; 1993 uint64_t rnd_mask = rnd_even_mask >> 1; 1994 uint64_t inc; 1995 1996 switch (rmode) { 1997 case float_round_nearest_even: 1998 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 1999 break; 2000 case float_round_ties_away: 2001 inc = frac_lsbm1; 2002 break; 2003 case float_round_to_zero: 2004 inc = 0; 2005 break; 2006 case float_round_up: 2007 inc = a.sign ? 0 : rnd_mask; 2008 break; 2009 case float_round_down: 2010 inc = a.sign ? 
rnd_mask : 0; 2011 break; 2012 default: 2013 g_assert_not_reached(); 2014 } 2015 2016 if (a.frac & rnd_mask) { 2017 s->float_exception_flags |= float_flag_inexact; 2018 a.frac += inc; 2019 a.frac &= ~rnd_mask; 2020 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 2021 a.frac >>= 1; 2022 a.exp++; 2023 } 2024 } 2025 } 2026 break; 2027 default: 2028 g_assert_not_reached(); 2029 } 2030 return a; 2031 } 2032 2033 float16 float16_round_to_int(float16 a, float_status *s) 2034 { 2035 FloatParts pa = float16_unpack_canonical(a, s); 2036 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2037 return float16_round_pack_canonical(pr, s); 2038 } 2039 2040 float32 float32_round_to_int(float32 a, float_status *s) 2041 { 2042 FloatParts pa = float32_unpack_canonical(a, s); 2043 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2044 return float32_round_pack_canonical(pr, s); 2045 } 2046 2047 float64 float64_round_to_int(float64 a, float_status *s) 2048 { 2049 FloatParts pa = float64_unpack_canonical(a, s); 2050 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2051 return float64_round_pack_canonical(pr, s); 2052 } 2053 2054 /* 2055 * Returns the result of converting the floating-point value `a' to 2056 * the two's complement integer format. The conversion is performed 2057 * according to the IEC/IEEE Standard for Binary Floating-Point 2058 * Arithmetic---which means in particular that the conversion is 2059 * rounded according to the current rounding mode. If `a' is a NaN, 2060 * the largest positive integer is returned. Otherwise, if the 2061 * conversion overflows, the largest integer with the same sign as `a' 2062 * is returned. 
2063 */ 2064 2065 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale, 2066 int64_t min, int64_t max, 2067 float_status *s) 2068 { 2069 uint64_t r; 2070 int orig_flags = get_float_exception_flags(s); 2071 FloatParts p = round_to_int(in, rmode, scale, s); 2072 2073 switch (p.cls) { 2074 case float_class_snan: 2075 case float_class_qnan: 2076 s->float_exception_flags = orig_flags | float_flag_invalid; 2077 return max; 2078 case float_class_inf: 2079 s->float_exception_flags = orig_flags | float_flag_invalid; 2080 return p.sign ? min : max; 2081 case float_class_zero: 2082 return 0; 2083 case float_class_normal: 2084 if (p.exp < DECOMPOSED_BINARY_POINT) { 2085 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2086 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 2087 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 2088 } else { 2089 r = UINT64_MAX; 2090 } 2091 if (p.sign) { 2092 if (r <= -(uint64_t) min) { 2093 return -r; 2094 } else { 2095 s->float_exception_flags = orig_flags | float_flag_invalid; 2096 return min; 2097 } 2098 } else { 2099 if (r <= max) { 2100 return r; 2101 } else { 2102 s->float_exception_flags = orig_flags | float_flag_invalid; 2103 return max; 2104 } 2105 } 2106 default: 2107 g_assert_not_reached(); 2108 } 2109 } 2110 2111 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale, 2112 float_status *s) 2113 { 2114 return round_to_int_and_pack(float16_unpack_canonical(a, s), 2115 rmode, scale, INT16_MIN, INT16_MAX, s); 2116 } 2117 2118 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale, 2119 float_status *s) 2120 { 2121 return round_to_int_and_pack(float16_unpack_canonical(a, s), 2122 rmode, scale, INT32_MIN, INT32_MAX, s); 2123 } 2124 2125 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale, 2126 float_status *s) 2127 { 2128 return round_to_int_and_pack(float16_unpack_canonical(a, s), 2129 rmode, scale, INT64_MIN, INT64_MAX, s); 2130 } 2131 2132 int16_t float32_to_int16_scalbn(float32 a, int 
rmode, int scale, 2133 float_status *s) 2134 { 2135 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2136 rmode, scale, INT16_MIN, INT16_MAX, s); 2137 } 2138 2139 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale, 2140 float_status *s) 2141 { 2142 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2143 rmode, scale, INT32_MIN, INT32_MAX, s); 2144 } 2145 2146 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale, 2147 float_status *s) 2148 { 2149 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2150 rmode, scale, INT64_MIN, INT64_MAX, s); 2151 } 2152 2153 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale, 2154 float_status *s) 2155 { 2156 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2157 rmode, scale, INT16_MIN, INT16_MAX, s); 2158 } 2159 2160 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale, 2161 float_status *s) 2162 { 2163 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2164 rmode, scale, INT32_MIN, INT32_MAX, s); 2165 } 2166 2167 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale, 2168 float_status *s) 2169 { 2170 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2171 rmode, scale, INT64_MIN, INT64_MAX, s); 2172 } 2173 2174 int16_t float16_to_int16(float16 a, float_status *s) 2175 { 2176 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2177 } 2178 2179 int32_t float16_to_int32(float16 a, float_status *s) 2180 { 2181 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2182 } 2183 2184 int64_t float16_to_int64(float16 a, float_status *s) 2185 { 2186 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2187 } 2188 2189 int16_t float32_to_int16(float32 a, float_status *s) 2190 { 2191 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2192 } 2193 2194 int32_t float32_to_int32(float32 a, float_status *s) 2195 { 2196 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, 
s); 2197 } 2198 2199 int64_t float32_to_int64(float32 a, float_status *s) 2200 { 2201 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2202 } 2203 2204 int16_t float64_to_int16(float64 a, float_status *s) 2205 { 2206 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2207 } 2208 2209 int32_t float64_to_int32(float64 a, float_status *s) 2210 { 2211 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2212 } 2213 2214 int64_t float64_to_int64(float64 a, float_status *s) 2215 { 2216 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2217 } 2218 2219 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s) 2220 { 2221 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s); 2222 } 2223 2224 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s) 2225 { 2226 return float16_to_int32_scalbn(a, float_round_to_zero, 0, s); 2227 } 2228 2229 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s) 2230 { 2231 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s); 2232 } 2233 2234 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s) 2235 { 2236 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s); 2237 } 2238 2239 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s) 2240 { 2241 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s); 2242 } 2243 2244 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s) 2245 { 2246 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s); 2247 } 2248 2249 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s) 2250 { 2251 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s); 2252 } 2253 2254 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s) 2255 { 2256 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s); 2257 } 2258 2259 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s) 2260 { 2261 return float64_to_int64_scalbn(a, 
float_round_to_zero, 0, s); 2262 } 2263 2264 /* 2265 * Returns the result of converting the floating-point value `a' to 2266 * the unsigned integer format. The conversion is performed according 2267 * to the IEC/IEEE Standard for Binary Floating-Point 2268 * Arithmetic---which means in particular that the conversion is 2269 * rounded according to the current rounding mode. If `a' is a NaN, 2270 * the largest unsigned integer is returned. Otherwise, if the 2271 * conversion overflows, the largest unsigned integer is returned. If 2272 * the 'a' is negative, the result is rounded and zero is returned; 2273 * values that do not round to zero will raise the inexact exception 2274 * flag. 2275 */ 2276 2277 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale, 2278 uint64_t max, float_status *s) 2279 { 2280 int orig_flags = get_float_exception_flags(s); 2281 FloatParts p = round_to_int(in, rmode, scale, s); 2282 uint64_t r; 2283 2284 switch (p.cls) { 2285 case float_class_snan: 2286 case float_class_qnan: 2287 s->float_exception_flags = orig_flags | float_flag_invalid; 2288 return max; 2289 case float_class_inf: 2290 s->float_exception_flags = orig_flags | float_flag_invalid; 2291 return p.sign ? 0 : max; 2292 case float_class_zero: 2293 return 0; 2294 case float_class_normal: 2295 if (p.sign) { 2296 s->float_exception_flags = orig_flags | float_flag_invalid; 2297 return 0; 2298 } 2299 2300 if (p.exp < DECOMPOSED_BINARY_POINT) { 2301 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2302 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 2303 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 2304 } else { 2305 s->float_exception_flags = orig_flags | float_flag_invalid; 2306 return max; 2307 } 2308 2309 /* For uint64 this will never trip, but if p.exp is too large 2310 * to shift a decomposed fraction we shall have exited via the 2311 * 3rd leg above. 
2312 */ 2313 if (r > max) { 2314 s->float_exception_flags = orig_flags | float_flag_invalid; 2315 return max; 2316 } 2317 return r; 2318 default: 2319 g_assert_not_reached(); 2320 } 2321 } 2322 2323 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale, 2324 float_status *s) 2325 { 2326 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2327 rmode, scale, UINT16_MAX, s); 2328 } 2329 2330 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale, 2331 float_status *s) 2332 { 2333 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2334 rmode, scale, UINT32_MAX, s); 2335 } 2336 2337 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale, 2338 float_status *s) 2339 { 2340 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2341 rmode, scale, UINT64_MAX, s); 2342 } 2343 2344 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale, 2345 float_status *s) 2346 { 2347 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2348 rmode, scale, UINT16_MAX, s); 2349 } 2350 2351 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale, 2352 float_status *s) 2353 { 2354 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2355 rmode, scale, UINT32_MAX, s); 2356 } 2357 2358 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale, 2359 float_status *s) 2360 { 2361 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2362 rmode, scale, UINT64_MAX, s); 2363 } 2364 2365 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale, 2366 float_status *s) 2367 { 2368 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2369 rmode, scale, UINT16_MAX, s); 2370 } 2371 2372 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale, 2373 float_status *s) 2374 { 2375 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2376 rmode, scale, UINT32_MAX, s); 2377 } 2378 2379 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale, 2380 
float_status *s) 2381 { 2382 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2383 rmode, scale, UINT64_MAX, s); 2384 } 2385 2386 uint16_t float16_to_uint16(float16 a, float_status *s) 2387 { 2388 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2389 } 2390 2391 uint32_t float16_to_uint32(float16 a, float_status *s) 2392 { 2393 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2394 } 2395 2396 uint64_t float16_to_uint64(float16 a, float_status *s) 2397 { 2398 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2399 } 2400 2401 uint16_t float32_to_uint16(float32 a, float_status *s) 2402 { 2403 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2404 } 2405 2406 uint32_t float32_to_uint32(float32 a, float_status *s) 2407 { 2408 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2409 } 2410 2411 uint64_t float32_to_uint64(float32 a, float_status *s) 2412 { 2413 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2414 } 2415 2416 uint16_t float64_to_uint16(float64 a, float_status *s) 2417 { 2418 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2419 } 2420 2421 uint32_t float64_to_uint32(float64 a, float_status *s) 2422 { 2423 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2424 } 2425 2426 uint64_t float64_to_uint64(float64 a, float_status *s) 2427 { 2428 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2429 } 2430 2431 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s) 2432 { 2433 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2434 } 2435 2436 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s) 2437 { 2438 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2439 } 2440 2441 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s) 2442 { 2443 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2444 } 2445 2446 uint16_t 
float32_to_uint16_round_to_zero(float32 a, float_status *s) 2447 { 2448 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2449 } 2450 2451 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s) 2452 { 2453 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2454 } 2455 2456 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s) 2457 { 2458 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2459 } 2460 2461 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s) 2462 { 2463 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2464 } 2465 2466 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s) 2467 { 2468 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2469 } 2470 2471 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s) 2472 { 2473 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2474 } 2475 2476 /* 2477 * Integer to float conversions 2478 * 2479 * Returns the result of converting the two's complement integer `a' 2480 * to the floating-point format. The conversion is performed according 2481 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2482 */ 2483 2484 static FloatParts int_to_float(int64_t a, int scale, float_status *status) 2485 { 2486 FloatParts r = { .sign = false }; 2487 2488 if (a == 0) { 2489 r.cls = float_class_zero; 2490 } else { 2491 uint64_t f = a; 2492 int shift; 2493 2494 r.cls = float_class_normal; 2495 if (a < 0) { 2496 f = -f; 2497 r.sign = true; 2498 } 2499 shift = clz64(f) - 1; 2500 scale = MIN(MAX(scale, -0x10000), 0x10000); 2501 2502 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2503 r.frac = (shift < 0 ? 
DECOMPOSED_IMPLICIT_BIT : f << shift); 2504 } 2505 2506 return r; 2507 } 2508 2509 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status) 2510 { 2511 FloatParts pa = int_to_float(a, scale, status); 2512 return float16_round_pack_canonical(pa, status); 2513 } 2514 2515 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status) 2516 { 2517 return int64_to_float16_scalbn(a, scale, status); 2518 } 2519 2520 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status) 2521 { 2522 return int64_to_float16_scalbn(a, scale, status); 2523 } 2524 2525 float16 int64_to_float16(int64_t a, float_status *status) 2526 { 2527 return int64_to_float16_scalbn(a, 0, status); 2528 } 2529 2530 float16 int32_to_float16(int32_t a, float_status *status) 2531 { 2532 return int64_to_float16_scalbn(a, 0, status); 2533 } 2534 2535 float16 int16_to_float16(int16_t a, float_status *status) 2536 { 2537 return int64_to_float16_scalbn(a, 0, status); 2538 } 2539 2540 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status) 2541 { 2542 FloatParts pa = int_to_float(a, scale, status); 2543 return float32_round_pack_canonical(pa, status); 2544 } 2545 2546 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status) 2547 { 2548 return int64_to_float32_scalbn(a, scale, status); 2549 } 2550 2551 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status) 2552 { 2553 return int64_to_float32_scalbn(a, scale, status); 2554 } 2555 2556 float32 int64_to_float32(int64_t a, float_status *status) 2557 { 2558 return int64_to_float32_scalbn(a, 0, status); 2559 } 2560 2561 float32 int32_to_float32(int32_t a, float_status *status) 2562 { 2563 return int64_to_float32_scalbn(a, 0, status); 2564 } 2565 2566 float32 int16_to_float32(int16_t a, float_status *status) 2567 { 2568 return int64_to_float32_scalbn(a, 0, status); 2569 } 2570 2571 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status) 2572 { 
2573 FloatParts pa = int_to_float(a, scale, status); 2574 return float64_round_pack_canonical(pa, status); 2575 } 2576 2577 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status) 2578 { 2579 return int64_to_float64_scalbn(a, scale, status); 2580 } 2581 2582 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status) 2583 { 2584 return int64_to_float64_scalbn(a, scale, status); 2585 } 2586 2587 float64 int64_to_float64(int64_t a, float_status *status) 2588 { 2589 return int64_to_float64_scalbn(a, 0, status); 2590 } 2591 2592 float64 int32_to_float64(int32_t a, float_status *status) 2593 { 2594 return int64_to_float64_scalbn(a, 0, status); 2595 } 2596 2597 float64 int16_to_float64(int16_t a, float_status *status) 2598 { 2599 return int64_to_float64_scalbn(a, 0, status); 2600 } 2601 2602 2603 /* 2604 * Unsigned Integer to float conversions 2605 * 2606 * Returns the result of converting the unsigned integer `a' to the 2607 * floating-point format. The conversion is performed according to the 2608 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2609 */ 2610 2611 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status) 2612 { 2613 FloatParts r = { .sign = false }; 2614 2615 if (a == 0) { 2616 r.cls = float_class_zero; 2617 } else { 2618 scale = MIN(MAX(scale, -0x10000), 0x10000); 2619 r.cls = float_class_normal; 2620 if ((int64_t)a < 0) { 2621 r.exp = DECOMPOSED_BINARY_POINT + 1 + scale; 2622 shift64RightJamming(a, 1, &a); 2623 r.frac = a; 2624 } else { 2625 int shift = clz64(a) - 1; 2626 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2627 r.frac = a << shift; 2628 } 2629 } 2630 2631 return r; 2632 } 2633 2634 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status) 2635 { 2636 FloatParts pa = uint_to_float(a, scale, status); 2637 return float16_round_pack_canonical(pa, status); 2638 } 2639 2640 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status) 2641 { 2642 return uint64_to_float16_scalbn(a, scale, status); 2643 } 2644 2645 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status) 2646 { 2647 return uint64_to_float16_scalbn(a, scale, status); 2648 } 2649 2650 float16 uint64_to_float16(uint64_t a, float_status *status) 2651 { 2652 return uint64_to_float16_scalbn(a, 0, status); 2653 } 2654 2655 float16 uint32_to_float16(uint32_t a, float_status *status) 2656 { 2657 return uint64_to_float16_scalbn(a, 0, status); 2658 } 2659 2660 float16 uint16_to_float16(uint16_t a, float_status *status) 2661 { 2662 return uint64_to_float16_scalbn(a, 0, status); 2663 } 2664 2665 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status) 2666 { 2667 FloatParts pa = uint_to_float(a, scale, status); 2668 return float32_round_pack_canonical(pa, status); 2669 } 2670 2671 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status) 2672 { 2673 return uint64_to_float32_scalbn(a, scale, status); 2674 } 2675 2676 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status) 2677 { 2678 return 
uint64_to_float32_scalbn(a, scale, status); 2679 } 2680 2681 float32 uint64_to_float32(uint64_t a, float_status *status) 2682 { 2683 return uint64_to_float32_scalbn(a, 0, status); 2684 } 2685 2686 float32 uint32_to_float32(uint32_t a, float_status *status) 2687 { 2688 return uint64_to_float32_scalbn(a, 0, status); 2689 } 2690 2691 float32 uint16_to_float32(uint16_t a, float_status *status) 2692 { 2693 return uint64_to_float32_scalbn(a, 0, status); 2694 } 2695 2696 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status) 2697 { 2698 FloatParts pa = uint_to_float(a, scale, status); 2699 return float64_round_pack_canonical(pa, status); 2700 } 2701 2702 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status) 2703 { 2704 return uint64_to_float64_scalbn(a, scale, status); 2705 } 2706 2707 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status) 2708 { 2709 return uint64_to_float64_scalbn(a, scale, status); 2710 } 2711 2712 float64 uint64_to_float64(uint64_t a, float_status *status) 2713 { 2714 return uint64_to_float64_scalbn(a, 0, status); 2715 } 2716 2717 float64 uint32_to_float64(uint32_t a, float_status *status) 2718 { 2719 return uint64_to_float64_scalbn(a, 0, status); 2720 } 2721 2722 float64 uint16_to_float64(uint16_t a, float_status *status) 2723 { 2724 return uint64_to_float64_scalbn(a, 0, status); 2725 } 2726 2727 /* Float Min/Max */ 2728 /* min() and max() functions. These can't be implemented as 2729 * 'compare and pick one input' because that would mishandle 2730 * NaNs and +0 vs -0. 2731 * 2732 * minnum() and maxnum() functions. These are similar to the min() 2733 * and max() functions but if one of the arguments is a QNaN and 2734 * the other is numerical then the numerical argument is returned. 2735 * SNaNs will get quietened before being returned. 2736 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 2737 * and maxNum() operations. 
min() and max() are the typical min/max 2738 * semantics provided by many CPUs which predate that specification. 2739 * 2740 * minnummag() and maxnummag() functions correspond to minNumMag() 2741 * and minNumMag() from the IEEE-754 2008. 2742 */ 2743 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin, 2744 bool ieee, bool ismag, float_status *s) 2745 { 2746 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 2747 if (ieee) { 2748 /* Takes two floating-point values `a' and `b', one of 2749 * which is a NaN, and returns the appropriate NaN 2750 * result. If either `a' or `b' is a signaling NaN, 2751 * the invalid exception is raised. 2752 */ 2753 if (is_snan(a.cls) || is_snan(b.cls)) { 2754 return pick_nan(a, b, s); 2755 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 2756 return b; 2757 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 2758 return a; 2759 } 2760 } 2761 return pick_nan(a, b, s); 2762 } else { 2763 int a_exp, b_exp; 2764 2765 switch (a.cls) { 2766 case float_class_normal: 2767 a_exp = a.exp; 2768 break; 2769 case float_class_inf: 2770 a_exp = INT_MAX; 2771 break; 2772 case float_class_zero: 2773 a_exp = INT_MIN; 2774 break; 2775 default: 2776 g_assert_not_reached(); 2777 break; 2778 } 2779 switch (b.cls) { 2780 case float_class_normal: 2781 b_exp = b.exp; 2782 break; 2783 case float_class_inf: 2784 b_exp = INT_MAX; 2785 break; 2786 case float_class_zero: 2787 b_exp = INT_MIN; 2788 break; 2789 default: 2790 g_assert_not_reached(); 2791 break; 2792 } 2793 2794 if (ismag && (a_exp != b_exp || a.frac != b.frac)) { 2795 bool a_less = a_exp < b_exp; 2796 if (a_exp == b_exp) { 2797 a_less = a.frac < b.frac; 2798 } 2799 return a_less ^ ismin ? b : a; 2800 } 2801 2802 if (a.sign == b.sign) { 2803 bool a_less = a_exp < b_exp; 2804 if (a_exp == b_exp) { 2805 a_less = a.frac < b.frac; 2806 } 2807 return a.sign ^ a_less ^ ismin ? b : a; 2808 } else { 2809 return a.sign ^ ismin ? 
b : a; 2810 } 2811 } 2812 } 2813 2814 #define MINMAX(sz, name, ismin, isiee, ismag) \ 2815 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 2816 float_status *s) \ 2817 { \ 2818 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2819 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2820 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 2821 \ 2822 return float ## sz ## _round_pack_canonical(pr, s); \ 2823 } 2824 2825 MINMAX(16, min, true, false, false) 2826 MINMAX(16, minnum, true, true, false) 2827 MINMAX(16, minnummag, true, true, true) 2828 MINMAX(16, max, false, false, false) 2829 MINMAX(16, maxnum, false, true, false) 2830 MINMAX(16, maxnummag, false, true, true) 2831 2832 MINMAX(32, min, true, false, false) 2833 MINMAX(32, minnum, true, true, false) 2834 MINMAX(32, minnummag, true, true, true) 2835 MINMAX(32, max, false, false, false) 2836 MINMAX(32, maxnum, false, true, false) 2837 MINMAX(32, maxnummag, false, true, true) 2838 2839 MINMAX(64, min, true, false, false) 2840 MINMAX(64, minnum, true, true, false) 2841 MINMAX(64, minnummag, true, true, true) 2842 MINMAX(64, max, false, false, false) 2843 MINMAX(64, maxnum, false, true, false) 2844 MINMAX(64, maxnummag, false, true, true) 2845 2846 #undef MINMAX 2847 2848 /* Floating point compare */ 2849 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet, 2850 float_status *s) 2851 { 2852 if (is_nan(a.cls) || is_nan(b.cls)) { 2853 if (!is_quiet || 2854 a.cls == float_class_snan || 2855 b.cls == float_class_snan) { 2856 s->float_exception_flags |= float_flag_invalid; 2857 } 2858 return float_relation_unordered; 2859 } 2860 2861 if (a.cls == float_class_zero) { 2862 if (b.cls == float_class_zero) { 2863 return float_relation_equal; 2864 } 2865 return b.sign ? float_relation_greater : float_relation_less; 2866 } else if (b.cls == float_class_zero) { 2867 return a.sign ? 
float_relation_less : float_relation_greater; 2868 } 2869 2870 /* The only really important thing about infinity is its sign. If 2871 * both are infinities the sign marks the smallest of the two. 2872 */ 2873 if (a.cls == float_class_inf) { 2874 if ((b.cls == float_class_inf) && (a.sign == b.sign)) { 2875 return float_relation_equal; 2876 } 2877 return a.sign ? float_relation_less : float_relation_greater; 2878 } else if (b.cls == float_class_inf) { 2879 return b.sign ? float_relation_greater : float_relation_less; 2880 } 2881 2882 if (a.sign != b.sign) { 2883 return a.sign ? float_relation_less : float_relation_greater; 2884 } 2885 2886 if (a.exp == b.exp) { 2887 if (a.frac == b.frac) { 2888 return float_relation_equal; 2889 } 2890 if (a.sign) { 2891 return a.frac > b.frac ? 2892 float_relation_less : float_relation_greater; 2893 } else { 2894 return a.frac > b.frac ? 2895 float_relation_greater : float_relation_less; 2896 } 2897 } else { 2898 if (a.sign) { 2899 return a.exp > b.exp ? float_relation_less : float_relation_greater; 2900 } else { 2901 return a.exp > b.exp ? float_relation_greater : float_relation_less; 2902 } 2903 } 2904 } 2905 2906 #define COMPARE(sz) \ 2907 int float ## sz ## _compare(float ## sz a, float ## sz b, \ 2908 float_status *s) \ 2909 { \ 2910 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2911 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2912 return compare_floats(pa, pb, false, s); \ 2913 } \ 2914 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \ 2915 float_status *s) \ 2916 { \ 2917 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2918 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2919 return compare_floats(pa, pb, true, s); \ 2920 } 2921 2922 COMPARE(16) 2923 COMPARE(32) 2924 COMPARE(64) 2925 2926 #undef COMPARE 2927 2928 /* Multiply A by 2 raised to the power N. 
*/ 2929 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s) 2930 { 2931 if (unlikely(is_nan(a.cls))) { 2932 return return_nan(a, s); 2933 } 2934 if (a.cls == float_class_normal) { 2935 /* The largest float type (even though not supported by FloatParts) 2936 * is float128, which has a 15 bit exponent. Bounding N to 16 bits 2937 * still allows rounding to infinity, without allowing overflow 2938 * within the int32_t that backs FloatParts.exp. 2939 */ 2940 n = MIN(MAX(n, -0x10000), 0x10000); 2941 a.exp += n; 2942 } 2943 return a; 2944 } 2945 2946 float16 float16_scalbn(float16 a, int n, float_status *status) 2947 { 2948 FloatParts pa = float16_unpack_canonical(a, status); 2949 FloatParts pr = scalbn_decomposed(pa, n, status); 2950 return float16_round_pack_canonical(pr, status); 2951 } 2952 2953 float32 float32_scalbn(float32 a, int n, float_status *status) 2954 { 2955 FloatParts pa = float32_unpack_canonical(a, status); 2956 FloatParts pr = scalbn_decomposed(pa, n, status); 2957 return float32_round_pack_canonical(pr, status); 2958 } 2959 2960 float64 float64_scalbn(float64 a, int n, float_status *status) 2961 { 2962 FloatParts pa = float64_unpack_canonical(a, status); 2963 FloatParts pr = scalbn_decomposed(pa, n, status); 2964 return float64_round_pack_canonical(pr, status); 2965 } 2966 2967 /* 2968 * Square Root 2969 * 2970 * The old softfloat code did an approximation step before zeroing in 2971 * on the final result. However for simpleness we just compute the 2972 * square root by iterating down from the implicit bit to enough extra 2973 * bits to ensure we get a correctly rounded result. 2974 * 2975 * This does mean however the calculation is slower than before, 2976 * especially for 64 bit floats. 
2977 */ 2978 2979 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p) 2980 { 2981 uint64_t a_frac, r_frac, s_frac; 2982 int bit, last_bit; 2983 2984 if (is_nan(a.cls)) { 2985 return return_nan(a, s); 2986 } 2987 if (a.cls == float_class_zero) { 2988 return a; /* sqrt(+-0) = +-0 */ 2989 } 2990 if (a.sign) { 2991 s->float_exception_flags |= float_flag_invalid; 2992 return parts_default_nan(s); 2993 } 2994 if (a.cls == float_class_inf) { 2995 return a; /* sqrt(+inf) = +inf */ 2996 } 2997 2998 assert(a.cls == float_class_normal); 2999 3000 /* We need two overflow bits at the top. Adding room for that is a 3001 * right shift. If the exponent is odd, we can discard the low bit 3002 * by multiplying the fraction by 2; that's a left shift. Combine 3003 * those and we shift right if the exponent is even. 3004 */ 3005 a_frac = a.frac; 3006 if (!(a.exp & 1)) { 3007 a_frac >>= 1; 3008 } 3009 a.exp >>= 1; 3010 3011 /* Bit-by-bit computation of sqrt. */ 3012 r_frac = 0; 3013 s_frac = 0; 3014 3015 /* Iterate from implicit bit down to the 3 extra bits to compute a 3016 * properly rounded result. Remember we've inserted one more bit 3017 * at the top, so these positions are one less. 3018 */ 3019 bit = DECOMPOSED_BINARY_POINT - 1; 3020 last_bit = MAX(p->frac_shift - 4, 0); 3021 do { 3022 uint64_t q = 1ULL << bit; 3023 uint64_t t_frac = s_frac + q; 3024 if (t_frac <= a_frac) { 3025 s_frac = t_frac + q; 3026 a_frac -= t_frac; 3027 r_frac += q; 3028 } 3029 a_frac <<= 1; 3030 } while (--bit >= last_bit); 3031 3032 /* Undo the right shift done above. If there is any remaining 3033 * fraction, the result is inexact. Set the sticky bit. 
3034 */ 3035 a.frac = (r_frac << 1) + (a_frac != 0); 3036 3037 return a; 3038 } 3039 3040 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status) 3041 { 3042 FloatParts pa = float16_unpack_canonical(a, status); 3043 FloatParts pr = sqrt_float(pa, status, &float16_params); 3044 return float16_round_pack_canonical(pr, status); 3045 } 3046 3047 float32 QEMU_FLATTEN float32_sqrt(float32 a, float_status *status) 3048 { 3049 FloatParts pa = float32_unpack_canonical(a, status); 3050 FloatParts pr = sqrt_float(pa, status, &float32_params); 3051 return float32_round_pack_canonical(pr, status); 3052 } 3053 3054 float64 QEMU_FLATTEN float64_sqrt(float64 a, float_status *status) 3055 { 3056 FloatParts pa = float64_unpack_canonical(a, status); 3057 FloatParts pr = sqrt_float(pa, status, &float64_params); 3058 return float64_round_pack_canonical(pr, status); 3059 } 3060 3061 /*---------------------------------------------------------------------------- 3062 | The pattern for a default generated NaN. 3063 *----------------------------------------------------------------------------*/ 3064 3065 float16 float16_default_nan(float_status *status) 3066 { 3067 FloatParts p = parts_default_nan(status); 3068 p.frac >>= float16_params.frac_shift; 3069 return float16_pack_raw(p); 3070 } 3071 3072 float32 float32_default_nan(float_status *status) 3073 { 3074 FloatParts p = parts_default_nan(status); 3075 p.frac >>= float32_params.frac_shift; 3076 return float32_pack_raw(p); 3077 } 3078 3079 float64 float64_default_nan(float_status *status) 3080 { 3081 FloatParts p = parts_default_nan(status); 3082 p.frac >>= float64_params.frac_shift; 3083 return float64_pack_raw(p); 3084 } 3085 3086 float128 float128_default_nan(float_status *status) 3087 { 3088 FloatParts p = parts_default_nan(status); 3089 float128 r; 3090 3091 /* Extrapolate from the choices made by parts_default_nan to fill 3092 * in the quad-floating format. 
If the low bit is set, assume we 3093 * want to set all non-snan bits. 3094 */ 3095 r.low = -(p.frac & 1); 3096 r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48); 3097 r.high |= LIT64(0x7FFF000000000000); 3098 r.high |= (uint64_t)p.sign << 63; 3099 3100 return r; 3101 } 3102 3103 /*---------------------------------------------------------------------------- 3104 | Returns a quiet NaN from a signalling NaN for the floating point value `a'. 3105 *----------------------------------------------------------------------------*/ 3106 3107 float16 float16_silence_nan(float16 a, float_status *status) 3108 { 3109 FloatParts p = float16_unpack_raw(a); 3110 p.frac <<= float16_params.frac_shift; 3111 p = parts_silence_nan(p, status); 3112 p.frac >>= float16_params.frac_shift; 3113 return float16_pack_raw(p); 3114 } 3115 3116 float32 float32_silence_nan(float32 a, float_status *status) 3117 { 3118 FloatParts p = float32_unpack_raw(a); 3119 p.frac <<= float32_params.frac_shift; 3120 p = parts_silence_nan(p, status); 3121 p.frac >>= float32_params.frac_shift; 3122 return float32_pack_raw(p); 3123 } 3124 3125 float64 float64_silence_nan(float64 a, float_status *status) 3126 { 3127 FloatParts p = float64_unpack_raw(a); 3128 p.frac <<= float64_params.frac_shift; 3129 p = parts_silence_nan(p, status); 3130 p.frac >>= float64_params.frac_shift; 3131 return float64_pack_raw(p); 3132 } 3133 3134 /*---------------------------------------------------------------------------- 3135 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 3136 | and 7, and returns the properly rounded 32-bit integer corresponding to the 3137 | input. If `zSign' is 1, the input is negated before being converted to an 3138 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 3139 | is simply rounded to an integer, with the inexact exception raised if the 3140 | input cannot be represented exactly as an integer. 
However, if the fixed-
| point input is too large, the invalid exception is raised and the largest
| positive or negative integer is returned.
*----------------------------------------------------------------------------*/

static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
{
    const int8_t mode = status->float_rounding_mode;
    const flag nearest_even = (mode == float_round_nearest_even);
    int8_t increment;
    int8_t discarded;
    int32_t z;

    /* The low 7 bits of absZ are fraction bits, so 0x40 is one half
     * and 0x7f rounds any non-zero fraction up.
     */
    switch (mode) {
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = 0x40;
        break;
    case float_round_up:
        increment = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        increment = zSign ? 0x7f : 0;
        break;
    default:
        abort();
    }

    discarded = absZ & 0x7F;
    absZ = (absZ + increment) >> 7;

    /* An exact tie under nearest-even: force the result even. */
    if (nearest_even && discarded == 0x40) {
        absZ &= ~(uint64_t)1;
    }

    z = absZ;
    if (zSign) {
        z = -z;
    }

    /* Out of range if the magnitude needs more than 32 bits, or the
     * sign of the non-zero result disagrees with zSign.
     */
    if ((absZ >> 32) || (z && ((z < 0) ^ zSign))) {
        float_raise(float_flag_invalid, status);
        return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
    }

    if (discarded) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return z;
}

/*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit integer corresponding to the input.
| If `zSign' is 1, the input is negated before being converted to an integer.
3192 | Ordinarily, the fixed-point input is simply rounded to an integer, with 3193 | the inexact exception raised if the input cannot be represented exactly as 3194 | an integer. However, if the fixed-point input is too large, the invalid 3195 | exception is raised and the largest positive or negative integer is 3196 | returned. 3197 *----------------------------------------------------------------------------*/ 3198 3199 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 3200 float_status *status) 3201 { 3202 int8_t roundingMode; 3203 flag roundNearestEven, increment; 3204 int64_t z; 3205 3206 roundingMode = status->float_rounding_mode; 3207 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3208 switch (roundingMode) { 3209 case float_round_nearest_even: 3210 case float_round_ties_away: 3211 increment = ((int64_t) absZ1 < 0); 3212 break; 3213 case float_round_to_zero: 3214 increment = 0; 3215 break; 3216 case float_round_up: 3217 increment = !zSign && absZ1; 3218 break; 3219 case float_round_down: 3220 increment = zSign && absZ1; 3221 break; 3222 default: 3223 abort(); 3224 } 3225 if ( increment ) { 3226 ++absZ0; 3227 if ( absZ0 == 0 ) goto overflow; 3228 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 3229 } 3230 z = absZ0; 3231 if ( zSign ) z = - z; 3232 if ( z && ( ( z < 0 ) ^ zSign ) ) { 3233 overflow: 3234 float_raise(float_flag_invalid, status); 3235 return 3236 zSign ? 
(int64_t) LIT64( 0x8000000000000000 ) 3237 : LIT64( 0x7FFFFFFFFFFFFFFF ); 3238 } 3239 if (absZ1) { 3240 status->float_exception_flags |= float_flag_inexact; 3241 } 3242 return z; 3243 3244 } 3245 3246 /*---------------------------------------------------------------------------- 3247 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 3248 | `absZ1', with binary point between bits 63 and 64 (between the input words), 3249 | and returns the properly rounded 64-bit unsigned integer corresponding to the 3250 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 3251 | with the inexact exception raised if the input cannot be represented exactly 3252 | as an integer. However, if the fixed-point input is too large, the invalid 3253 | exception is raised and the largest unsigned integer is returned. 3254 *----------------------------------------------------------------------------*/ 3255 3256 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0, 3257 uint64_t absZ1, float_status *status) 3258 { 3259 int8_t roundingMode; 3260 flag roundNearestEven, increment; 3261 3262 roundingMode = status->float_rounding_mode; 3263 roundNearestEven = (roundingMode == float_round_nearest_even); 3264 switch (roundingMode) { 3265 case float_round_nearest_even: 3266 case float_round_ties_away: 3267 increment = ((int64_t)absZ1 < 0); 3268 break; 3269 case float_round_to_zero: 3270 increment = 0; 3271 break; 3272 case float_round_up: 3273 increment = !zSign && absZ1; 3274 break; 3275 case float_round_down: 3276 increment = zSign && absZ1; 3277 break; 3278 default: 3279 abort(); 3280 } 3281 if (increment) { 3282 ++absZ0; 3283 if (absZ0 == 0) { 3284 float_raise(float_flag_invalid, status); 3285 return LIT64(0xFFFFFFFFFFFFFFFF); 3286 } 3287 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 3288 } 3289 3290 if (zSign && absZ0) { 3291 float_raise(float_flag_invalid, status); 3292 return 0; 3293 } 3294 3295 if (absZ1) { 3296 
status->float_exception_flags |= float_flag_inexact; 3297 } 3298 return absZ0; 3299 } 3300 3301 /*---------------------------------------------------------------------------- 3302 | If `a' is denormal and we are in flush-to-zero mode then set the 3303 | input-denormal exception and return zero. Otherwise just return the value. 3304 *----------------------------------------------------------------------------*/ 3305 float32 float32_squash_input_denormal(float32 a, float_status *status) 3306 { 3307 if (status->flush_inputs_to_zero) { 3308 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 3309 float_raise(float_flag_input_denormal, status); 3310 return make_float32(float32_val(a) & 0x80000000); 3311 } 3312 } 3313 return a; 3314 } 3315 3316 /*---------------------------------------------------------------------------- 3317 | Normalizes the subnormal single-precision floating-point value represented 3318 | by the denormalized significand `aSig'. The normalized exponent and 3319 | significand are stored at the locations pointed to by `zExpPtr' and 3320 | `zSigPtr', respectively. 3321 *----------------------------------------------------------------------------*/ 3322 3323 static void 3324 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 3325 { 3326 int8_t shiftCount; 3327 3328 shiftCount = clz32(aSig) - 8; 3329 *zSigPtr = aSig<<shiftCount; 3330 *zExpPtr = 1 - shiftCount; 3331 3332 } 3333 3334 /*---------------------------------------------------------------------------- 3335 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3336 | and significand `zSig', and returns the proper single-precision floating- 3337 | point value corresponding to the abstract input. Ordinarily, the abstract 3338 | value is simply rounded and packed into the single-precision format, with 3339 | the inexact exception raised if the abstract input cannot be represented 3340 | exactly. 
However, if the abstract value is too large, the overflow and 3341 | inexact exceptions are raised and an infinity or maximal finite value is 3342 | returned. If the abstract value is too small, the input value is rounded to 3343 | a subnormal number, and the underflow and inexact exceptions are raised if 3344 | the abstract input cannot be represented exactly as a subnormal single- 3345 | precision floating-point number. 3346 | The input significand `zSig' has its binary point between bits 30 3347 | and 29, which is 7 bits to the left of the usual location. This shifted 3348 | significand must be normalized or smaller. If `zSig' is not normalized, 3349 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3350 | and it must not require rounding. In the usual case that `zSig' is 3351 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3352 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3353 | Binary Floating-Point Arithmetic. 3354 *----------------------------------------------------------------------------*/ 3355 3356 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 3357 float_status *status) 3358 { 3359 int8_t roundingMode; 3360 flag roundNearestEven; 3361 int8_t roundIncrement, roundBits; 3362 flag isTiny; 3363 3364 roundingMode = status->float_rounding_mode; 3365 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3366 switch (roundingMode) { 3367 case float_round_nearest_even: 3368 case float_round_ties_away: 3369 roundIncrement = 0x40; 3370 break; 3371 case float_round_to_zero: 3372 roundIncrement = 0; 3373 break; 3374 case float_round_up: 3375 roundIncrement = zSign ? 0 : 0x7f; 3376 break; 3377 case float_round_down: 3378 roundIncrement = zSign ? 
0x7f : 0; 3379 break; 3380 default: 3381 abort(); 3382 break; 3383 } 3384 roundBits = zSig & 0x7F; 3385 if ( 0xFD <= (uint16_t) zExp ) { 3386 if ( ( 0xFD < zExp ) 3387 || ( ( zExp == 0xFD ) 3388 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 3389 ) { 3390 float_raise(float_flag_overflow | float_flag_inexact, status); 3391 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 3392 } 3393 if ( zExp < 0 ) { 3394 if (status->flush_to_zero) { 3395 float_raise(float_flag_output_denormal, status); 3396 return packFloat32(zSign, 0, 0); 3397 } 3398 isTiny = 3399 (status->float_detect_tininess 3400 == float_tininess_before_rounding) 3401 || ( zExp < -1 ) 3402 || ( zSig + roundIncrement < 0x80000000 ); 3403 shift32RightJamming( zSig, - zExp, &zSig ); 3404 zExp = 0; 3405 roundBits = zSig & 0x7F; 3406 if (isTiny && roundBits) { 3407 float_raise(float_flag_underflow, status); 3408 } 3409 } 3410 } 3411 if (roundBits) { 3412 status->float_exception_flags |= float_flag_inexact; 3413 } 3414 zSig = ( zSig + roundIncrement )>>7; 3415 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 3416 if ( zSig == 0 ) zExp = 0; 3417 return packFloat32( zSign, zExp, zSig ); 3418 3419 } 3420 3421 /*---------------------------------------------------------------------------- 3422 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3423 | and significand `zSig', and returns the proper single-precision floating- 3424 | point value corresponding to the abstract input. This routine is just like 3425 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 3426 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 3427 | floating-point exponent. 
3428 *----------------------------------------------------------------------------*/ 3429 3430 static float32 3431 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 3432 float_status *status) 3433 { 3434 int8_t shiftCount; 3435 3436 shiftCount = clz32(zSig) - 1; 3437 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 3438 status); 3439 3440 } 3441 3442 /*---------------------------------------------------------------------------- 3443 | If `a' is denormal and we are in flush-to-zero mode then set the 3444 | input-denormal exception and return zero. Otherwise just return the value. 3445 *----------------------------------------------------------------------------*/ 3446 float64 float64_squash_input_denormal(float64 a, float_status *status) 3447 { 3448 if (status->flush_inputs_to_zero) { 3449 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 3450 float_raise(float_flag_input_denormal, status); 3451 return make_float64(float64_val(a) & (1ULL << 63)); 3452 } 3453 } 3454 return a; 3455 } 3456 3457 /*---------------------------------------------------------------------------- 3458 | Normalizes the subnormal double-precision floating-point value represented 3459 | by the denormalized significand `aSig'. The normalized exponent and 3460 | significand are stored at the locations pointed to by `zExpPtr' and 3461 | `zSigPtr', respectively. 3462 *----------------------------------------------------------------------------*/ 3463 3464 static void 3465 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 3466 { 3467 int8_t shiftCount; 3468 3469 shiftCount = clz64(aSig) - 11; 3470 *zSigPtr = aSig<<shiftCount; 3471 *zExpPtr = 1 - shiftCount; 3472 3473 } 3474 3475 /*---------------------------------------------------------------------------- 3476 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3477 | double-precision floating-point value, returning the result. 
After being 3478 | shifted into the proper positions, the three fields are simply added 3479 | together to form the result. This means that any integer portion of `zSig' 3480 | will be added into the exponent. Since a properly normalized significand 3481 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3482 | than the desired result exponent whenever `zSig' is a complete, normalized 3483 | significand. 3484 *----------------------------------------------------------------------------*/ 3485 3486 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 3487 { 3488 3489 return make_float64( 3490 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 3491 3492 } 3493 3494 /*---------------------------------------------------------------------------- 3495 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3496 | and significand `zSig', and returns the proper double-precision floating- 3497 | point value corresponding to the abstract input. Ordinarily, the abstract 3498 | value is simply rounded and packed into the double-precision format, with 3499 | the inexact exception raised if the abstract input cannot be represented 3500 | exactly. However, if the abstract value is too large, the overflow and 3501 | inexact exceptions are raised and an infinity or maximal finite value is 3502 | returned. If the abstract value is too small, the input value is rounded to 3503 | a subnormal number, and the underflow and inexact exceptions are raised if 3504 | the abstract input cannot be represented exactly as a subnormal double- 3505 | precision floating-point number. 3506 | The input significand `zSig' has its binary point between bits 62 3507 | and 61, which is 10 bits to the left of the usual location. This shifted 3508 | significand must be normalized or smaller. 
If `zSig' is not normalized, 3509 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3510 | and it must not require rounding. In the usual case that `zSig' is 3511 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3512 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3513 | Binary Floating-Point Arithmetic. 3514 *----------------------------------------------------------------------------*/ 3515 3516 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 3517 float_status *status) 3518 { 3519 int8_t roundingMode; 3520 flag roundNearestEven; 3521 int roundIncrement, roundBits; 3522 flag isTiny; 3523 3524 roundingMode = status->float_rounding_mode; 3525 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3526 switch (roundingMode) { 3527 case float_round_nearest_even: 3528 case float_round_ties_away: 3529 roundIncrement = 0x200; 3530 break; 3531 case float_round_to_zero: 3532 roundIncrement = 0; 3533 break; 3534 case float_round_up: 3535 roundIncrement = zSign ? 0 : 0x3ff; 3536 break; 3537 case float_round_down: 3538 roundIncrement = zSign ? 0x3ff : 0; 3539 break; 3540 case float_round_to_odd: 3541 roundIncrement = (zSig & 0x400) ? 
0 : 0x3ff; 3542 break; 3543 default: 3544 abort(); 3545 } 3546 roundBits = zSig & 0x3FF; 3547 if ( 0x7FD <= (uint16_t) zExp ) { 3548 if ( ( 0x7FD < zExp ) 3549 || ( ( zExp == 0x7FD ) 3550 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 3551 ) { 3552 bool overflow_to_inf = roundingMode != float_round_to_odd && 3553 roundIncrement != 0; 3554 float_raise(float_flag_overflow | float_flag_inexact, status); 3555 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 3556 } 3557 if ( zExp < 0 ) { 3558 if (status->flush_to_zero) { 3559 float_raise(float_flag_output_denormal, status); 3560 return packFloat64(zSign, 0, 0); 3561 } 3562 isTiny = 3563 (status->float_detect_tininess 3564 == float_tininess_before_rounding) 3565 || ( zExp < -1 ) 3566 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 3567 shift64RightJamming( zSig, - zExp, &zSig ); 3568 zExp = 0; 3569 roundBits = zSig & 0x3FF; 3570 if (isTiny && roundBits) { 3571 float_raise(float_flag_underflow, status); 3572 } 3573 if (roundingMode == float_round_to_odd) { 3574 /* 3575 * For round-to-odd case, the roundIncrement depends on 3576 * zSig which just changed. 3577 */ 3578 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 3579 } 3580 } 3581 } 3582 if (roundBits) { 3583 status->float_exception_flags |= float_flag_inexact; 3584 } 3585 zSig = ( zSig + roundIncrement )>>10; 3586 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 3587 if ( zSig == 0 ) zExp = 0; 3588 return packFloat64( zSign, zExp, zSig ); 3589 3590 } 3591 3592 /*---------------------------------------------------------------------------- 3593 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3594 | and significand `zSig', and returns the proper double-precision floating- 3595 | point value corresponding to the abstract input. This routine is just like 3596 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 
3597 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 3598 | floating-point exponent. 3599 *----------------------------------------------------------------------------*/ 3600 3601 static float64 3602 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 3603 float_status *status) 3604 { 3605 int8_t shiftCount; 3606 3607 shiftCount = clz64(zSig) - 1; 3608 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 3609 status); 3610 3611 } 3612 3613 /*---------------------------------------------------------------------------- 3614 | Normalizes the subnormal extended double-precision floating-point value 3615 | represented by the denormalized significand `aSig'. The normalized exponent 3616 | and significand are stored at the locations pointed to by `zExpPtr' and 3617 | `zSigPtr', respectively. 3618 *----------------------------------------------------------------------------*/ 3619 3620 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 3621 uint64_t *zSigPtr) 3622 { 3623 int8_t shiftCount; 3624 3625 shiftCount = clz64(aSig); 3626 *zSigPtr = aSig<<shiftCount; 3627 *zExpPtr = 1 - shiftCount; 3628 } 3629 3630 /*---------------------------------------------------------------------------- 3631 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3632 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 3633 | and returns the proper extended double-precision floating-point value 3634 | corresponding to the abstract input. Ordinarily, the abstract value is 3635 | rounded and packed into the extended double-precision format, with the 3636 | inexact exception raised if the abstract input cannot be represented 3637 | exactly. However, if the abstract value is too large, the overflow and 3638 | inexact exceptions are raised and an infinity or maximal finite value is 3639 | returned. 
| If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal extended
| double-precision floating-point number.
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
| number of bits as single or double precision, respectively.  Otherwise, the
| result is rounded to the full precision of the extended double-precision
| format.
|     The input significand must be normalized or smaller.  If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding.  The
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| Structure: the function has two bodies.  The first handles the reduced
| precisions (32 and 64) using `roundIncrement'/`roundMask' on `zSig0' alone;
| the second, at label `precision80', rounds on `zSig1'.  Both share the
| overflow return at label `overflow', which lives inside the second body and
| is entered by `goto' from the first.
*----------------------------------------------------------------------------*/

floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    if ( roundingPrecision == 80 ) goto precision80;
    /*
     * Reduced precision: roundMask covers the bits of zSig0 that will be
     * cleared, roundIncrement is half of (roundMask + 1).
     */
    if ( roundingPrecision == 64 ) {
        roundIncrement = LIT64( 0x0000000000000400 );
        roundMask = LIT64( 0x00000000000007FF );
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = LIT64( 0x0000008000000000 );
        roundMask = LIT64( 0x000000FFFFFFFFFF );
    }
    else {
        /* Any other value is treated as full precision. */
        goto precision80;
    }
    /* Fold any non-zero low word into bit 0 of zSig0 as a sticky bit. */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Keep the half-way increment chosen above. */
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /* True when zExp is <= 0 or >= 0x7FFE (unsigned compare on zExp - 1). */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            /* roundMask is still set here and is used by the overflow code. */
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ( zSig0 <= zSig0 + roundIncrement );
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            /* The shift changed the discarded bits; re-read them. */
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                status->float_exception_flags |= float_flag_inexact;
            }
            zSig0 += roundIncrement;
            /* Rounding set the top bit: the exponent field becomes 1. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            /* roundIncrement is reused as the value of the result's low bit. */
            roundIncrement = roundMask + 1;
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                /* Exact tie: also clear the result's low bit. */
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig0 += roundIncrement;
    /* The addition wrapped: carry into the exponent. */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = LIT64( 0x8000000000000000 );
    }
    /* roundIncrement is reused as the value of the result's low bit. */
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        /* Exact tie: also clear the result's low bit. */
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full precision: zSig1 holds the bits to be rounded away. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Top bit of zSig1 set: the discarded part is at least one half. */
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    /* True when zExp is <= 0 or >= 0x7FFE (unsigned compare on zExp - 1). */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
                  && increment
                )
           ) {
            /* With a zero mask, ~roundMask below is an all-ones significand. */
            roundMask = 0;
 overflow:
            /* Shared overflow exit; also the `goto' target from the
             * reduced-precision body above. */
            float_raise(float_flag_overflow | float_flag_inexact, status);
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            /*
             * NOTE(review): unlike the reduced-precision body, this branch
             * does not consult status->flush_to_zero -- confirm intended.
             */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ! increment
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                status->float_exception_flags |= float_flag_inexact;
            }
            /* zSig1 changed in the shift, so the increment is recomputed. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Exact tie in nearest-even mode: clear bit 0. */
                zSig0 &=
                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
                /* Rounding set the top bit: the exponent field becomes 1. */
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        status->float_exception_flags |= float_flag_inexact;
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* The increment wrapped: carry into the exponent. */
            ++zExp;
            zSig0 = LIT64( 0x8000000000000000 );
        }
        else {
            /* Exact tie in nearest-even mode: clear bit 0. */
            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input.  This routine is just like
| `roundAndPackFloatx80' except that the input significand does not have to be
| normalized.
*----------------------------------------------------------------------------*/

floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
                                       flag zSign, int32_t zExp,
                                       uint64_t zSig0, uint64_t zSig1,
                                       float_status *status)
{
    int8_t shiftCount;

    /* High word empty: promote the low word, a shift of 64 bits. */
    if ( zSig0 == 0 ) {
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    /* Shift out the remaining leading zeros and adjust the exponent. */
    shiftCount = clz64(zSig0);
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    zExp -= shiftCount;
    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
                                zSig0, zSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the least-significant 64 fraction bits of the quadruple-precision
| floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint64_t extractFloat128Frac1( float128 a )
{

    /* The low fraction bits are exactly the `low' word. */
    return a.low;

}

/*----------------------------------------------------------------------------
| Returns the most-significant 48 fraction bits of the quadruple-precision
| floating-point value `a'.
3887 *----------------------------------------------------------------------------*/ 3888 3889 static inline uint64_t extractFloat128Frac0( float128 a ) 3890 { 3891 3892 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 3893 3894 } 3895 3896 /*---------------------------------------------------------------------------- 3897 | Returns the exponent bits of the quadruple-precision floating-point value 3898 | `a'. 3899 *----------------------------------------------------------------------------*/ 3900 3901 static inline int32_t extractFloat128Exp( float128 a ) 3902 { 3903 3904 return ( a.high>>48 ) & 0x7FFF; 3905 3906 } 3907 3908 /*---------------------------------------------------------------------------- 3909 | Returns the sign bit of the quadruple-precision floating-point value `a'. 3910 *----------------------------------------------------------------------------*/ 3911 3912 static inline flag extractFloat128Sign( float128 a ) 3913 { 3914 3915 return a.high>>63; 3916 3917 } 3918 3919 /*---------------------------------------------------------------------------- 3920 | Normalizes the subnormal quadruple-precision floating-point value 3921 | represented by the denormalized significand formed by the concatenation of 3922 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 3923 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 3924 | significand are stored at the location pointed to by `zSig0Ptr', and the 3925 | least significant 64 bits of the normalized significand are stored at the 3926 | location pointed to by `zSig1Ptr'. 
3927 *----------------------------------------------------------------------------*/ 3928 3929 static void 3930 normalizeFloat128Subnormal( 3931 uint64_t aSig0, 3932 uint64_t aSig1, 3933 int32_t *zExpPtr, 3934 uint64_t *zSig0Ptr, 3935 uint64_t *zSig1Ptr 3936 ) 3937 { 3938 int8_t shiftCount; 3939 3940 if ( aSig0 == 0 ) { 3941 shiftCount = clz64(aSig1) - 15; 3942 if ( shiftCount < 0 ) { 3943 *zSig0Ptr = aSig1>>( - shiftCount ); 3944 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 3945 } 3946 else { 3947 *zSig0Ptr = aSig1<<shiftCount; 3948 *zSig1Ptr = 0; 3949 } 3950 *zExpPtr = - shiftCount - 63; 3951 } 3952 else { 3953 shiftCount = clz64(aSig0) - 15; 3954 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 3955 *zExpPtr = 1 - shiftCount; 3956 } 3957 3958 } 3959 3960 /*---------------------------------------------------------------------------- 3961 | Packs the sign `zSign', the exponent `zExp', and the significand formed 3962 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 3963 | floating-point value, returning the result. After being shifted into the 3964 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 3965 | added together to form the most significant 32 bits of the result. This 3966 | means that any integer portion of `zSig0' will be added into the exponent. 3967 | Since a properly normalized significand will have an integer portion equal 3968 | to 1, the `zExp' input should be 1 less than the desired result exponent 3969 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 3970 | significand. 
3971 *----------------------------------------------------------------------------*/ 3972 3973 static inline float128 3974 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 3975 { 3976 float128 z; 3977 3978 z.low = zSig1; 3979 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 3980 return z; 3981 3982 } 3983 3984 /*---------------------------------------------------------------------------- 3985 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3986 | and extended significand formed by the concatenation of `zSig0', `zSig1', 3987 | and `zSig2', and returns the proper quadruple-precision floating-point value 3988 | corresponding to the abstract input. Ordinarily, the abstract value is 3989 | simply rounded and packed into the quadruple-precision format, with the 3990 | inexact exception raised if the abstract input cannot be represented 3991 | exactly. However, if the abstract value is too large, the overflow and 3992 | inexact exceptions are raised and an infinity or maximal finite value is 3993 | returned. If the abstract value is too small, the input value is rounded to 3994 | a subnormal number, and the underflow and inexact exceptions are raised if 3995 | the abstract input cannot be represented exactly as a subnormal quadruple- 3996 | precision floating-point number. 3997 | The input significand must be normalized or smaller. If the input 3998 | significand is not normalized, `zExp' must be 0; in that case, the result 3999 | returned is a subnormal number, and it must not require rounding. In the 4000 | usual case that the input significand is normalized, `zExp' must be 1 less 4001 | than the ``true'' floating-point exponent. The handling of underflow and 4002 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

/*
 * Reading guide: `zSig2' holds the bits that are rounded away and `increment'
 * records whether the 128-bit value zSig0:zSig1 is bumped by one.  The
 * decision is computed once up front and recomputed after a subnormal shift,
 * because that shift rewrites zSig1 and zSig2.
 */
static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Top bit of zSig2 set: the discarded part is at least one half. */
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        /* Bump only an inexact value whose low result bit is clear. */
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    /* True for zExp >= 0x7FFD and, via the unsigned cast, negative zExp. */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        /*
         * Overflow: exponent above 0x7FFD, or exactly 0x7FFD with an
         * all-ones significand that is about to be incremented.
         */
        if (    ( 0x7FFD < zExp )
             || (    ( zExp == 0x7FFD )
                  && eq128(
                         LIT64( 0x0001FFFFFFFFFFFF ),
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* These modes return the value packed below with exponent
             * 0x7FFE instead of the 0x7FFF result. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        LIT64( 0x0000FFFFFFFFFFFF ),
                        LIT64( 0xFFFFFFFFFFFFFFFF )
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < -1 )
                || ! increment
                || lt128(
                       zSig0,
                       zSig1,
                       LIT64( 0x0001FFFFFFFFFFFF ),
                       LIT64( 0xFFFFFFFFFFFFFFFF )
                   );
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* zSig1/zSig2 changed in the shift: recompute the increment. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        status->float_exception_flags |= float_flag_inexact;
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* Exact tie in nearest-even mode: clear bit 0 of the low word. */
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
    }
    else {
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand formed by the concatenation of `zSig0' and `zSig1', and
| returns the proper quadruple-precision floating-point value corresponding
| to the abstract input.  This routine is just like `roundAndPackFloat128'
| except that the input significand has fewer bits and does not have to be
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
| point exponent.
4128 *----------------------------------------------------------------------------*/ 4129 4130 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 4131 uint64_t zSig0, uint64_t zSig1, 4132 float_status *status) 4133 { 4134 int8_t shiftCount; 4135 uint64_t zSig2; 4136 4137 if ( zSig0 == 0 ) { 4138 zSig0 = zSig1; 4139 zSig1 = 0; 4140 zExp -= 64; 4141 } 4142 shiftCount = clz64(zSig0) - 15; 4143 if ( 0 <= shiftCount ) { 4144 zSig2 = 0; 4145 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4146 } 4147 else { 4148 shift128ExtraRightJamming( 4149 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 4150 } 4151 zExp -= shiftCount; 4152 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 4153 4154 } 4155 4156 4157 /*---------------------------------------------------------------------------- 4158 | Returns the result of converting the 32-bit two's complement integer `a' 4159 | to the extended double-precision floating-point format. The conversion 4160 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4161 | Arithmetic. 4162 *----------------------------------------------------------------------------*/ 4163 4164 floatx80 int32_to_floatx80(int32_t a, float_status *status) 4165 { 4166 flag zSign; 4167 uint32_t absA; 4168 int8_t shiftCount; 4169 uint64_t zSig; 4170 4171 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4172 zSign = ( a < 0 ); 4173 absA = zSign ? - a : a; 4174 shiftCount = clz32(absA) + 32; 4175 zSig = absA; 4176 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 4177 4178 } 4179 4180 /*---------------------------------------------------------------------------- 4181 | Returns the result of converting the 32-bit two's complement integer `a' to 4182 | the quadruple-precision floating-point format. The conversion is performed 4183 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4184 *----------------------------------------------------------------------------*/ 4185 4186 float128 int32_to_float128(int32_t a, float_status *status) 4187 { 4188 flag zSign; 4189 uint32_t absA; 4190 int8_t shiftCount; 4191 uint64_t zSig0; 4192 4193 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4194 zSign = ( a < 0 ); 4195 absA = zSign ? - a : a; 4196 shiftCount = clz32(absA) + 17; 4197 zSig0 = absA; 4198 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 4199 4200 } 4201 4202 /*---------------------------------------------------------------------------- 4203 | Returns the result of converting the 64-bit two's complement integer `a' 4204 | to the extended double-precision floating-point format. The conversion 4205 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4206 | Arithmetic. 4207 *----------------------------------------------------------------------------*/ 4208 4209 floatx80 int64_to_floatx80(int64_t a, float_status *status) 4210 { 4211 flag zSign; 4212 uint64_t absA; 4213 int8_t shiftCount; 4214 4215 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4216 zSign = ( a < 0 ); 4217 absA = zSign ? - a : a; 4218 shiftCount = clz64(absA); 4219 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 4220 4221 } 4222 4223 /*---------------------------------------------------------------------------- 4224 | Returns the result of converting the 64-bit two's complement integer `a' to 4225 | the quadruple-precision floating-point format. The conversion is performed 4226 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4227 *----------------------------------------------------------------------------*/ 4228 4229 float128 int64_to_float128(int64_t a, float_status *status) 4230 { 4231 flag zSign; 4232 uint64_t absA; 4233 int8_t shiftCount; 4234 int32_t zExp; 4235 uint64_t zSig0, zSig1; 4236 4237 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4238 zSign = ( a < 0 ); 4239 absA = zSign ? - a : a; 4240 shiftCount = clz64(absA) + 49; 4241 zExp = 0x406E - shiftCount; 4242 if ( 64 <= shiftCount ) { 4243 zSig1 = 0; 4244 zSig0 = absA; 4245 shiftCount -= 64; 4246 } 4247 else { 4248 zSig1 = absA; 4249 zSig0 = 0; 4250 } 4251 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4252 return packFloat128( zSign, zExp, zSig0, zSig1 ); 4253 4254 } 4255 4256 /*---------------------------------------------------------------------------- 4257 | Returns the result of converting the 64-bit unsigned integer `a' 4258 | to the quadruple-precision floating-point format. The conversion is performed 4259 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4260 *----------------------------------------------------------------------------*/ 4261 4262 float128 uint64_to_float128(uint64_t a, float_status *status) 4263 { 4264 if (a == 0) { 4265 return float128_zero; 4266 } 4267 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status); 4268 } 4269 4270 /*---------------------------------------------------------------------------- 4271 | Returns the result of converting the single-precision floating-point value 4272 | `a' to the extended double-precision floating-point format. The conversion 4273 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4274 | Arithmetic. 
4275 *----------------------------------------------------------------------------*/ 4276 4277 floatx80 float32_to_floatx80(float32 a, float_status *status) 4278 { 4279 flag aSign; 4280 int aExp; 4281 uint32_t aSig; 4282 4283 a = float32_squash_input_denormal(a, status); 4284 aSig = extractFloat32Frac( a ); 4285 aExp = extractFloat32Exp( a ); 4286 aSign = extractFloat32Sign( a ); 4287 if ( aExp == 0xFF ) { 4288 if (aSig) { 4289 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 4290 } 4291 return packFloatx80(aSign, 4292 floatx80_infinity_high, 4293 floatx80_infinity_low); 4294 } 4295 if ( aExp == 0 ) { 4296 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4297 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4298 } 4299 aSig |= 0x00800000; 4300 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 4301 4302 } 4303 4304 /*---------------------------------------------------------------------------- 4305 | Returns the result of converting the single-precision floating-point value 4306 | `a' to the double-precision floating-point format. The conversion is 4307 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4308 | Arithmetic. 
4309 *----------------------------------------------------------------------------*/ 4310 4311 float128 float32_to_float128(float32 a, float_status *status) 4312 { 4313 flag aSign; 4314 int aExp; 4315 uint32_t aSig; 4316 4317 a = float32_squash_input_denormal(a, status); 4318 aSig = extractFloat32Frac( a ); 4319 aExp = extractFloat32Exp( a ); 4320 aSign = extractFloat32Sign( a ); 4321 if ( aExp == 0xFF ) { 4322 if (aSig) { 4323 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 4324 } 4325 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4326 } 4327 if ( aExp == 0 ) { 4328 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4329 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4330 --aExp; 4331 } 4332 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 4333 4334 } 4335 4336 /*---------------------------------------------------------------------------- 4337 | Returns the remainder of the single-precision floating-point value `a' 4338 | with respect to the corresponding value `b'. The operation is performed 4339 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    flag aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    /* a is NaN or infinity. */
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        /* a is infinite and b is not a NaN: invalid operation. */
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    /* b is NaN or infinity (a is finite here). */
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        /* Finite a with infinite b: a is returned unchanged. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* b is zero: invalid operation. */
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        /* a is zero: returned unchanged. */
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Set bit 23 in both significands. */
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent gap: the quotient is formed in 32 bits. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* Exponent of a is at least two below b's: a is returned as is. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /*
         * Large exponent gap: reduce in 64-bit arithmetic, retiring 62
         * exponent steps per loop iteration.  Each estimated quotient is
         * lowered by 2 (clamped at 0) before it is used.
         */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /*
     * Keep subtracting bSig until the remainder goes negative, remembering
     * the last non-negative value in alternateASig and counting in q.
     */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    /*
     * Choose between the two candidates: keep the non-negative one when the
     * negative one is larger in magnitude, or on an exact tie when q is odd.
     */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* A negative remainder flips the result sign relative to a. */
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}



/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'. The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1. -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2. -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
4456 *----------------------------------------------------------------------------*/ 4457 4458 static const float64 float32_exp2_coefficients[15] = 4459 { 4460 const_float64( 0x3ff0000000000000ll ), /* 1 */ 4461 const_float64( 0x3fe0000000000000ll ), /* 2 */ 4462 const_float64( 0x3fc5555555555555ll ), /* 3 */ 4463 const_float64( 0x3fa5555555555555ll ), /* 4 */ 4464 const_float64( 0x3f81111111111111ll ), /* 5 */ 4465 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 4466 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 4467 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 4468 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 4469 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 4470 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 4471 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 4472 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 4473 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 4474 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 4475 }; 4476 4477 float32 float32_exp2(float32 a, float_status *status) 4478 { 4479 flag aSign; 4480 int aExp; 4481 uint32_t aSig; 4482 float64 r, x, xn; 4483 int i; 4484 a = float32_squash_input_denormal(a, status); 4485 4486 aSig = extractFloat32Frac( a ); 4487 aExp = extractFloat32Exp( a ); 4488 aSign = extractFloat32Sign( a ); 4489 4490 if ( aExp == 0xFF) { 4491 if (aSig) { 4492 return propagateFloat32NaN(a, float32_zero, status); 4493 } 4494 return (aSign) ? 
float32_zero : a; 4495 } 4496 if (aExp == 0) { 4497 if (aSig == 0) return float32_one; 4498 } 4499 4500 float_raise(float_flag_inexact, status); 4501 4502 /* ******************************* */ 4503 /* using float64 for approximation */ 4504 /* ******************************* */ 4505 x = float32_to_float64(a, status); 4506 x = float64_mul(x, float64_ln2, status); 4507 4508 xn = x; 4509 r = float64_one; 4510 for (i = 0 ; i < 15 ; i++) { 4511 float64 f; 4512 4513 f = float64_mul(xn, float32_exp2_coefficients[i], status); 4514 r = float64_add(r, f, status); 4515 4516 xn = float64_mul(xn, x, status); 4517 } 4518 4519 return float64_to_float32(r, status); 4520 } 4521 4522 /*---------------------------------------------------------------------------- 4523 | Returns the binary log of the single-precision floating-point value `a'. 4524 | The operation is performed according to the IEC/IEEE Standard for Binary 4525 | Floating-Point Arithmetic. 4526 *----------------------------------------------------------------------------*/ 4527 float32 float32_log2(float32 a, float_status *status) 4528 { 4529 flag aSign, zSign; 4530 int aExp; 4531 uint32_t aSig, zSig, i; 4532 4533 a = float32_squash_input_denormal(a, status); 4534 aSig = extractFloat32Frac( a ); 4535 aExp = extractFloat32Exp( a ); 4536 aSign = extractFloat32Sign( a ); 4537 4538 if ( aExp == 0 ) { 4539 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 4540 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4541 } 4542 if ( aSign ) { 4543 float_raise(float_flag_invalid, status); 4544 return float32_default_nan(status); 4545 } 4546 if ( aExp == 0xFF ) { 4547 if (aSig) { 4548 return propagateFloat32NaN(a, float32_zero, status); 4549 } 4550 return a; 4551 } 4552 4553 aExp -= 0x7F; 4554 aSig |= 0x00800000; 4555 zSign = aExp < 0; 4556 zSig = aExp << 23; 4557 4558 for (i = 1 << 22; i > 0; i >>= 1) { 4559 aSig = ( (uint64_t)aSig * aSig ) >> 23; 4560 if ( aSig & 0x01000000 ) { 4561 aSig >>= 1; 4562 zSig |= i; 4563 } 4564 } 4565 
4566 if ( zSign ) 4567 zSig = -zSig; 4568 4569 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 4570 } 4571 4572 /*---------------------------------------------------------------------------- 4573 | Returns 1 if the single-precision floating-point value `a' is equal to 4574 | the corresponding value `b', and 0 otherwise. The invalid exception is 4575 | raised if either operand is a NaN. Otherwise, the comparison is performed 4576 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4577 *----------------------------------------------------------------------------*/ 4578 4579 int float32_eq(float32 a, float32 b, float_status *status) 4580 { 4581 uint32_t av, bv; 4582 a = float32_squash_input_denormal(a, status); 4583 b = float32_squash_input_denormal(b, status); 4584 4585 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4586 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4587 ) { 4588 float_raise(float_flag_invalid, status); 4589 return 0; 4590 } 4591 av = float32_val(a); 4592 bv = float32_val(b); 4593 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4594 } 4595 4596 /*---------------------------------------------------------------------------- 4597 | Returns 1 if the single-precision floating-point value `a' is less than 4598 | or equal to the corresponding value `b', and 0 otherwise. The invalid 4599 | exception is raised if either operand is a NaN. The comparison is performed 4600 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4601 *----------------------------------------------------------------------------*/ 4602 4603 int float32_le(float32 a, float32 b, float_status *status) 4604 { 4605 flag aSign, bSign; 4606 uint32_t av, bv; 4607 a = float32_squash_input_denormal(a, status); 4608 b = float32_squash_input_denormal(b, status); 4609 4610 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4611 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4612 ) { 4613 float_raise(float_flag_invalid, status); 4614 return 0; 4615 } 4616 aSign = extractFloat32Sign( a ); 4617 bSign = extractFloat32Sign( b ); 4618 av = float32_val(a); 4619 bv = float32_val(b); 4620 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4621 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4622 4623 } 4624 4625 /*---------------------------------------------------------------------------- 4626 | Returns 1 if the single-precision floating-point value `a' is less than 4627 | the corresponding value `b', and 0 otherwise. The invalid exception is 4628 | raised if either operand is a NaN. The comparison is performed according 4629 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4630 *----------------------------------------------------------------------------*/ 4631 4632 int float32_lt(float32 a, float32 b, float_status *status) 4633 { 4634 flag aSign, bSign; 4635 uint32_t av, bv; 4636 a = float32_squash_input_denormal(a, status); 4637 b = float32_squash_input_denormal(b, status); 4638 4639 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4640 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4641 ) { 4642 float_raise(float_flag_invalid, status); 4643 return 0; 4644 } 4645 aSign = extractFloat32Sign( a ); 4646 bSign = extractFloat32Sign( b ); 4647 av = float32_val(a); 4648 bv = float32_val(b); 4649 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 4650 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4651 4652 } 4653 4654 /*---------------------------------------------------------------------------- 4655 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 4656 | be compared, and 0 otherwise. The invalid exception is raised if either 4657 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4658 | Standard for Binary Floating-Point Arithmetic. 4659 *----------------------------------------------------------------------------*/ 4660 4661 int float32_unordered(float32 a, float32 b, float_status *status) 4662 { 4663 a = float32_squash_input_denormal(a, status); 4664 b = float32_squash_input_denormal(b, status); 4665 4666 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4667 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4668 ) { 4669 float_raise(float_flag_invalid, status); 4670 return 1; 4671 } 4672 return 0; 4673 } 4674 4675 /*---------------------------------------------------------------------------- 4676 | Returns 1 if the single-precision floating-point value `a' is equal to 4677 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4678 | exception. 
The comparison is performed according to the IEC/IEEE Standard 4679 | for Binary Floating-Point Arithmetic. 4680 *----------------------------------------------------------------------------*/ 4681 4682 int float32_eq_quiet(float32 a, float32 b, float_status *status) 4683 { 4684 a = float32_squash_input_denormal(a, status); 4685 b = float32_squash_input_denormal(b, status); 4686 4687 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4688 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4689 ) { 4690 if (float32_is_signaling_nan(a, status) 4691 || float32_is_signaling_nan(b, status)) { 4692 float_raise(float_flag_invalid, status); 4693 } 4694 return 0; 4695 } 4696 return ( float32_val(a) == float32_val(b) ) || 4697 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 4698 } 4699 4700 /*---------------------------------------------------------------------------- 4701 | Returns 1 if the single-precision floating-point value `a' is less than or 4702 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4703 | cause an exception. Otherwise, the comparison is performed according to the 4704 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4705 *----------------------------------------------------------------------------*/ 4706 4707 int float32_le_quiet(float32 a, float32 b, float_status *status) 4708 { 4709 flag aSign, bSign; 4710 uint32_t av, bv; 4711 a = float32_squash_input_denormal(a, status); 4712 b = float32_squash_input_denormal(b, status); 4713 4714 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4715 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4716 ) { 4717 if (float32_is_signaling_nan(a, status) 4718 || float32_is_signaling_nan(b, status)) { 4719 float_raise(float_flag_invalid, status); 4720 } 4721 return 0; 4722 } 4723 aSign = extractFloat32Sign( a ); 4724 bSign = extractFloat32Sign( b ); 4725 av = float32_val(a); 4726 bv = float32_val(b); 4727 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4728 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4729 4730 } 4731 4732 /*---------------------------------------------------------------------------- 4733 | Returns 1 if the single-precision floating-point value `a' is less than 4734 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4735 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4736 | Standard for Binary Floating-Point Arithmetic. 
4737 *----------------------------------------------------------------------------*/ 4738 4739 int float32_lt_quiet(float32 a, float32 b, float_status *status) 4740 { 4741 flag aSign, bSign; 4742 uint32_t av, bv; 4743 a = float32_squash_input_denormal(a, status); 4744 b = float32_squash_input_denormal(b, status); 4745 4746 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4747 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4748 ) { 4749 if (float32_is_signaling_nan(a, status) 4750 || float32_is_signaling_nan(b, status)) { 4751 float_raise(float_flag_invalid, status); 4752 } 4753 return 0; 4754 } 4755 aSign = extractFloat32Sign( a ); 4756 bSign = extractFloat32Sign( b ); 4757 av = float32_val(a); 4758 bv = float32_val(b); 4759 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 4760 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4761 4762 } 4763 4764 /*---------------------------------------------------------------------------- 4765 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 4766 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4767 | comparison is performed according to the IEC/IEEE Standard for Binary 4768 | Floating-Point Arithmetic. 
4769 *----------------------------------------------------------------------------*/ 4770 4771 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 4772 { 4773 a = float32_squash_input_denormal(a, status); 4774 b = float32_squash_input_denormal(b, status); 4775 4776 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4777 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4778 ) { 4779 if (float32_is_signaling_nan(a, status) 4780 || float32_is_signaling_nan(b, status)) { 4781 float_raise(float_flag_invalid, status); 4782 } 4783 return 1; 4784 } 4785 return 0; 4786 } 4787 4788 /*---------------------------------------------------------------------------- 4789 | If `a' is denormal and we are in flush-to-zero mode then set the 4790 | input-denormal exception and return zero. Otherwise just return the value. 4791 *----------------------------------------------------------------------------*/ 4792 float16 float16_squash_input_denormal(float16 a, float_status *status) 4793 { 4794 if (status->flush_inputs_to_zero) { 4795 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) { 4796 float_raise(float_flag_input_denormal, status); 4797 return make_float16(float16_val(a) & 0x8000); 4798 } 4799 } 4800 return a; 4801 } 4802 4803 /*---------------------------------------------------------------------------- 4804 | Returns the result of converting the double-precision floating-point value 4805 | `a' to the extended double-precision floating-point format. The conversion 4806 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4807 | Arithmetic. 
4808 *----------------------------------------------------------------------------*/ 4809 4810 floatx80 float64_to_floatx80(float64 a, float_status *status) 4811 { 4812 flag aSign; 4813 int aExp; 4814 uint64_t aSig; 4815 4816 a = float64_squash_input_denormal(a, status); 4817 aSig = extractFloat64Frac( a ); 4818 aExp = extractFloat64Exp( a ); 4819 aSign = extractFloat64Sign( a ); 4820 if ( aExp == 0x7FF ) { 4821 if (aSig) { 4822 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 4823 } 4824 return packFloatx80(aSign, 4825 floatx80_infinity_high, 4826 floatx80_infinity_low); 4827 } 4828 if ( aExp == 0 ) { 4829 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4830 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4831 } 4832 return 4833 packFloatx80( 4834 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 4835 4836 } 4837 4838 /*---------------------------------------------------------------------------- 4839 | Returns the result of converting the double-precision floating-point value 4840 | `a' to the quadruple-precision floating-point format. The conversion is 4841 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4842 | Arithmetic. 
4843 *----------------------------------------------------------------------------*/ 4844 4845 float128 float64_to_float128(float64 a, float_status *status) 4846 { 4847 flag aSign; 4848 int aExp; 4849 uint64_t aSig, zSig0, zSig1; 4850 4851 a = float64_squash_input_denormal(a, status); 4852 aSig = extractFloat64Frac( a ); 4853 aExp = extractFloat64Exp( a ); 4854 aSign = extractFloat64Sign( a ); 4855 if ( aExp == 0x7FF ) { 4856 if (aSig) { 4857 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 4858 } 4859 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4860 } 4861 if ( aExp == 0 ) { 4862 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4863 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4864 --aExp; 4865 } 4866 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 4867 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 4868 4869 } 4870 4871 4872 /*---------------------------------------------------------------------------- 4873 | Returns the remainder of the double-precision floating-point value `a' 4874 | with respect to the corresponding value `b'. The operation is performed 4875 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float64 float64_rem(float64 a, float64 b, float_status *status)
{
    flag aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    /* a is NaN or infinity. */
    if ( aExp == 0x7FF ) {
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        /* a is infinite and b is not a NaN: invalid operation. */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    /* b is NaN or infinity (a is finite here). */
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        /* Finite a with infinite b: a is returned unchanged. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* b is zero: invalid operation. */
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        /* a is zero: returned unchanged. */
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Set bit 52 and left-align both significands at bit 63. */
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
    if ( expDiff < 0 ) {
        /* Exponent of a is at least two below b's: a is returned as is. */
        if ( expDiff < -1 ) return a;
        aSig >>= 1;
    }
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    /*
     * Retire 62 exponent steps per iteration.  Each estimated quotient is
     * lowered by 2 (clamped at 0) before it is used.
     */
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /*
     * Keep subtracting bSig until the remainder goes negative, remembering
     * the last non-negative value in alternateASig and counting in q.
     */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    /*
     * Choose between the two candidates: keep the non-negative one when the
     * negative one is larger in magnitude, or on an exact tie when q is odd.
     */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* A negative remainder flips the result sign relative to a. */
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);

}

/*----------------------------------------------------------------------------
| Returns the binary log of the double-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
float64 float64_log2(float64 a, float_status *status)
{
    flag aSign, zSign;
    int aExp;
    uint64_t aSig, aSig0, aSig1, zSig, i;
    a = float64_squash_input_denormal(a, status);

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );

    if ( aExp == 0 ) {
        /* Zero of either sign returns negative infinity. */
        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    /*
     * NOTE(review): the sign test comes before the NaN test, so a NaN with
     * its sign bit set raises invalid and returns the default NaN instead
     * of being propagated.
     */
    if ( aSign ) {
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return propagateFloat64NaN(a, float64_zero, status);
        }
        /* +infinity is returned unchanged. */
        return a;
    }

    aExp -= 0x3FF;
    aSig |= LIT64( 0x0010000000000000 );
    zSign = aExp < 0;
    /* The unsigned cast keeps the shift defined when aExp is negative. */
    zSig = (uint64_t)aExp << 52;
    /*
     * Produce the low 52 result bits one at a time: square the significand
     * and rescale it; when the square reaches bit 53, halve it and set the
     * current result bit.
     */
    for (i = 1LL << 51; i > 0; i >>= 1) {
        mul64To128( aSig, aSig, &aSig0, &aSig1 );
        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
        if ( aSig & LIT64( 0x0020000000000000 ) ) {
            aSig >>= 1;
            zSig |= i;
        }
    }

    /* zSig holds a two's complement value; convert to sign + magnitude. */
    if ( zSign )
        zSig = -zSig;
    return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
}

/*----------------------------------------------------------------------------
| Returns 1 if the double-precision floating-point value `a' is equal to the
| corresponding value `b', and 0 otherwise.  The invalid exception is raised
| if either operand is a NaN.  Otherwise, the comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int float64_eq(float64 a, float64 b, float_status *status)
{
    uint64_t av, bv;
    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);

    /* Any NaN operand: raise invalid and report "not equal". */
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
       ) {
        float_raise(float_flag_invalid, status);
        return 0;
    }
    av = float64_val(a);
    bv = float64_val(b);
    /* Equal bit patterns, or both zero once the sign bit is shifted out. */
    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );

}

/*----------------------------------------------------------------------------
| Returns 1 if the double-precision floating-point value `a' is less than or
| equal to the corresponding value `b', and 0 otherwise.  The invalid
| exception is raised if either operand is a NaN.  The comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5039 *----------------------------------------------------------------------------*/ 5040 5041 int float64_le(float64 a, float64 b, float_status *status) 5042 { 5043 flag aSign, bSign; 5044 uint64_t av, bv; 5045 a = float64_squash_input_denormal(a, status); 5046 b = float64_squash_input_denormal(b, status); 5047 5048 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5049 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5050 ) { 5051 float_raise(float_flag_invalid, status); 5052 return 0; 5053 } 5054 aSign = extractFloat64Sign( a ); 5055 bSign = extractFloat64Sign( b ); 5056 av = float64_val(a); 5057 bv = float64_val(b); 5058 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 5059 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 5060 5061 } 5062 5063 /*---------------------------------------------------------------------------- 5064 | Returns 1 if the double-precision floating-point value `a' is less than 5065 | the corresponding value `b', and 0 otherwise. The invalid exception is 5066 | raised if either operand is a NaN. The comparison is performed according 5067 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5068 *----------------------------------------------------------------------------*/ 5069 5070 int float64_lt(float64 a, float64 b, float_status *status) 5071 { 5072 flag aSign, bSign; 5073 uint64_t av, bv; 5074 5075 a = float64_squash_input_denormal(a, status); 5076 b = float64_squash_input_denormal(b, status); 5077 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5078 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5079 ) { 5080 float_raise(float_flag_invalid, status); 5081 return 0; 5082 } 5083 aSign = extractFloat64Sign( a ); 5084 bSign = extractFloat64Sign( b ); 5085 av = float64_val(a); 5086 bv = float64_val(b); 5087 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 5088 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 5089 5090 } 5091 5092 /*---------------------------------------------------------------------------- 5093 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 5094 | be compared, and 0 otherwise. The invalid exception is raised if either 5095 | operand is a NaN. The comparison is performed according to the IEC/IEEE 5096 | Standard for Binary Floating-Point Arithmetic. 5097 *----------------------------------------------------------------------------*/ 5098 5099 int float64_unordered(float64 a, float64 b, float_status *status) 5100 { 5101 a = float64_squash_input_denormal(a, status); 5102 b = float64_squash_input_denormal(b, status); 5103 5104 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5105 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5106 ) { 5107 float_raise(float_flag_invalid, status); 5108 return 1; 5109 } 5110 return 0; 5111 } 5112 5113 /*---------------------------------------------------------------------------- 5114 | Returns 1 if the double-precision floating-point value `a' is equal to the 5115 | corresponding value `b', and 0 otherwise. 
Quiet NaNs do not cause an 5116 | exception.The comparison is performed according to the IEC/IEEE Standard 5117 | for Binary Floating-Point Arithmetic. 5118 *----------------------------------------------------------------------------*/ 5119 5120 int float64_eq_quiet(float64 a, float64 b, float_status *status) 5121 { 5122 uint64_t av, bv; 5123 a = float64_squash_input_denormal(a, status); 5124 b = float64_squash_input_denormal(b, status); 5125 5126 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5127 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5128 ) { 5129 if (float64_is_signaling_nan(a, status) 5130 || float64_is_signaling_nan(b, status)) { 5131 float_raise(float_flag_invalid, status); 5132 } 5133 return 0; 5134 } 5135 av = float64_val(a); 5136 bv = float64_val(b); 5137 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 5138 5139 } 5140 5141 /*---------------------------------------------------------------------------- 5142 | Returns 1 if the double-precision floating-point value `a' is less than or 5143 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5144 | cause an exception. Otherwise, the comparison is performed according to the 5145 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5146 *----------------------------------------------------------------------------*/ 5147 5148 int float64_le_quiet(float64 a, float64 b, float_status *status) 5149 { 5150 flag aSign, bSign; 5151 uint64_t av, bv; 5152 a = float64_squash_input_denormal(a, status); 5153 b = float64_squash_input_denormal(b, status); 5154 5155 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5156 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5157 ) { 5158 if (float64_is_signaling_nan(a, status) 5159 || float64_is_signaling_nan(b, status)) { 5160 float_raise(float_flag_invalid, status); 5161 } 5162 return 0; 5163 } 5164 aSign = extractFloat64Sign( a ); 5165 bSign = extractFloat64Sign( b ); 5166 av = float64_val(a); 5167 bv = float64_val(b); 5168 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 5169 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 5170 5171 } 5172 5173 /*---------------------------------------------------------------------------- 5174 | Returns 1 if the double-precision floating-point value `a' is less than 5175 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5176 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 5177 | Standard for Binary Floating-Point Arithmetic. 
5178 *----------------------------------------------------------------------------*/ 5179 5180 int float64_lt_quiet(float64 a, float64 b, float_status *status) 5181 { 5182 flag aSign, bSign; 5183 uint64_t av, bv; 5184 a = float64_squash_input_denormal(a, status); 5185 b = float64_squash_input_denormal(b, status); 5186 5187 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5188 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5189 ) { 5190 if (float64_is_signaling_nan(a, status) 5191 || float64_is_signaling_nan(b, status)) { 5192 float_raise(float_flag_invalid, status); 5193 } 5194 return 0; 5195 } 5196 aSign = extractFloat64Sign( a ); 5197 bSign = extractFloat64Sign( b ); 5198 av = float64_val(a); 5199 bv = float64_val(b); 5200 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 5201 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 5202 5203 } 5204 5205 /*---------------------------------------------------------------------------- 5206 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 5207 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 5208 | comparison is performed according to the IEC/IEEE Standard for Binary 5209 | Floating-Point Arithmetic. 
5210 *----------------------------------------------------------------------------*/ 5211 5212 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 5213 { 5214 a = float64_squash_input_denormal(a, status); 5215 b = float64_squash_input_denormal(b, status); 5216 5217 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5218 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5219 ) { 5220 if (float64_is_signaling_nan(a, status) 5221 || float64_is_signaling_nan(b, status)) { 5222 float_raise(float_flag_invalid, status); 5223 } 5224 return 1; 5225 } 5226 return 0; 5227 } 5228 5229 /*---------------------------------------------------------------------------- 5230 | Returns the result of converting the extended double-precision floating- 5231 | point value `a' to the 32-bit two's complement integer format. The 5232 | conversion is performed according to the IEC/IEEE Standard for Binary 5233 | Floating-Point Arithmetic---which means in particular that the conversion 5234 | is rounded according to the current rounding mode. If `a' is a NaN, the 5235 | largest positive integer is returned. Otherwise, if the conversion 5236 | overflows, the largest integer with the same sign as `a' is returned. 
5237 *----------------------------------------------------------------------------*/ 5238 5239 int32_t floatx80_to_int32(floatx80 a, float_status *status) 5240 { 5241 flag aSign; 5242 int32_t aExp, shiftCount; 5243 uint64_t aSig; 5244 5245 if (floatx80_invalid_encoding(a)) { 5246 float_raise(float_flag_invalid, status); 5247 return 1 << 31; 5248 } 5249 aSig = extractFloatx80Frac( a ); 5250 aExp = extractFloatx80Exp( a ); 5251 aSign = extractFloatx80Sign( a ); 5252 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5253 shiftCount = 0x4037 - aExp; 5254 if ( shiftCount <= 0 ) shiftCount = 1; 5255 shift64RightJamming( aSig, shiftCount, &aSig ); 5256 return roundAndPackInt32(aSign, aSig, status); 5257 5258 } 5259 5260 /*---------------------------------------------------------------------------- 5261 | Returns the result of converting the extended double-precision floating- 5262 | point value `a' to the 32-bit two's complement integer format. The 5263 | conversion is performed according to the IEC/IEEE Standard for Binary 5264 | Floating-Point Arithmetic, except that the conversion is always rounded 5265 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5266 | Otherwise, if the conversion overflows, the largest integer with the same 5267 | sign as `a' is returned. 
5268 *----------------------------------------------------------------------------*/ 5269 5270 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 5271 { 5272 flag aSign; 5273 int32_t aExp, shiftCount; 5274 uint64_t aSig, savedASig; 5275 int32_t z; 5276 5277 if (floatx80_invalid_encoding(a)) { 5278 float_raise(float_flag_invalid, status); 5279 return 1 << 31; 5280 } 5281 aSig = extractFloatx80Frac( a ); 5282 aExp = extractFloatx80Exp( a ); 5283 aSign = extractFloatx80Sign( a ); 5284 if ( 0x401E < aExp ) { 5285 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5286 goto invalid; 5287 } 5288 else if ( aExp < 0x3FFF ) { 5289 if (aExp || aSig) { 5290 status->float_exception_flags |= float_flag_inexact; 5291 } 5292 return 0; 5293 } 5294 shiftCount = 0x403E - aExp; 5295 savedASig = aSig; 5296 aSig >>= shiftCount; 5297 z = aSig; 5298 if ( aSign ) z = - z; 5299 if ( ( z < 0 ) ^ aSign ) { 5300 invalid: 5301 float_raise(float_flag_invalid, status); 5302 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5303 } 5304 if ( ( aSig<<shiftCount ) != savedASig ) { 5305 status->float_exception_flags |= float_flag_inexact; 5306 } 5307 return z; 5308 5309 } 5310 5311 /*---------------------------------------------------------------------------- 5312 | Returns the result of converting the extended double-precision floating- 5313 | point value `a' to the 64-bit two's complement integer format. The 5314 | conversion is performed according to the IEC/IEEE Standard for Binary 5315 | Floating-Point Arithmetic---which means in particular that the conversion 5316 | is rounded according to the current rounding mode. If `a' is a NaN, 5317 | the largest positive integer is returned. Otherwise, if the conversion 5318 | overflows, the largest integer with the same sign as `a' is returned. 
5319 *----------------------------------------------------------------------------*/ 5320 5321 int64_t floatx80_to_int64(floatx80 a, float_status *status) 5322 { 5323 flag aSign; 5324 int32_t aExp, shiftCount; 5325 uint64_t aSig, aSigExtra; 5326 5327 if (floatx80_invalid_encoding(a)) { 5328 float_raise(float_flag_invalid, status); 5329 return 1ULL << 63; 5330 } 5331 aSig = extractFloatx80Frac( a ); 5332 aExp = extractFloatx80Exp( a ); 5333 aSign = extractFloatx80Sign( a ); 5334 shiftCount = 0x403E - aExp; 5335 if ( shiftCount <= 0 ) { 5336 if ( shiftCount ) { 5337 float_raise(float_flag_invalid, status); 5338 if (!aSign || floatx80_is_any_nan(a)) { 5339 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5340 } 5341 return (int64_t) LIT64( 0x8000000000000000 ); 5342 } 5343 aSigExtra = 0; 5344 } 5345 else { 5346 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 5347 } 5348 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 5349 5350 } 5351 5352 /*---------------------------------------------------------------------------- 5353 | Returns the result of converting the extended double-precision floating- 5354 | point value `a' to the 64-bit two's complement integer format. The 5355 | conversion is performed according to the IEC/IEEE Standard for Binary 5356 | Floating-Point Arithmetic, except that the conversion is always rounded 5357 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5358 | Otherwise, if the conversion overflows, the largest integer with the same 5359 | sign as `a' is returned. 
5360 *----------------------------------------------------------------------------*/ 5361 5362 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 5363 { 5364 flag aSign; 5365 int32_t aExp, shiftCount; 5366 uint64_t aSig; 5367 int64_t z; 5368 5369 if (floatx80_invalid_encoding(a)) { 5370 float_raise(float_flag_invalid, status); 5371 return 1ULL << 63; 5372 } 5373 aSig = extractFloatx80Frac( a ); 5374 aExp = extractFloatx80Exp( a ); 5375 aSign = extractFloatx80Sign( a ); 5376 shiftCount = aExp - 0x403E; 5377 if ( 0 <= shiftCount ) { 5378 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 5379 if ( ( a.high != 0xC03E ) || aSig ) { 5380 float_raise(float_flag_invalid, status); 5381 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 5382 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5383 } 5384 } 5385 return (int64_t) LIT64( 0x8000000000000000 ); 5386 } 5387 else if ( aExp < 0x3FFF ) { 5388 if (aExp | aSig) { 5389 status->float_exception_flags |= float_flag_inexact; 5390 } 5391 return 0; 5392 } 5393 z = aSig>>( - shiftCount ); 5394 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 5395 status->float_exception_flags |= float_flag_inexact; 5396 } 5397 if ( aSign ) z = - z; 5398 return z; 5399 5400 } 5401 5402 /*---------------------------------------------------------------------------- 5403 | Returns the result of converting the extended double-precision floating- 5404 | point value `a' to the single-precision floating-point format. The 5405 | conversion is performed according to the IEC/IEEE Standard for Binary 5406 | Floating-Point Arithmetic. 
5407 *----------------------------------------------------------------------------*/ 5408 5409 float32 floatx80_to_float32(floatx80 a, float_status *status) 5410 { 5411 flag aSign; 5412 int32_t aExp; 5413 uint64_t aSig; 5414 5415 if (floatx80_invalid_encoding(a)) { 5416 float_raise(float_flag_invalid, status); 5417 return float32_default_nan(status); 5418 } 5419 aSig = extractFloatx80Frac( a ); 5420 aExp = extractFloatx80Exp( a ); 5421 aSign = extractFloatx80Sign( a ); 5422 if ( aExp == 0x7FFF ) { 5423 if ( (uint64_t) ( aSig<<1 ) ) { 5424 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 5425 } 5426 return packFloat32( aSign, 0xFF, 0 ); 5427 } 5428 shift64RightJamming( aSig, 33, &aSig ); 5429 if ( aExp || aSig ) aExp -= 0x3F81; 5430 return roundAndPackFloat32(aSign, aExp, aSig, status); 5431 5432 } 5433 5434 /*---------------------------------------------------------------------------- 5435 | Returns the result of converting the extended double-precision floating- 5436 | point value `a' to the double-precision floating-point format. The 5437 | conversion is performed according to the IEC/IEEE Standard for Binary 5438 | Floating-Point Arithmetic. 
5439 *----------------------------------------------------------------------------*/ 5440 5441 float64 floatx80_to_float64(floatx80 a, float_status *status) 5442 { 5443 flag aSign; 5444 int32_t aExp; 5445 uint64_t aSig, zSig; 5446 5447 if (floatx80_invalid_encoding(a)) { 5448 float_raise(float_flag_invalid, status); 5449 return float64_default_nan(status); 5450 } 5451 aSig = extractFloatx80Frac( a ); 5452 aExp = extractFloatx80Exp( a ); 5453 aSign = extractFloatx80Sign( a ); 5454 if ( aExp == 0x7FFF ) { 5455 if ( (uint64_t) ( aSig<<1 ) ) { 5456 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 5457 } 5458 return packFloat64( aSign, 0x7FF, 0 ); 5459 } 5460 shift64RightJamming( aSig, 1, &zSig ); 5461 if ( aExp || aSig ) aExp -= 0x3C01; 5462 return roundAndPackFloat64(aSign, aExp, zSig, status); 5463 5464 } 5465 5466 /*---------------------------------------------------------------------------- 5467 | Returns the result of converting the extended double-precision floating- 5468 | point value `a' to the quadruple-precision floating-point format. The 5469 | conversion is performed according to the IEC/IEEE Standard for Binary 5470 | Floating-Point Arithmetic. 
5471 *----------------------------------------------------------------------------*/ 5472 5473 float128 floatx80_to_float128(floatx80 a, float_status *status) 5474 { 5475 flag aSign; 5476 int aExp; 5477 uint64_t aSig, zSig0, zSig1; 5478 5479 if (floatx80_invalid_encoding(a)) { 5480 float_raise(float_flag_invalid, status); 5481 return float128_default_nan(status); 5482 } 5483 aSig = extractFloatx80Frac( a ); 5484 aExp = extractFloatx80Exp( a ); 5485 aSign = extractFloatx80Sign( a ); 5486 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5487 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 5488 } 5489 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5490 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5491 5492 } 5493 5494 /*---------------------------------------------------------------------------- 5495 | Rounds the extended double-precision floating-point value `a' 5496 | to the precision provided by floatx80_rounding_precision and returns the 5497 | result as an extended double-precision floating-point value. 5498 | The operation is performed according to the IEC/IEEE Standard for Binary 5499 | Floating-Point Arithmetic. 5500 *----------------------------------------------------------------------------*/ 5501 5502 floatx80 floatx80_round(floatx80 a, float_status *status) 5503 { 5504 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5505 extractFloatx80Sign(a), 5506 extractFloatx80Exp(a), 5507 extractFloatx80Frac(a), 0, status); 5508 } 5509 5510 /*---------------------------------------------------------------------------- 5511 | Rounds the extended double-precision floating-point value `a' to an integer, 5512 | and returns the result as an extended quadruple-precision floating-point 5513 | value. The operation is performed according to the IEC/IEEE Standard for 5514 | Binary Floating-Point Arithmetic. 
5515 *----------------------------------------------------------------------------*/ 5516 5517 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 5518 { 5519 flag aSign; 5520 int32_t aExp; 5521 uint64_t lastBitMask, roundBitsMask; 5522 floatx80 z; 5523 5524 if (floatx80_invalid_encoding(a)) { 5525 float_raise(float_flag_invalid, status); 5526 return floatx80_default_nan(status); 5527 } 5528 aExp = extractFloatx80Exp( a ); 5529 if ( 0x403E <= aExp ) { 5530 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 5531 return propagateFloatx80NaN(a, a, status); 5532 } 5533 return a; 5534 } 5535 if ( aExp < 0x3FFF ) { 5536 if ( ( aExp == 0 ) 5537 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 5538 return a; 5539 } 5540 status->float_exception_flags |= float_flag_inexact; 5541 aSign = extractFloatx80Sign( a ); 5542 switch (status->float_rounding_mode) { 5543 case float_round_nearest_even: 5544 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 5545 ) { 5546 return 5547 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5548 } 5549 break; 5550 case float_round_ties_away: 5551 if (aExp == 0x3FFE) { 5552 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 5553 } 5554 break; 5555 case float_round_down: 5556 return 5557 aSign ? 5558 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 5559 : packFloatx80( 0, 0, 0 ); 5560 case float_round_up: 5561 return 5562 aSign ? 
packFloatx80( 1, 0, 0 ) 5563 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5564 } 5565 return packFloatx80( aSign, 0, 0 ); 5566 } 5567 lastBitMask = 1; 5568 lastBitMask <<= 0x403E - aExp; 5569 roundBitsMask = lastBitMask - 1; 5570 z = a; 5571 switch (status->float_rounding_mode) { 5572 case float_round_nearest_even: 5573 z.low += lastBitMask>>1; 5574 if ((z.low & roundBitsMask) == 0) { 5575 z.low &= ~lastBitMask; 5576 } 5577 break; 5578 case float_round_ties_away: 5579 z.low += lastBitMask >> 1; 5580 break; 5581 case float_round_to_zero: 5582 break; 5583 case float_round_up: 5584 if (!extractFloatx80Sign(z)) { 5585 z.low += roundBitsMask; 5586 } 5587 break; 5588 case float_round_down: 5589 if (extractFloatx80Sign(z)) { 5590 z.low += roundBitsMask; 5591 } 5592 break; 5593 default: 5594 abort(); 5595 } 5596 z.low &= ~ roundBitsMask; 5597 if ( z.low == 0 ) { 5598 ++z.high; 5599 z.low = LIT64( 0x8000000000000000 ); 5600 } 5601 if (z.low != a.low) { 5602 status->float_exception_flags |= float_flag_inexact; 5603 } 5604 return z; 5605 5606 } 5607 5608 /*---------------------------------------------------------------------------- 5609 | Returns the result of adding the absolute values of the extended double- 5610 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 5611 | negated before being returned. `zSign' is ignored if the result is a NaN. 5612 | The addition is performed according to the IEC/IEEE Standard for Binary 5613 | Floating-Point Arithmetic. 
5614 *----------------------------------------------------------------------------*/ 5615 5616 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5617 float_status *status) 5618 { 5619 int32_t aExp, bExp, zExp; 5620 uint64_t aSig, bSig, zSig0, zSig1; 5621 int32_t expDiff; 5622 5623 aSig = extractFloatx80Frac( a ); 5624 aExp = extractFloatx80Exp( a ); 5625 bSig = extractFloatx80Frac( b ); 5626 bExp = extractFloatx80Exp( b ); 5627 expDiff = aExp - bExp; 5628 if ( 0 < expDiff ) { 5629 if ( aExp == 0x7FFF ) { 5630 if ((uint64_t)(aSig << 1)) { 5631 return propagateFloatx80NaN(a, b, status); 5632 } 5633 return a; 5634 } 5635 if ( bExp == 0 ) --expDiff; 5636 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5637 zExp = aExp; 5638 } 5639 else if ( expDiff < 0 ) { 5640 if ( bExp == 0x7FFF ) { 5641 if ((uint64_t)(bSig << 1)) { 5642 return propagateFloatx80NaN(a, b, status); 5643 } 5644 return packFloatx80(zSign, 5645 floatx80_infinity_high, 5646 floatx80_infinity_low); 5647 } 5648 if ( aExp == 0 ) ++expDiff; 5649 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5650 zExp = bExp; 5651 } 5652 else { 5653 if ( aExp == 0x7FFF ) { 5654 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5655 return propagateFloatx80NaN(a, b, status); 5656 } 5657 return a; 5658 } 5659 zSig1 = 0; 5660 zSig0 = aSig + bSig; 5661 if ( aExp == 0 ) { 5662 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 5663 goto roundAndPack; 5664 } 5665 zExp = aExp; 5666 goto shiftRight1; 5667 } 5668 zSig0 = aSig + bSig; 5669 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 5670 shiftRight1: 5671 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5672 zSig0 |= LIT64( 0x8000000000000000 ); 5673 ++zExp; 5674 roundAndPack: 5675 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5676 zSign, zExp, zSig0, zSig1, status); 5677 } 5678 5679 /*---------------------------------------------------------------------------- 5680 | Returns the result of subtracting the absolute 
values of the extended 5681 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 5682 | difference is negated before being returned. `zSign' is ignored if the 5683 | result is a NaN. The subtraction is performed according to the IEC/IEEE 5684 | Standard for Binary Floating-Point Arithmetic. 5685 *----------------------------------------------------------------------------*/ 5686 5687 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5688 float_status *status) 5689 { 5690 int32_t aExp, bExp, zExp; 5691 uint64_t aSig, bSig, zSig0, zSig1; 5692 int32_t expDiff; 5693 5694 aSig = extractFloatx80Frac( a ); 5695 aExp = extractFloatx80Exp( a ); 5696 bSig = extractFloatx80Frac( b ); 5697 bExp = extractFloatx80Exp( b ); 5698 expDiff = aExp - bExp; 5699 if ( 0 < expDiff ) goto aExpBigger; 5700 if ( expDiff < 0 ) goto bExpBigger; 5701 if ( aExp == 0x7FFF ) { 5702 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5703 return propagateFloatx80NaN(a, b, status); 5704 } 5705 float_raise(float_flag_invalid, status); 5706 return floatx80_default_nan(status); 5707 } 5708 if ( aExp == 0 ) { 5709 aExp = 1; 5710 bExp = 1; 5711 } 5712 zSig1 = 0; 5713 if ( bSig < aSig ) goto aBigger; 5714 if ( aSig < bSig ) goto bBigger; 5715 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 5716 bExpBigger: 5717 if ( bExp == 0x7FFF ) { 5718 if ((uint64_t)(bSig << 1)) { 5719 return propagateFloatx80NaN(a, b, status); 5720 } 5721 return packFloatx80(zSign ^ 1, floatx80_infinity_high, 5722 floatx80_infinity_low); 5723 } 5724 if ( aExp == 0 ) ++expDiff; 5725 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5726 bBigger: 5727 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 5728 zExp = bExp; 5729 zSign ^= 1; 5730 goto normalizeRoundAndPack; 5731 aExpBigger: 5732 if ( aExp == 0x7FFF ) { 5733 if ((uint64_t)(aSig << 1)) { 5734 return propagateFloatx80NaN(a, b, status); 5735 } 5736 return a; 5737 } 5738 if ( bExp == 0 ) --expDiff; 5739 
shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5740 aBigger: 5741 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 5742 zExp = aExp; 5743 normalizeRoundAndPack: 5744 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 5745 zSign, zExp, zSig0, zSig1, status); 5746 } 5747 5748 /*---------------------------------------------------------------------------- 5749 | Returns the result of adding the extended double-precision floating-point 5750 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5751 | Standard for Binary Floating-Point Arithmetic. 5752 *----------------------------------------------------------------------------*/ 5753 5754 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 5755 { 5756 flag aSign, bSign; 5757 5758 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5759 float_raise(float_flag_invalid, status); 5760 return floatx80_default_nan(status); 5761 } 5762 aSign = extractFloatx80Sign( a ); 5763 bSign = extractFloatx80Sign( b ); 5764 if ( aSign == bSign ) { 5765 return addFloatx80Sigs(a, b, aSign, status); 5766 } 5767 else { 5768 return subFloatx80Sigs(a, b, aSign, status); 5769 } 5770 5771 } 5772 5773 /*---------------------------------------------------------------------------- 5774 | Returns the result of subtracting the extended double-precision floating- 5775 | point values `a' and `b'. The operation is performed according to the 5776 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5777 *----------------------------------------------------------------------------*/ 5778 5779 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5780 { 5781 flag aSign, bSign; 5782 5783 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5784 float_raise(float_flag_invalid, status); 5785 return floatx80_default_nan(status); 5786 } 5787 aSign = extractFloatx80Sign( a ); 5788 bSign = extractFloatx80Sign( b ); 5789 if ( aSign == bSign ) { 5790 return subFloatx80Sigs(a, b, aSign, status); 5791 } 5792 else { 5793 return addFloatx80Sigs(a, b, aSign, status); 5794 } 5795 5796 } 5797 5798 /*---------------------------------------------------------------------------- 5799 | Returns the result of multiplying the extended double-precision floating- 5800 | point values `a' and `b'. The operation is performed according to the 5801 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5802 *----------------------------------------------------------------------------*/ 5803 5804 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5805 { 5806 flag aSign, bSign, zSign; 5807 int32_t aExp, bExp, zExp; 5808 uint64_t aSig, bSig, zSig0, zSig1; 5809 5810 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5811 float_raise(float_flag_invalid, status); 5812 return floatx80_default_nan(status); 5813 } 5814 aSig = extractFloatx80Frac( a ); 5815 aExp = extractFloatx80Exp( a ); 5816 aSign = extractFloatx80Sign( a ); 5817 bSig = extractFloatx80Frac( b ); 5818 bExp = extractFloatx80Exp( b ); 5819 bSign = extractFloatx80Sign( b ); 5820 zSign = aSign ^ bSign; 5821 if ( aExp == 0x7FFF ) { 5822 if ( (uint64_t) ( aSig<<1 ) 5823 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5824 return propagateFloatx80NaN(a, b, status); 5825 } 5826 if ( ( bExp | bSig ) == 0 ) goto invalid; 5827 return packFloatx80(zSign, floatx80_infinity_high, 5828 floatx80_infinity_low); 5829 } 5830 if ( bExp == 0x7FFF ) { 5831 if ((uint64_t)(bSig << 
1)) { 5832 return propagateFloatx80NaN(a, b, status); 5833 } 5834 if ( ( aExp | aSig ) == 0 ) { 5835 invalid: 5836 float_raise(float_flag_invalid, status); 5837 return floatx80_default_nan(status); 5838 } 5839 return packFloatx80(zSign, floatx80_infinity_high, 5840 floatx80_infinity_low); 5841 } 5842 if ( aExp == 0 ) { 5843 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5844 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5845 } 5846 if ( bExp == 0 ) { 5847 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5848 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5849 } 5850 zExp = aExp + bExp - 0x3FFE; 5851 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 5852 if ( 0 < (int64_t) zSig0 ) { 5853 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5854 --zExp; 5855 } 5856 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5857 zSign, zExp, zSig0, zSig1, status); 5858 } 5859 5860 /*---------------------------------------------------------------------------- 5861 | Returns the result of dividing the extended double-precision floating-point 5862 | value `a' by the corresponding value `b'. The operation is performed 5863 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5864 *----------------------------------------------------------------------------*/ 5865 5866 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 5867 { 5868 flag aSign, bSign, zSign; 5869 int32_t aExp, bExp, zExp; 5870 uint64_t aSig, bSig, zSig0, zSig1; 5871 uint64_t rem0, rem1, rem2, term0, term1, term2; 5872 5873 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5874 float_raise(float_flag_invalid, status); 5875 return floatx80_default_nan(status); 5876 } 5877 aSig = extractFloatx80Frac( a ); 5878 aExp = extractFloatx80Exp( a ); 5879 aSign = extractFloatx80Sign( a ); 5880 bSig = extractFloatx80Frac( b ); 5881 bExp = extractFloatx80Exp( b ); 5882 bSign = extractFloatx80Sign( b ); 5883 zSign = aSign ^ bSign; 5884 if ( aExp == 0x7FFF ) { 5885 if ((uint64_t)(aSig << 1)) { 5886 return propagateFloatx80NaN(a, b, status); 5887 } 5888 if ( bExp == 0x7FFF ) { 5889 if ((uint64_t)(bSig << 1)) { 5890 return propagateFloatx80NaN(a, b, status); 5891 } 5892 goto invalid; 5893 } 5894 return packFloatx80(zSign, floatx80_infinity_high, 5895 floatx80_infinity_low); 5896 } 5897 if ( bExp == 0x7FFF ) { 5898 if ((uint64_t)(bSig << 1)) { 5899 return propagateFloatx80NaN(a, b, status); 5900 } 5901 return packFloatx80( zSign, 0, 0 ); 5902 } 5903 if ( bExp == 0 ) { 5904 if ( bSig == 0 ) { 5905 if ( ( aExp | aSig ) == 0 ) { 5906 invalid: 5907 float_raise(float_flag_invalid, status); 5908 return floatx80_default_nan(status); 5909 } 5910 float_raise(float_flag_divbyzero, status); 5911 return packFloatx80(zSign, floatx80_infinity_high, 5912 floatx80_infinity_low); 5913 } 5914 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5915 } 5916 if ( aExp == 0 ) { 5917 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5918 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5919 } 5920 zExp = aExp - bExp + 0x3FFE; 5921 rem1 = 0; 5922 if ( bSig <= aSig ) { 5923 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 5924 ++zExp; 5925 } 5926 zSig0 = estimateDiv128To64( aSig, 
rem1, bSig ); 5927 mul64To128( bSig, zSig0, &term0, &term1 ); 5928 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 5929 while ( (int64_t) rem0 < 0 ) { 5930 --zSig0; 5931 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 5932 } 5933 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 5934 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 5935 mul64To128( bSig, zSig1, &term1, &term2 ); 5936 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5937 while ( (int64_t) rem1 < 0 ) { 5938 --zSig1; 5939 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5940 } 5941 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5942 } 5943 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5944 zSign, zExp, zSig0, zSig1, status); 5945 } 5946 5947 /*---------------------------------------------------------------------------- 5948 | Returns the remainder of the extended double-precision floating-point value 5949 | `a' with respect to the corresponding value `b'. The operation is performed 5950 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5951 *----------------------------------------------------------------------------*/ 5952 5953 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 5954 { 5955 flag aSign, zSign; 5956 int32_t aExp, bExp, expDiff; 5957 uint64_t aSig0, aSig1, bSig; 5958 uint64_t q, term0, term1, alternateASig0, alternateASig1; 5959 5960 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5961 float_raise(float_flag_invalid, status); 5962 return floatx80_default_nan(status); 5963 } 5964 aSig0 = extractFloatx80Frac( a ); 5965 aExp = extractFloatx80Exp( a ); 5966 aSign = extractFloatx80Sign( a ); 5967 bSig = extractFloatx80Frac( b ); 5968 bExp = extractFloatx80Exp( b ); 5969 if ( aExp == 0x7FFF ) { 5970 if ( (uint64_t) ( aSig0<<1 ) 5971 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5972 return propagateFloatx80NaN(a, b, status); 5973 } 5974 goto invalid; 5975 } 5976 if ( bExp == 0x7FFF ) { 5977 if ((uint64_t)(bSig << 1)) { 5978 return propagateFloatx80NaN(a, b, status); 5979 } 5980 return a; 5981 } 5982 if ( bExp == 0 ) { 5983 if ( bSig == 0 ) { 5984 invalid: 5985 float_raise(float_flag_invalid, status); 5986 return floatx80_default_nan(status); 5987 } 5988 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5989 } 5990 if ( aExp == 0 ) { 5991 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5992 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5993 } 5994 bSig |= LIT64( 0x8000000000000000 ); 5995 zSign = aSign; 5996 expDiff = aExp - bExp; 5997 aSig1 = 0; 5998 if ( expDiff < 0 ) { 5999 if ( expDiff < -1 ) return a; 6000 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 6001 expDiff = 0; 6002 } 6003 q = ( bSig <= aSig0 ); 6004 if ( q ) aSig0 -= bSig; 6005 expDiff -= 64; 6006 while ( 0 < expDiff ) { 6007 q = estimateDiv128To64( aSig0, aSig1, bSig ); 6008 q = ( 2 < q ) ? 
q - 2 : 0; 6009 mul64To128( bSig, q, &term0, &term1 ); 6010 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6011 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 6012 expDiff -= 62; 6013 } 6014 expDiff += 64; 6015 if ( 0 < expDiff ) { 6016 q = estimateDiv128To64( aSig0, aSig1, bSig ); 6017 q = ( 2 < q ) ? q - 2 : 0; 6018 q >>= 64 - expDiff; 6019 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 6020 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6021 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 6022 while ( le128( term0, term1, aSig0, aSig1 ) ) { 6023 ++q; 6024 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6025 } 6026 } 6027 else { 6028 term1 = 0; 6029 term0 = bSig; 6030 } 6031 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 6032 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 6033 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 6034 && ( q & 1 ) ) 6035 ) { 6036 aSig0 = alternateASig0; 6037 aSig1 = alternateASig1; 6038 zSign = ! zSign; 6039 } 6040 return 6041 normalizeRoundAndPackFloatx80( 6042 80, zSign, bExp + expDiff, aSig0, aSig1, status); 6043 6044 } 6045 6046 /*---------------------------------------------------------------------------- 6047 | Returns the square root of the extended double-precision floating-point 6048 | value `a'. The operation is performed according to the IEC/IEEE Standard 6049 | for Binary Floating-Point Arithmetic. 
6050 *----------------------------------------------------------------------------*/ 6051 6052 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 6053 { 6054 flag aSign; 6055 int32_t aExp, zExp; 6056 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 6057 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6058 6059 if (floatx80_invalid_encoding(a)) { 6060 float_raise(float_flag_invalid, status); 6061 return floatx80_default_nan(status); 6062 } 6063 aSig0 = extractFloatx80Frac( a ); 6064 aExp = extractFloatx80Exp( a ); 6065 aSign = extractFloatx80Sign( a ); 6066 if ( aExp == 0x7FFF ) { 6067 if ((uint64_t)(aSig0 << 1)) { 6068 return propagateFloatx80NaN(a, a, status); 6069 } 6070 if ( ! aSign ) return a; 6071 goto invalid; 6072 } 6073 if ( aSign ) { 6074 if ( ( aExp | aSig0 ) == 0 ) return a; 6075 invalid: 6076 float_raise(float_flag_invalid, status); 6077 return floatx80_default_nan(status); 6078 } 6079 if ( aExp == 0 ) { 6080 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 6081 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6082 } 6083 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 6084 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 6085 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 6086 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6087 doubleZSig0 = zSig0<<1; 6088 mul64To128( zSig0, zSig0, &term0, &term1 ); 6089 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6090 while ( (int64_t) rem0 < 0 ) { 6091 --zSig0; 6092 doubleZSig0 -= 2; 6093 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6094 } 6095 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6096 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 6097 if ( zSig1 == 0 ) zSig1 = 1; 6098 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6099 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6100 mul64To128( zSig1, zSig1, &term2, &term3 ); 6101 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6102 while ( (int64_t) rem1 < 0 ) { 
6103 --zSig1; 6104 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6105 term3 |= 1; 6106 term2 |= doubleZSig0; 6107 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6108 } 6109 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6110 } 6111 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 6112 zSig0 |= doubleZSig0; 6113 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6114 0, zExp, zSig0, zSig1, status); 6115 } 6116 6117 /*---------------------------------------------------------------------------- 6118 | Returns 1 if the extended double-precision floating-point value `a' is equal 6119 | to the corresponding value `b', and 0 otherwise. The invalid exception is 6120 | raised if either operand is a NaN. Otherwise, the comparison is performed 6121 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6122 *----------------------------------------------------------------------------*/ 6123 6124 int floatx80_eq(floatx80 a, floatx80 b, float_status *status) 6125 { 6126 6127 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6128 || (extractFloatx80Exp(a) == 0x7FFF 6129 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6130 || (extractFloatx80Exp(b) == 0x7FFF 6131 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6132 ) { 6133 float_raise(float_flag_invalid, status); 6134 return 0; 6135 } 6136 return 6137 ( a.low == b.low ) 6138 && ( ( a.high == b.high ) 6139 || ( ( a.low == 0 ) 6140 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6141 ); 6142 6143 } 6144 6145 /*---------------------------------------------------------------------------- 6146 | Returns 1 if the extended double-precision floating-point value `a' is 6147 | less than or equal to the corresponding value `b', and 0 otherwise. The 6148 | invalid exception is raised if either operand is a NaN. The comparison is 6149 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6150 | Arithmetic. 
6151 *----------------------------------------------------------------------------*/ 6152 6153 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 6154 { 6155 flag aSign, bSign; 6156 6157 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6158 || (extractFloatx80Exp(a) == 0x7FFF 6159 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6160 || (extractFloatx80Exp(b) == 0x7FFF 6161 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6162 ) { 6163 float_raise(float_flag_invalid, status); 6164 return 0; 6165 } 6166 aSign = extractFloatx80Sign( a ); 6167 bSign = extractFloatx80Sign( b ); 6168 if ( aSign != bSign ) { 6169 return 6170 aSign 6171 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6172 == 0 ); 6173 } 6174 return 6175 aSign ? le128( b.high, b.low, a.high, a.low ) 6176 : le128( a.high, a.low, b.high, b.low ); 6177 6178 } 6179 6180 /*---------------------------------------------------------------------------- 6181 | Returns 1 if the extended double-precision floating-point value `a' is 6182 | less than the corresponding value `b', and 0 otherwise. The invalid 6183 | exception is raised if either operand is a NaN. The comparison is performed 6184 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6185 *----------------------------------------------------------------------------*/ 6186 6187 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 6188 { 6189 flag aSign, bSign; 6190 6191 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6192 || (extractFloatx80Exp(a) == 0x7FFF 6193 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6194 || (extractFloatx80Exp(b) == 0x7FFF 6195 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6196 ) { 6197 float_raise(float_flag_invalid, status); 6198 return 0; 6199 } 6200 aSign = extractFloatx80Sign( a ); 6201 bSign = extractFloatx80Sign( b ); 6202 if ( aSign != bSign ) { 6203 return 6204 aSign 6205 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6206 != 0 ); 6207 } 6208 return 6209 aSign ? lt128( b.high, b.low, a.high, a.low ) 6210 : lt128( a.high, a.low, b.high, b.low ); 6211 6212 } 6213 6214 /*---------------------------------------------------------------------------- 6215 | Returns 1 if the extended double-precision floating-point values `a' and `b' 6216 | cannot be compared, and 0 otherwise. The invalid exception is raised if 6217 | either operand is a NaN. The comparison is performed according to the 6218 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6219 *----------------------------------------------------------------------------*/ 6220 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 6221 { 6222 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6223 || (extractFloatx80Exp(a) == 0x7FFF 6224 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6225 || (extractFloatx80Exp(b) == 0x7FFF 6226 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6227 ) { 6228 float_raise(float_flag_invalid, status); 6229 return 1; 6230 } 6231 return 0; 6232 } 6233 6234 /*---------------------------------------------------------------------------- 6235 | Returns 1 if the extended double-precision floating-point value `a' is 6236 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 6237 | cause an exception. The comparison is performed according to the IEC/IEEE 6238 | Standard for Binary Floating-Point Arithmetic. 6239 *----------------------------------------------------------------------------*/ 6240 6241 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 6242 { 6243 6244 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6245 float_raise(float_flag_invalid, status); 6246 return 0; 6247 } 6248 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6249 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6250 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6251 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6252 ) { 6253 if (floatx80_is_signaling_nan(a, status) 6254 || floatx80_is_signaling_nan(b, status)) { 6255 float_raise(float_flag_invalid, status); 6256 } 6257 return 0; 6258 } 6259 return 6260 ( a.low == b.low ) 6261 && ( ( a.high == b.high ) 6262 || ( ( a.low == 0 ) 6263 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6264 ); 6265 6266 } 6267 6268 /*---------------------------------------------------------------------------- 6269 | Returns 1 if the extended double-precision floating-point value `a' is less 6270 | than or equal to the corresponding value `b', and 0 
otherwise. Quiet NaNs
| do not cause an exception. Otherwise, the comparison is performed according
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
{
    flag aSign, bSign;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return 0;
    }
    if (((extractFloatx80Exp(a) == 0x7FFF)
         && (uint64_t)(extractFloatx80Frac(a) << 1))
        || ((extractFloatx80Exp(b) == 0x7FFF)
            && (uint64_t)(extractFloatx80Frac(b) << 1))) {
        /* A NaN makes the comparison false; only a signaling one raises. */
        if (floatx80_is_signaling_nan(a, status)
            || floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return 0;
    }
    aSign = extractFloatx80Sign(a);
    bSign = extractFloatx80Sign(b);
    if (aSign != bSign) {
        /* Negative `a' is always <= `b'; otherwise both must be zero. */
        if (aSign) {
            return 1;
        }
        return (((uint16_t)((a.high | b.high) << 1)) | a.low | b.low) == 0;
    }
    /* Same sign: compare the raw encodings, swapped for negatives. */
    if (aSign) {
        return le128(b.high, b.low, a.high, a.low);
    }
    return le128(a.high, a.low, b.high, b.low);
}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is less
| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
| an exception. Otherwise, the comparison is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6313 *----------------------------------------------------------------------------*/ 6314 6315 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 6316 { 6317 flag aSign, bSign; 6318 6319 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6320 float_raise(float_flag_invalid, status); 6321 return 0; 6322 } 6323 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6324 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6325 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6326 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6327 ) { 6328 if (floatx80_is_signaling_nan(a, status) 6329 || floatx80_is_signaling_nan(b, status)) { 6330 float_raise(float_flag_invalid, status); 6331 } 6332 return 0; 6333 } 6334 aSign = extractFloatx80Sign( a ); 6335 bSign = extractFloatx80Sign( b ); 6336 if ( aSign != bSign ) { 6337 return 6338 aSign 6339 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6340 != 0 ); 6341 } 6342 return 6343 aSign ? lt128( b.high, b.low, a.high, a.low ) 6344 : lt128( a.high, a.low, b.high, b.low ); 6345 6346 } 6347 6348 /*---------------------------------------------------------------------------- 6349 | Returns 1 if the extended double-precision floating-point values `a' and `b' 6350 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 6351 | The comparison is performed according to the IEC/IEEE Standard for Binary 6352 | Floating-Point Arithmetic. 
6353 *----------------------------------------------------------------------------*/ 6354 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 6355 { 6356 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6357 float_raise(float_flag_invalid, status); 6358 return 1; 6359 } 6360 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6361 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6362 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6363 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6364 ) { 6365 if (floatx80_is_signaling_nan(a, status) 6366 || floatx80_is_signaling_nan(b, status)) { 6367 float_raise(float_flag_invalid, status); 6368 } 6369 return 1; 6370 } 6371 return 0; 6372 } 6373 6374 /*---------------------------------------------------------------------------- 6375 | Returns the result of converting the quadruple-precision floating-point 6376 | value `a' to the 32-bit two's complement integer format. The conversion 6377 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6378 | Arithmetic---which means in particular that the conversion is rounded 6379 | according to the current rounding mode. If `a' is a NaN, the largest 6380 | positive integer is returned. Otherwise, if the conversion overflows, the 6381 | largest integer with the same sign as `a' is returned. 
6382 *----------------------------------------------------------------------------*/ 6383 6384 int32_t float128_to_int32(float128 a, float_status *status) 6385 { 6386 flag aSign; 6387 int32_t aExp, shiftCount; 6388 uint64_t aSig0, aSig1; 6389 6390 aSig1 = extractFloat128Frac1( a ); 6391 aSig0 = extractFloat128Frac0( a ); 6392 aExp = extractFloat128Exp( a ); 6393 aSign = extractFloat128Sign( a ); 6394 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 6395 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6396 aSig0 |= ( aSig1 != 0 ); 6397 shiftCount = 0x4028 - aExp; 6398 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 6399 return roundAndPackInt32(aSign, aSig0, status); 6400 6401 } 6402 6403 /*---------------------------------------------------------------------------- 6404 | Returns the result of converting the quadruple-precision floating-point 6405 | value `a' to the 32-bit two's complement integer format. The conversion 6406 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6407 | Arithmetic, except that the conversion is always rounded toward zero. If 6408 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 6409 | conversion overflows, the largest integer with the same sign as `a' is 6410 | returned. 
6411 *----------------------------------------------------------------------------*/ 6412 6413 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 6414 { 6415 flag aSign; 6416 int32_t aExp, shiftCount; 6417 uint64_t aSig0, aSig1, savedASig; 6418 int32_t z; 6419 6420 aSig1 = extractFloat128Frac1( a ); 6421 aSig0 = extractFloat128Frac0( a ); 6422 aExp = extractFloat128Exp( a ); 6423 aSign = extractFloat128Sign( a ); 6424 aSig0 |= ( aSig1 != 0 ); 6425 if ( 0x401E < aExp ) { 6426 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 6427 goto invalid; 6428 } 6429 else if ( aExp < 0x3FFF ) { 6430 if (aExp || aSig0) { 6431 status->float_exception_flags |= float_flag_inexact; 6432 } 6433 return 0; 6434 } 6435 aSig0 |= LIT64( 0x0001000000000000 ); 6436 shiftCount = 0x402F - aExp; 6437 savedASig = aSig0; 6438 aSig0 >>= shiftCount; 6439 z = aSig0; 6440 if ( aSign ) z = - z; 6441 if ( ( z < 0 ) ^ aSign ) { 6442 invalid: 6443 float_raise(float_flag_invalid, status); 6444 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 6445 } 6446 if ( ( aSig0<<shiftCount ) != savedASig ) { 6447 status->float_exception_flags |= float_flag_inexact; 6448 } 6449 return z; 6450 6451 } 6452 6453 /*---------------------------------------------------------------------------- 6454 | Returns the result of converting the quadruple-precision floating-point 6455 | value `a' to the 64-bit two's complement integer format. The conversion 6456 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6457 | Arithmetic---which means in particular that the conversion is rounded 6458 | according to the current rounding mode. If `a' is a NaN, the largest 6459 | positive integer is returned. Otherwise, if the conversion overflows, the 6460 | largest integer with the same sign as `a' is returned. 
6461 *----------------------------------------------------------------------------*/ 6462 6463 int64_t float128_to_int64(float128 a, float_status *status) 6464 { 6465 flag aSign; 6466 int32_t aExp, shiftCount; 6467 uint64_t aSig0, aSig1; 6468 6469 aSig1 = extractFloat128Frac1( a ); 6470 aSig0 = extractFloat128Frac0( a ); 6471 aExp = extractFloat128Exp( a ); 6472 aSign = extractFloat128Sign( a ); 6473 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6474 shiftCount = 0x402F - aExp; 6475 if ( shiftCount <= 0 ) { 6476 if ( 0x403E < aExp ) { 6477 float_raise(float_flag_invalid, status); 6478 if ( ! aSign 6479 || ( ( aExp == 0x7FFF ) 6480 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 6481 ) 6482 ) { 6483 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6484 } 6485 return (int64_t) LIT64( 0x8000000000000000 ); 6486 } 6487 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 6488 } 6489 else { 6490 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 6491 } 6492 return roundAndPackInt64(aSign, aSig0, aSig1, status); 6493 6494 } 6495 6496 /*---------------------------------------------------------------------------- 6497 | Returns the result of converting the quadruple-precision floating-point 6498 | value `a' to the 64-bit two's complement integer format. The conversion 6499 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6500 | Arithmetic, except that the conversion is always rounded toward zero. 6501 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 6502 | the conversion overflows, the largest integer with the same sign as `a' is 6503 | returned. 
6504 *----------------------------------------------------------------------------*/ 6505 6506 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 6507 { 6508 flag aSign; 6509 int32_t aExp, shiftCount; 6510 uint64_t aSig0, aSig1; 6511 int64_t z; 6512 6513 aSig1 = extractFloat128Frac1( a ); 6514 aSig0 = extractFloat128Frac0( a ); 6515 aExp = extractFloat128Exp( a ); 6516 aSign = extractFloat128Sign( a ); 6517 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6518 shiftCount = aExp - 0x402F; 6519 if ( 0 < shiftCount ) { 6520 if ( 0x403E <= aExp ) { 6521 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 6522 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 6523 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 6524 if (aSig1) { 6525 status->float_exception_flags |= float_flag_inexact; 6526 } 6527 } 6528 else { 6529 float_raise(float_flag_invalid, status); 6530 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 6531 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6532 } 6533 } 6534 return (int64_t) LIT64( 0x8000000000000000 ); 6535 } 6536 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 6537 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 6538 status->float_exception_flags |= float_flag_inexact; 6539 } 6540 } 6541 else { 6542 if ( aExp < 0x3FFF ) { 6543 if ( aExp | aSig0 | aSig1 ) { 6544 status->float_exception_flags |= float_flag_inexact; 6545 } 6546 return 0; 6547 } 6548 z = aSig0>>( - shiftCount ); 6549 if ( aSig1 6550 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 6551 status->float_exception_flags |= float_flag_inexact; 6552 } 6553 } 6554 if ( aSign ) z = - z; 6555 return z; 6556 6557 } 6558 6559 /*---------------------------------------------------------------------------- 6560 | Returns the result of converting the quadruple-precision floating-point value 6561 | `a' to the 64-bit unsigned integer format. 
The conversion is 6562 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6563 | Arithmetic---which means in particular that the conversion is rounded 6564 | according to the current rounding mode. If `a' is a NaN, the largest 6565 | positive integer is returned. If the conversion overflows, the 6566 | largest unsigned integer is returned. If 'a' is negative, the value is 6567 | rounded and zero is returned; negative values that do not round to zero 6568 | will raise the inexact exception. 6569 *----------------------------------------------------------------------------*/ 6570 6571 uint64_t float128_to_uint64(float128 a, float_status *status) 6572 { 6573 flag aSign; 6574 int aExp; 6575 int shiftCount; 6576 uint64_t aSig0, aSig1; 6577 6578 aSig0 = extractFloat128Frac0(a); 6579 aSig1 = extractFloat128Frac1(a); 6580 aExp = extractFloat128Exp(a); 6581 aSign = extractFloat128Sign(a); 6582 if (aSign && (aExp > 0x3FFE)) { 6583 float_raise(float_flag_invalid, status); 6584 if (float128_is_any_nan(a)) { 6585 return LIT64(0xFFFFFFFFFFFFFFFF); 6586 } else { 6587 return 0; 6588 } 6589 } 6590 if (aExp) { 6591 aSig0 |= LIT64(0x0001000000000000); 6592 } 6593 shiftCount = 0x402F - aExp; 6594 if (shiftCount <= 0) { 6595 if (0x403E < aExp) { 6596 float_raise(float_flag_invalid, status); 6597 return LIT64(0xFFFFFFFFFFFFFFFF); 6598 } 6599 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 6600 } else { 6601 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 6602 } 6603 return roundAndPackUint64(aSign, aSig0, aSig1, status); 6604 } 6605 6606 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 6607 { 6608 uint64_t v; 6609 signed char current_rounding_mode = status->float_rounding_mode; 6610 6611 set_float_rounding_mode(float_round_to_zero, status); 6612 v = float128_to_uint64(a, status); 6613 set_float_rounding_mode(current_rounding_mode, status); 6614 6615 return v; 6616 } 6617 6618 
/*---------------------------------------------------------------------------- 6619 | Returns the result of converting the quadruple-precision floating-point 6620 | value `a' to the 32-bit unsigned integer format. The conversion 6621 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6622 | Arithmetic except that the conversion is always rounded toward zero. 6623 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6624 | if the conversion overflows, the largest unsigned integer is returned. 6625 | If 'a' is negative, the value is rounded and zero is returned; negative 6626 | values that do not round to zero will raise the inexact exception. 6627 *----------------------------------------------------------------------------*/ 6628 6629 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6630 { 6631 uint64_t v; 6632 uint32_t res; 6633 int old_exc_flags = get_float_exception_flags(status); 6634 6635 v = float128_to_uint64_round_to_zero(a, status); 6636 if (v > 0xffffffff) { 6637 res = 0xffffffff; 6638 } else { 6639 return v; 6640 } 6641 set_float_exception_flags(old_exc_flags, status); 6642 float_raise(float_flag_invalid, status); 6643 return res; 6644 } 6645 6646 /*---------------------------------------------------------------------------- 6647 | Returns the result of converting the quadruple-precision floating-point 6648 | value `a' to the single-precision floating-point format. The conversion 6649 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6650 | Arithmetic. 
6651 *----------------------------------------------------------------------------*/ 6652 6653 float32 float128_to_float32(float128 a, float_status *status) 6654 { 6655 flag aSign; 6656 int32_t aExp; 6657 uint64_t aSig0, aSig1; 6658 uint32_t zSig; 6659 6660 aSig1 = extractFloat128Frac1( a ); 6661 aSig0 = extractFloat128Frac0( a ); 6662 aExp = extractFloat128Exp( a ); 6663 aSign = extractFloat128Sign( a ); 6664 if ( aExp == 0x7FFF ) { 6665 if ( aSig0 | aSig1 ) { 6666 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6667 } 6668 return packFloat32( aSign, 0xFF, 0 ); 6669 } 6670 aSig0 |= ( aSig1 != 0 ); 6671 shift64RightJamming( aSig0, 18, &aSig0 ); 6672 zSig = aSig0; 6673 if ( aExp || zSig ) { 6674 zSig |= 0x40000000; 6675 aExp -= 0x3F81; 6676 } 6677 return roundAndPackFloat32(aSign, aExp, zSig, status); 6678 6679 } 6680 6681 /*---------------------------------------------------------------------------- 6682 | Returns the result of converting the quadruple-precision floating-point 6683 | value `a' to the double-precision floating-point format. The conversion 6684 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6685 | Arithmetic. 
6686 *----------------------------------------------------------------------------*/ 6687 6688 float64 float128_to_float64(float128 a, float_status *status) 6689 { 6690 flag aSign; 6691 int32_t aExp; 6692 uint64_t aSig0, aSig1; 6693 6694 aSig1 = extractFloat128Frac1( a ); 6695 aSig0 = extractFloat128Frac0( a ); 6696 aExp = extractFloat128Exp( a ); 6697 aSign = extractFloat128Sign( a ); 6698 if ( aExp == 0x7FFF ) { 6699 if ( aSig0 | aSig1 ) { 6700 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6701 } 6702 return packFloat64( aSign, 0x7FF, 0 ); 6703 } 6704 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6705 aSig0 |= ( aSig1 != 0 ); 6706 if ( aExp || aSig0 ) { 6707 aSig0 |= LIT64( 0x4000000000000000 ); 6708 aExp -= 0x3C01; 6709 } 6710 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6711 6712 } 6713 6714 /*---------------------------------------------------------------------------- 6715 | Returns the result of converting the quadruple-precision floating-point 6716 | value `a' to the extended double-precision floating-point format. The 6717 | conversion is performed according to the IEC/IEEE Standard for Binary 6718 | Floating-Point Arithmetic. 
6719 *----------------------------------------------------------------------------*/ 6720 6721 floatx80 float128_to_floatx80(float128 a, float_status *status) 6722 { 6723 flag aSign; 6724 int32_t aExp; 6725 uint64_t aSig0, aSig1; 6726 6727 aSig1 = extractFloat128Frac1( a ); 6728 aSig0 = extractFloat128Frac0( a ); 6729 aExp = extractFloat128Exp( a ); 6730 aSign = extractFloat128Sign( a ); 6731 if ( aExp == 0x7FFF ) { 6732 if ( aSig0 | aSig1 ) { 6733 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 6734 } 6735 return packFloatx80(aSign, floatx80_infinity_high, 6736 floatx80_infinity_low); 6737 } 6738 if ( aExp == 0 ) { 6739 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6740 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6741 } 6742 else { 6743 aSig0 |= LIT64( 0x0001000000000000 ); 6744 } 6745 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6746 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6747 6748 } 6749 6750 /*---------------------------------------------------------------------------- 6751 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6752 | returns the result as a quadruple-precision floating-point value. The 6753 | operation is performed according to the IEC/IEEE Standard for Binary 6754 | Floating-Point Arithmetic. 
6755 *----------------------------------------------------------------------------*/ 6756 6757 float128 float128_round_to_int(float128 a, float_status *status) 6758 { 6759 flag aSign; 6760 int32_t aExp; 6761 uint64_t lastBitMask, roundBitsMask; 6762 float128 z; 6763 6764 aExp = extractFloat128Exp( a ); 6765 if ( 0x402F <= aExp ) { 6766 if ( 0x406F <= aExp ) { 6767 if ( ( aExp == 0x7FFF ) 6768 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 6769 ) { 6770 return propagateFloat128NaN(a, a, status); 6771 } 6772 return a; 6773 } 6774 lastBitMask = 1; 6775 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 6776 roundBitsMask = lastBitMask - 1; 6777 z = a; 6778 switch (status->float_rounding_mode) { 6779 case float_round_nearest_even: 6780 if ( lastBitMask ) { 6781 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 6782 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 6783 } 6784 else { 6785 if ( (int64_t) z.low < 0 ) { 6786 ++z.high; 6787 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 6788 } 6789 } 6790 break; 6791 case float_round_ties_away: 6792 if (lastBitMask) { 6793 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 6794 } else { 6795 if ((int64_t) z.low < 0) { 6796 ++z.high; 6797 } 6798 } 6799 break; 6800 case float_round_to_zero: 6801 break; 6802 case float_round_up: 6803 if (!extractFloat128Sign(z)) { 6804 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6805 } 6806 break; 6807 case float_round_down: 6808 if (extractFloat128Sign(z)) { 6809 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6810 } 6811 break; 6812 default: 6813 abort(); 6814 } 6815 z.low &= ~ roundBitsMask; 6816 } 6817 else { 6818 if ( aExp < 0x3FFF ) { 6819 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 6820 status->float_exception_flags |= float_flag_inexact; 6821 aSign = extractFloat128Sign( a ); 6822 switch (status->float_rounding_mode) { 6823 case float_round_nearest_even: 6824 if ( ( aExp == 0x3FFE ) 6825 && ( 
extractFloat128Frac0( a ) 6826 | extractFloat128Frac1( a ) ) 6827 ) { 6828 return packFloat128( aSign, 0x3FFF, 0, 0 ); 6829 } 6830 break; 6831 case float_round_ties_away: 6832 if (aExp == 0x3FFE) { 6833 return packFloat128(aSign, 0x3FFF, 0, 0); 6834 } 6835 break; 6836 case float_round_down: 6837 return 6838 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 6839 : packFloat128( 0, 0, 0, 0 ); 6840 case float_round_up: 6841 return 6842 aSign ? packFloat128( 1, 0, 0, 0 ) 6843 : packFloat128( 0, 0x3FFF, 0, 0 ); 6844 } 6845 return packFloat128( aSign, 0, 0, 0 ); 6846 } 6847 lastBitMask = 1; 6848 lastBitMask <<= 0x402F - aExp; 6849 roundBitsMask = lastBitMask - 1; 6850 z.low = 0; 6851 z.high = a.high; 6852 switch (status->float_rounding_mode) { 6853 case float_round_nearest_even: 6854 z.high += lastBitMask>>1; 6855 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 6856 z.high &= ~ lastBitMask; 6857 } 6858 break; 6859 case float_round_ties_away: 6860 z.high += lastBitMask>>1; 6861 break; 6862 case float_round_to_zero: 6863 break; 6864 case float_round_up: 6865 if (!extractFloat128Sign(z)) { 6866 z.high |= ( a.low != 0 ); 6867 z.high += roundBitsMask; 6868 } 6869 break; 6870 case float_round_down: 6871 if (extractFloat128Sign(z)) { 6872 z.high |= (a.low != 0); 6873 z.high += roundBitsMask; 6874 } 6875 break; 6876 default: 6877 abort(); 6878 } 6879 z.high &= ~ roundBitsMask; 6880 } 6881 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 6882 status->float_exception_flags |= float_flag_inexact; 6883 } 6884 return z; 6885 6886 } 6887 6888 /*---------------------------------------------------------------------------- 6889 | Returns the result of adding the absolute values of the quadruple-precision 6890 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 6891 | before being returned. `zSign' is ignored if the result is a NaN. 6892 | The addition is performed according to the IEC/IEEE Standard for Binary 6893 | Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: b is shifted right to line up with it. */
        if ( aExp == 0x7FFF ) {
            if (aSig0 | aSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) {
            /* Zero exponent field: bit 48 is not set and the shift count
             * is one less. */
            --expDiff;
        }
        else {
            bSig0 |= LIT64( 0x0001000000000000 );
        }
        /* zSig2 receives the third output word of the shift. */
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent: mirror image of the branch above. */
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* b is not a NaN here; the result takes zSign, not b's sign. */
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig0 |= LIT64( 0x0001000000000000 );
        }
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment shift is needed. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) {
            /* Both exponent fields are zero: the raw sum is packed with a
             * zero exponent, or replaced by a signed zero when
             * flush_to_zero is set (raising output_denormal if the sum was
             * non-zero). */
            if (status->flush_to_zero) {
                if (zSig0 | zSig1) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat128(zSign, 0, 0, 0);
            }
            return packFloat128( zSign, 0, zSig0, zSig1 );
        }
        zSig2 = 0;
        /* Bit 49 is set directly (presumably the sum of the two implicit
         * leading bits), so the one-bit right shift is always taken. */
        zSig0 |= LIT64( 0x0002000000000000 );
        zExp = aExp;
        goto shiftRight1;
    }
    /* Reached only from the two unequal-exponent branches.  Bit 48 is ORed
     * into aSig0 whichever operand was shifted. */
    aSig0 |= LIT64( 0x0001000000000000 );
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    --zExp;
    /* Sum stayed below bit 49: keep the decremented exponent, skip the
     * right shift. */
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the quadruple-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    /* Both fractions are pre-shifted left by 14 bits, so the leading-bit
     * constant used below is bit 62 (0x4000000000000000). */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here on. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Neither operand is a NaN and both have the maximum exponent:
         * raise invalid and return the default NaN. */
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* Zero exponent fields are treated as exponent 1. */
        aExp = 1;
        bExp = 1;
    }
    /* Compare the 128-bit fractions, high word first, to find the larger. */
    if ( bSig0 < aSig0 ) goto aBigger;
    if ( aSig0 < bSig0 ) goto bBigger;
    if ( bSig1 < aSig1 ) goto aBigger;
    if ( aSig1 < bSig1 ) goto bBigger;
    /* Exact cancellation: the zero is negative only under round-down. */
    return packFloat128(status->float_rounding_mode == float_round_down,
                        0, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* b is not a NaN here; the result takes the inverse of zSign. */
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        ++expDiff;
    }
    else {
        aSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
    bSig0 |= LIT64( 0x4000000000000000 );
 bBigger:
    /* |b| > |a|: compute b - a and flip the result sign. */
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
    aSig0 |= LIT64( 0x4000000000000000 );
 aBigger:
    /* |a| > |b|: compute a - b; the result sign stays zSign. */
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    /* The exponent passed on is reduced by a further 14, matching the
     * 14-bit pre-shift applied to the fractions above. */
    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
                                         status);

}

/*----------------------------------------------------------------------------
| Returns the result of adding the quadruple-precision floating-point values
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
7071 *----------------------------------------------------------------------------*/ 7072 7073 float128 float128_add(float128 a, float128 b, float_status *status) 7074 { 7075 flag aSign, bSign; 7076 7077 aSign = extractFloat128Sign( a ); 7078 bSign = extractFloat128Sign( b ); 7079 if ( aSign == bSign ) { 7080 return addFloat128Sigs(a, b, aSign, status); 7081 } 7082 else { 7083 return subFloat128Sigs(a, b, aSign, status); 7084 } 7085 7086 } 7087 7088 /*---------------------------------------------------------------------------- 7089 | Returns the result of subtracting the quadruple-precision floating-point 7090 | values `a' and `b'. The operation is performed according to the IEC/IEEE 7091 | Standard for Binary Floating-Point Arithmetic. 7092 *----------------------------------------------------------------------------*/ 7093 7094 float128 float128_sub(float128 a, float128 b, float_status *status) 7095 { 7096 flag aSign, bSign; 7097 7098 aSign = extractFloat128Sign( a ); 7099 bSign = extractFloat128Sign( b ); 7100 if ( aSign == bSign ) { 7101 return subFloat128Sigs(a, b, aSign, status); 7102 } 7103 else { 7104 return addFloat128Sigs(a, b, aSign, status); 7105 } 7106 7107 } 7108 7109 /*---------------------------------------------------------------------------- 7110 | Returns the result of multiplying the quadruple-precision floating-point 7111 | values `a' and `b'. The operation is performed according to the IEC/IEEE 7112 | Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float128 float128_mul(float128 a, float128 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    /* The result sign is the XOR of the operand signs. */
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* a is Inf or NaN.  Propagate if either operand is a NaN. */
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* a has the maximum exponent and b is all-zero: invalid. */
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
            /* Shared target for both all-zero-times-maximum-exponent
             * cases. */
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        /* Zero exponent field: a signed zero result, or normalize first. */
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    zExp = aExp + bExp - 0x4000;
    /* Only a gets bit 48 set; b is shifted left 16 without it. */
    aSig0 |= LIT64( 0x0001000000000000 );
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
    /* aSig is added into the upper 128 bits of the product; presumably this
     * supplies the contribution of b's leading bit that was left out. */
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
    /* Fold the lowest product word into zSig2 as a sticky bit. */
    zSig2 |= ( zSig3 != 0 );
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
        /* Bit 49 or above is set: shift right one and bump the exponent. */
        shift128ExtraRightJamming(
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
        ++zExp;
    }
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'.  The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_div(float128 a, float128 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* Both operands have the maximum exponent and neither is a
             * NaN: invalid. */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* b has the maximum exponent and is not a NaN: signed zero. */
        return packFloat128( zSign, 0, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* Divisor is zero. */
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
                /* Zero dividend as well: invalid (also the target of the
                 * goto above). */
 invalid:
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* Set bit 48 on both significands and shift them left by 15. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        /* bSig <= aSig: halve aSig and compensate in the exponent
         * (presumably so the 64-bit quotient estimate cannot overflow). */
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* Upper 64 quotient bits: estimate, form the remainder, then step the
     * estimate down until the remainder is non-negative. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Lower 64 quotient bits.  The exact correction is done only when the
     * low 14 bits of the estimate are at most 4. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        /* A non-zero final remainder sets the sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Shift the quotient right by 15; the third output word is zSig2. */
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_rem(float128 a, float128 b, float_status *status)
{
    flag aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* a has the maximum exponent and no NaN is involved: invalid. */
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* b has the maximum exponent and is not a NaN: a is returned
         * unchanged. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* b is zero: invalid (also the target of the goto above). */
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* a is returned unchanged when its exponent is at least two below b's
     * (presumably because |a| < |b|/2 makes a its own remainder). */
    if ( expDiff < -1 ) return a;
    /* Set bit 48 on both significands; b is shifted left 15, a by 15 or,
     * when expDiff is -1, by 14. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    /* First quotient bit: 1 if bSig <= aSig, in which case subtract. */
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /* Reduce the exponent gap 61 bits per pass.  Each pass uses a quotient
     * estimate lowered by 4 (clamped at 0). */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /* Final partial step: the estimate is shifted right by the
         * remaining -expDiff bits. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        /* No further quotient bits: just move both values right by 12. */
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Subtract b until the remainder turns negative.  The last non-negative
     * value stays in alternateASig; q counts the subtractions. */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* sigMean is the sum of the two candidates.  The non-negative candidate
     * is chosen when the sum is negative, or when the sum is zero and q is
     * odd. */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1,
        (uint64_t *)&sigMean0, &sigMean1 );
    if (    ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    /* A negative remainder is negated and its sign folded into the result
     * sign. */
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}

/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_sqrt(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        /* Maximum exponent, not a NaN: returned as is when the sign is
         * clear, invalid when it is set. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* Sign set: an all-zero magnitude is returned unchanged; any other
         * value is invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Result exponent: half of the unbiased input exponent, rebiased. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= LIT64( 0x0001000000000000 );
    /* Start from a 32-bit root estimate, then refine it to 64 bits with one
     * division step. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    /* The left shift is 13 for an even exponent field, 12 for an odd one. */
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* rem = aSig - zSig0^2; step zSig0 down while the remainder is
     * negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Lower 64 root bits.  The exact correction is done only when the low
     * 13 bits of the estimate are at most 5. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* A non-zero final remainder sets the sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Shift the root right by 14; the third output word is zSig2.  The
     * result sign is always 0. */
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
| the corresponding value `b', and 0 otherwise.  The invalid exception is
| raised if either operand is a NaN.  Otherwise, the comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int float128_eq(float128 a, float128 b, float_status *status)
{

    /* Either operand with the maximum exponent and a non-zero fraction is a
     * NaN: raise invalid and report "not equal". */
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
       ) {
        float_raise(float_flag_invalid, status);
        return 0;
    }
    /* Equal when the bit patterns match, or when both values are zero
     * apart from the sign bit (the <<1 discards it). */
    return
           ( a.low == b.low )
        && (    ( a.high == b.high )
             || (    ( a.low == 0 )
                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
           );

}

/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is less than
| or equal to the corresponding value `b', and 0 otherwise.  The invalid
| exception is raised if either operand is a NaN.  The comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7466 *----------------------------------------------------------------------------*/ 7467 7468 int float128_le(float128 a, float128 b, float_status *status) 7469 { 7470 flag aSign, bSign; 7471 7472 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7473 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7474 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7475 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7476 ) { 7477 float_raise(float_flag_invalid, status); 7478 return 0; 7479 } 7480 aSign = extractFloat128Sign( a ); 7481 bSign = extractFloat128Sign( b ); 7482 if ( aSign != bSign ) { 7483 return 7484 aSign 7485 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7486 == 0 ); 7487 } 7488 return 7489 aSign ? le128( b.high, b.low, a.high, a.low ) 7490 : le128( a.high, a.low, b.high, b.low ); 7491 7492 } 7493 7494 /*---------------------------------------------------------------------------- 7495 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7496 | the corresponding value `b', and 0 otherwise. The invalid exception is 7497 | raised if either operand is a NaN. The comparison is performed according 7498 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7499 *----------------------------------------------------------------------------*/ 7500 7501 int float128_lt(float128 a, float128 b, float_status *status) 7502 { 7503 flag aSign, bSign; 7504 7505 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7506 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7507 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7508 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7509 ) { 7510 float_raise(float_flag_invalid, status); 7511 return 0; 7512 } 7513 aSign = extractFloat128Sign( a ); 7514 bSign = extractFloat128Sign( b ); 7515 if ( aSign != bSign ) { 7516 return 7517 aSign 7518 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7519 != 0 ); 7520 } 7521 return 7522 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 7523 : lt128( a.high, a.low, b.high, b.low ); 7524 7525 } 7526 7527 /*---------------------------------------------------------------------------- 7528 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7529 | be compared, and 0 otherwise. The invalid exception is raised if either 7530 | operand is a NaN. The comparison is performed according to the IEC/IEEE 7531 | Standard for Binary Floating-Point Arithmetic. 7532 *----------------------------------------------------------------------------*/ 7533 7534 int float128_unordered(float128 a, float128 b, float_status *status) 7535 { 7536 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7537 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7538 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7539 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7540 ) { 7541 float_raise(float_flag_invalid, status); 7542 return 1; 7543 } 7544 return 0; 7545 } 7546 7547 /*---------------------------------------------------------------------------- 7548 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 7549 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7550 | exception. The comparison is performed according to the IEC/IEEE Standard 7551 | for Binary Floating-Point Arithmetic. 
7552 *----------------------------------------------------------------------------*/ 7553 7554 int float128_eq_quiet(float128 a, float128 b, float_status *status) 7555 { 7556 7557 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7558 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7559 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7560 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7561 ) { 7562 if (float128_is_signaling_nan(a, status) 7563 || float128_is_signaling_nan(b, status)) { 7564 float_raise(float_flag_invalid, status); 7565 } 7566 return 0; 7567 } 7568 return 7569 ( a.low == b.low ) 7570 && ( ( a.high == b.high ) 7571 || ( ( a.low == 0 ) 7572 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 7573 ); 7574 7575 } 7576 7577 /*---------------------------------------------------------------------------- 7578 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7579 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 7580 | cause an exception. Otherwise, the comparison is performed according to the 7581 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7582 *----------------------------------------------------------------------------*/ 7583 7584 int float128_le_quiet(float128 a, float128 b, float_status *status) 7585 { 7586 flag aSign, bSign; 7587 7588 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7589 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7590 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7591 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7592 ) { 7593 if (float128_is_signaling_nan(a, status) 7594 || float128_is_signaling_nan(b, status)) { 7595 float_raise(float_flag_invalid, status); 7596 } 7597 return 0; 7598 } 7599 aSign = extractFloat128Sign( a ); 7600 bSign = extractFloat128Sign( b ); 7601 if ( aSign != bSign ) { 7602 return 7603 aSign 7604 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7605 == 0 ); 7606 } 7607 return 7608 aSign ? le128( b.high, b.low, a.high, a.low ) 7609 : le128( a.high, a.low, b.high, b.low ); 7610 7611 } 7612 7613 /*---------------------------------------------------------------------------- 7614 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7615 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7616 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 7617 | Standard for Binary Floating-Point Arithmetic. 
7618 *----------------------------------------------------------------------------*/ 7619 7620 int float128_lt_quiet(float128 a, float128 b, float_status *status) 7621 { 7622 flag aSign, bSign; 7623 7624 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7625 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7626 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7627 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7628 ) { 7629 if (float128_is_signaling_nan(a, status) 7630 || float128_is_signaling_nan(b, status)) { 7631 float_raise(float_flag_invalid, status); 7632 } 7633 return 0; 7634 } 7635 aSign = extractFloat128Sign( a ); 7636 bSign = extractFloat128Sign( b ); 7637 if ( aSign != bSign ) { 7638 return 7639 aSign 7640 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7641 != 0 ); 7642 } 7643 return 7644 aSign ? lt128( b.high, b.low, a.high, a.low ) 7645 : lt128( a.high, a.low, b.high, b.low ); 7646 7647 } 7648 7649 /*---------------------------------------------------------------------------- 7650 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7651 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 7652 | comparison is performed according to the IEC/IEEE Standard for Binary 7653 | Floating-Point Arithmetic. 
7654 *----------------------------------------------------------------------------*/ 7655 7656 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 7657 { 7658 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7659 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7660 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7661 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7662 ) { 7663 if (float128_is_signaling_nan(a, status) 7664 || float128_is_signaling_nan(b, status)) { 7665 float_raise(float_flag_invalid, status); 7666 } 7667 return 1; 7668 } 7669 return 0; 7670 } 7671 7672 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 7673 int is_quiet, float_status *status) 7674 { 7675 flag aSign, bSign; 7676 7677 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 7678 float_raise(float_flag_invalid, status); 7679 return float_relation_unordered; 7680 } 7681 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 7682 ( extractFloatx80Frac( a )<<1 ) ) || 7683 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 7684 ( extractFloatx80Frac( b )<<1 ) )) { 7685 if (!is_quiet || 7686 floatx80_is_signaling_nan(a, status) || 7687 floatx80_is_signaling_nan(b, status)) { 7688 float_raise(float_flag_invalid, status); 7689 } 7690 return float_relation_unordered; 7691 } 7692 aSign = extractFloatx80Sign( a ); 7693 bSign = extractFloatx80Sign( b ); 7694 if ( aSign != bSign ) { 7695 7696 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 7697 ( ( a.low | b.low ) == 0 ) ) { 7698 /* zero case */ 7699 return float_relation_equal; 7700 } else { 7701 return 1 - (2 * aSign); 7702 } 7703 } else { 7704 if (a.low == b.low && a.high == b.high) { 7705 return float_relation_equal; 7706 } else { 7707 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7708 } 7709 } 7710 } 7711 7712 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7713 { 7714 return floatx80_compare_internal(a, b, 0, status); 7715 } 7716 
7717 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 7718 { 7719 return floatx80_compare_internal(a, b, 1, status); 7720 } 7721 7722 static inline int float128_compare_internal(float128 a, float128 b, 7723 int is_quiet, float_status *status) 7724 { 7725 flag aSign, bSign; 7726 7727 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7728 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7729 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7730 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7731 if (!is_quiet || 7732 float128_is_signaling_nan(a, status) || 7733 float128_is_signaling_nan(b, status)) { 7734 float_raise(float_flag_invalid, status); 7735 } 7736 return float_relation_unordered; 7737 } 7738 aSign = extractFloat128Sign( a ); 7739 bSign = extractFloat128Sign( b ); 7740 if ( aSign != bSign ) { 7741 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7742 /* zero case */ 7743 return float_relation_equal; 7744 } else { 7745 return 1 - (2 * aSign); 7746 } 7747 } else { 7748 if (a.low == b.low && a.high == b.high) { 7749 return float_relation_equal; 7750 } else { 7751 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7752 } 7753 } 7754 } 7755 7756 int float128_compare(float128 a, float128 b, float_status *status) 7757 { 7758 return float128_compare_internal(a, b, 0, status); 7759 } 7760 7761 int float128_compare_quiet(float128 a, float128 b, float_status *status) 7762 { 7763 return float128_compare_internal(a, b, 1, status); 7764 } 7765 7766 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7767 { 7768 flag aSign; 7769 int32_t aExp; 7770 uint64_t aSig; 7771 7772 if (floatx80_invalid_encoding(a)) { 7773 float_raise(float_flag_invalid, status); 7774 return floatx80_default_nan(status); 7775 } 7776 aSig = extractFloatx80Frac( a ); 7777 aExp = extractFloatx80Exp( a ); 7778 aSign = extractFloatx80Sign( a ); 7779 7780 if ( aExp == 0x7FFF ) { 7781 if ( aSig<<1 ) { 7782 return 
propagateFloatx80NaN(a, a, status); 7783 } 7784 return a; 7785 } 7786 7787 if (aExp == 0) { 7788 if (aSig == 0) { 7789 return a; 7790 } 7791 aExp++; 7792 } 7793 7794 if (n > 0x10000) { 7795 n = 0x10000; 7796 } else if (n < -0x10000) { 7797 n = -0x10000; 7798 } 7799 7800 aExp += n; 7801 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7802 aSign, aExp, aSig, 0, status); 7803 } 7804 7805 float128 float128_scalbn(float128 a, int n, float_status *status) 7806 { 7807 flag aSign; 7808 int32_t aExp; 7809 uint64_t aSig0, aSig1; 7810 7811 aSig1 = extractFloat128Frac1( a ); 7812 aSig0 = extractFloat128Frac0( a ); 7813 aExp = extractFloat128Exp( a ); 7814 aSign = extractFloat128Sign( a ); 7815 if ( aExp == 0x7FFF ) { 7816 if ( aSig0 | aSig1 ) { 7817 return propagateFloat128NaN(a, a, status); 7818 } 7819 return a; 7820 } 7821 if (aExp != 0) { 7822 aSig0 |= LIT64( 0x0001000000000000 ); 7823 } else if (aSig0 == 0 && aSig1 == 0) { 7824 return a; 7825 } else { 7826 aExp++; 7827 } 7828 7829 if (n > 0x10000) { 7830 n = 0x10000; 7831 } else if (n < -0x10000) { 7832 n = -0x10000; 7833 } 7834 7835 aExp += n - 1; 7836 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7837 , status); 7838 7839 } 7840