1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include <math.h> 87 #include "qemu/bitops.h" 88 #include "fpu/softfloat.h" 89 90 /* We only need stdlib for abort() */ 91 92 /*---------------------------------------------------------------------------- 93 | Primitive arithmetic functions, including multi-word arithmetic, and 94 | division and square root approximations. (Can be specialized to target if 95 | desired.) 96 *----------------------------------------------------------------------------*/ 97 #include "fpu/softfloat-macros.h" 98 99 /* 100 * Hardfloat 101 * 102 * Fast emulation of guest FP instructions is challenging for two reasons. 103 * First, FP instruction semantics are similar but not identical, particularly 104 * when handling NaNs. Second, emulating at reasonable speed the guest FP 105 * exception flags is not trivial: reading the host's flags register with a 106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp], 107 * and trapping on every FP exception is not fast nor pleasant to work with. 108 * 109 * We address these challenges by leveraging the host FPU for a subset of the 110 * operations. 
To do this we expand on the idea presented in this paper: 111 * 112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a 113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615. 114 * 115 * The idea is thus to leverage the host FPU to (1) compute FP operations 116 * and (2) identify whether FP exceptions occurred while avoiding 117 * expensive exception flag register accesses. 118 * 119 * An important optimization shown in the paper is that given that exception 120 * flags are rarely cleared by the guest, we can avoid recomputing some flags. 121 * This is particularly useful for the inexact flag, which is very frequently 122 * raised in floating-point workloads. 123 * 124 * We optimize the code further by deferring to soft-fp whenever FP exception 125 * detection might get hairy. Two examples: (1) when at least one operand is 126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result 127 * and the result is < the minimum normal. 
128 */ 129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \ 130 static inline void name(soft_t *a, float_status *s) \ 131 { \ 132 if (unlikely(soft_t ## _is_denormal(*a))) { \ 133 *a = soft_t ## _set_sign(soft_t ## _zero, \ 134 soft_t ## _is_neg(*a)); \ 135 s->float_exception_flags |= float_flag_input_denormal; \ 136 } \ 137 } 138 139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32) 140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64) 141 #undef GEN_INPUT_FLUSH__NOCHECK 142 143 #define GEN_INPUT_FLUSH1(name, soft_t) \ 144 static inline void name(soft_t *a, float_status *s) \ 145 { \ 146 if (likely(!s->flush_inputs_to_zero)) { \ 147 return; \ 148 } \ 149 soft_t ## _input_flush__nocheck(a, s); \ 150 } 151 152 GEN_INPUT_FLUSH1(float32_input_flush1, float32) 153 GEN_INPUT_FLUSH1(float64_input_flush1, float64) 154 #undef GEN_INPUT_FLUSH1 155 156 #define GEN_INPUT_FLUSH2(name, soft_t) \ 157 static inline void name(soft_t *a, soft_t *b, float_status *s) \ 158 { \ 159 if (likely(!s->flush_inputs_to_zero)) { \ 160 return; \ 161 } \ 162 soft_t ## _input_flush__nocheck(a, s); \ 163 soft_t ## _input_flush__nocheck(b, s); \ 164 } 165 166 GEN_INPUT_FLUSH2(float32_input_flush2, float32) 167 GEN_INPUT_FLUSH2(float64_input_flush2, float64) 168 #undef GEN_INPUT_FLUSH2 169 170 #define GEN_INPUT_FLUSH3(name, soft_t) \ 171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \ 172 { \ 173 if (likely(!s->flush_inputs_to_zero)) { \ 174 return; \ 175 } \ 176 soft_t ## _input_flush__nocheck(a, s); \ 177 soft_t ## _input_flush__nocheck(b, s); \ 178 soft_t ## _input_flush__nocheck(c, s); \ 179 } 180 181 GEN_INPUT_FLUSH3(float32_input_flush3, float32) 182 GEN_INPUT_FLUSH3(float64_input_flush3, float64) 183 #undef GEN_INPUT_FLUSH3 184 185 /* 186 * Choose whether to use fpclassify or float32/64_* primitives in the generated 187 * hardfloat functions. Each combination of number of inputs and float size 188 * gets its own value. 
189 */ 190 #if defined(__x86_64__) 191 # define QEMU_HARDFLOAT_1F32_USE_FP 0 192 # define QEMU_HARDFLOAT_1F64_USE_FP 1 193 # define QEMU_HARDFLOAT_2F32_USE_FP 0 194 # define QEMU_HARDFLOAT_2F64_USE_FP 1 195 # define QEMU_HARDFLOAT_3F32_USE_FP 0 196 # define QEMU_HARDFLOAT_3F64_USE_FP 1 197 #else 198 # define QEMU_HARDFLOAT_1F32_USE_FP 0 199 # define QEMU_HARDFLOAT_1F64_USE_FP 0 200 # define QEMU_HARDFLOAT_2F32_USE_FP 0 201 # define QEMU_HARDFLOAT_2F64_USE_FP 0 202 # define QEMU_HARDFLOAT_3F32_USE_FP 0 203 # define QEMU_HARDFLOAT_3F64_USE_FP 0 204 #endif 205 206 /* 207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over 208 * float{32,64}_is_infinity when !USE_FP. 209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup. 210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%. 211 */ 212 #if defined(__x86_64__) || defined(__aarch64__) 213 # define QEMU_HARDFLOAT_USE_ISINF 1 214 #else 215 # define QEMU_HARDFLOAT_USE_ISINF 0 216 #endif 217 218 /* 219 * Some targets clear the FP flags before most FP operations. This prevents 220 * the use of hardfloat, since hardfloat relies on the inexact flag being 221 * already set. 222 */ 223 #if defined(TARGET_PPC) || defined(__FAST_MATH__) 224 # if defined(__FAST_MATH__) 225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \ 226 IEEE implementation 227 # endif 228 # define QEMU_NO_HARDFLOAT 1 229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN 230 #else 231 # define QEMU_NO_HARDFLOAT 0 232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline)) 233 #endif 234 235 static inline bool can_use_fpu(const float_status *s) 236 { 237 if (QEMU_NO_HARDFLOAT) { 238 return false; 239 } 240 return likely(s->float_exception_flags & float_flag_inexact && 241 s->float_rounding_mode == float_round_nearest_even); 242 } 243 244 /* 245 * Hardfloat generation functions. Each operation can have two flavors: 246 * either using softfloat primitives (e.g. 
float32_is_zero_or_normal) for 247 * most condition checks, or native ones (e.g. fpclassify). 248 * 249 * The flavor is chosen by the callers. Instead of using macros, we rely on the 250 * compiler to propagate constants and inline everything into the callers. 251 * 252 * We only generate functions for operations with two inputs, since only 253 * these are common enough to justify consolidating them into common code. 254 */ 255 256 typedef union { 257 float32 s; 258 float h; 259 } union_float32; 260 261 typedef union { 262 float64 s; 263 double h; 264 } union_float64; 265 266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b); 267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b); 268 269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s); 270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s); 271 typedef float (*hard_f32_op2_fn)(float a, float b); 272 typedef double (*hard_f64_op2_fn)(double a, double b); 273 274 /* 2-input is-zero-or-normal */ 275 static inline bool f32_is_zon2(union_float32 a, union_float32 b) 276 { 277 if (QEMU_HARDFLOAT_2F32_USE_FP) { 278 /* 279 * Not using a temp variable for consecutive fpclassify calls ends up 280 * generating faster code. 
281 */ 282 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 283 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO); 284 } 285 return float32_is_zero_or_normal(a.s) && 286 float32_is_zero_or_normal(b.s); 287 } 288 289 static inline bool f64_is_zon2(union_float64 a, union_float64 b) 290 { 291 if (QEMU_HARDFLOAT_2F64_USE_FP) { 292 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 293 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO); 294 } 295 return float64_is_zero_or_normal(a.s) && 296 float64_is_zero_or_normal(b.s); 297 } 298 299 /* 3-input is-zero-or-normal */ 300 static inline 301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c) 302 { 303 if (QEMU_HARDFLOAT_3F32_USE_FP) { 304 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 305 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) && 306 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO); 307 } 308 return float32_is_zero_or_normal(a.s) && 309 float32_is_zero_or_normal(b.s) && 310 float32_is_zero_or_normal(c.s); 311 } 312 313 static inline 314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c) 315 { 316 if (QEMU_HARDFLOAT_3F64_USE_FP) { 317 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 318 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) && 319 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO); 320 } 321 return float64_is_zero_or_normal(a.s) && 322 float64_is_zero_or_normal(b.s) && 323 float64_is_zero_or_normal(c.s); 324 } 325 326 static inline bool f32_is_inf(union_float32 a) 327 { 328 if (QEMU_HARDFLOAT_USE_ISINF) { 329 return isinf(a.h); 330 } 331 return float32_is_infinity(a.s); 332 } 333 334 static inline bool f64_is_inf(union_float64 a) 335 { 336 if (QEMU_HARDFLOAT_USE_ISINF) { 337 return isinf(a.h); 338 } 339 return float64_is_infinity(a.s); 340 } 341 342 /* Note: @fast_test and @post can be NULL */ 343 static inline 
float32 344 float32_gen2(float32 xa, float32 xb, float_status *s, 345 hard_f32_op2_fn hard, soft_f32_op2_fn soft, 346 f32_check_fn pre, f32_check_fn post, 347 f32_check_fn fast_test, soft_f32_op2_fn fast_op) 348 { 349 union_float32 ua, ub, ur; 350 351 ua.s = xa; 352 ub.s = xb; 353 354 if (unlikely(!can_use_fpu(s))) { 355 goto soft; 356 } 357 358 float32_input_flush2(&ua.s, &ub.s, s); 359 if (unlikely(!pre(ua, ub))) { 360 goto soft; 361 } 362 if (fast_test && fast_test(ua, ub)) { 363 return fast_op(ua.s, ub.s, s); 364 } 365 366 ur.h = hard(ua.h, ub.h); 367 if (unlikely(f32_is_inf(ur))) { 368 s->float_exception_flags |= float_flag_overflow; 369 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) { 370 if (post == NULL || post(ua, ub)) { 371 goto soft; 372 } 373 } 374 return ur.s; 375 376 soft: 377 return soft(ua.s, ub.s, s); 378 } 379 380 static inline float64 381 float64_gen2(float64 xa, float64 xb, float_status *s, 382 hard_f64_op2_fn hard, soft_f64_op2_fn soft, 383 f64_check_fn pre, f64_check_fn post, 384 f64_check_fn fast_test, soft_f64_op2_fn fast_op) 385 { 386 union_float64 ua, ub, ur; 387 388 ua.s = xa; 389 ub.s = xb; 390 391 if (unlikely(!can_use_fpu(s))) { 392 goto soft; 393 } 394 395 float64_input_flush2(&ua.s, &ub.s, s); 396 if (unlikely(!pre(ua, ub))) { 397 goto soft; 398 } 399 if (fast_test && fast_test(ua, ub)) { 400 return fast_op(ua.s, ub.s, s); 401 } 402 403 ur.h = hard(ua.h, ub.h); 404 if (unlikely(f64_is_inf(ur))) { 405 s->float_exception_flags |= float_flag_overflow; 406 } else if (unlikely(fabs(ur.h) <= DBL_MIN)) { 407 if (post == NULL || post(ua, ub)) { 408 goto soft; 409 } 410 } 411 return ur.s; 412 413 soft: 414 return soft(ua.s, ub.s, s); 415 } 416 417 /*---------------------------------------------------------------------------- 418 | Returns the fraction bits of the half-precision floating-point value `a'. 
419 *----------------------------------------------------------------------------*/ 420 421 static inline uint32_t extractFloat16Frac(float16 a) 422 { 423 return float16_val(a) & 0x3ff; 424 } 425 426 /*---------------------------------------------------------------------------- 427 | Returns the exponent bits of the half-precision floating-point value `a'. 428 *----------------------------------------------------------------------------*/ 429 430 static inline int extractFloat16Exp(float16 a) 431 { 432 return (float16_val(a) >> 10) & 0x1f; 433 } 434 435 /*---------------------------------------------------------------------------- 436 | Returns the fraction bits of the single-precision floating-point value `a'. 437 *----------------------------------------------------------------------------*/ 438 439 static inline uint32_t extractFloat32Frac(float32 a) 440 { 441 return float32_val(a) & 0x007FFFFF; 442 } 443 444 /*---------------------------------------------------------------------------- 445 | Returns the exponent bits of the single-precision floating-point value `a'. 446 *----------------------------------------------------------------------------*/ 447 448 static inline int extractFloat32Exp(float32 a) 449 { 450 return (float32_val(a) >> 23) & 0xFF; 451 } 452 453 /*---------------------------------------------------------------------------- 454 | Returns the sign bit of the single-precision floating-point value `a'. 455 *----------------------------------------------------------------------------*/ 456 457 static inline flag extractFloat32Sign(float32 a) 458 { 459 return float32_val(a) >> 31; 460 } 461 462 /*---------------------------------------------------------------------------- 463 | Returns the fraction bits of the double-precision floating-point value `a'. 
464 *----------------------------------------------------------------------------*/ 465 466 static inline uint64_t extractFloat64Frac(float64 a) 467 { 468 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF); 469 } 470 471 /*---------------------------------------------------------------------------- 472 | Returns the exponent bits of the double-precision floating-point value `a'. 473 *----------------------------------------------------------------------------*/ 474 475 static inline int extractFloat64Exp(float64 a) 476 { 477 return (float64_val(a) >> 52) & 0x7FF; 478 } 479 480 /*---------------------------------------------------------------------------- 481 | Returns the sign bit of the double-precision floating-point value `a'. 482 *----------------------------------------------------------------------------*/ 483 484 static inline flag extractFloat64Sign(float64 a) 485 { 486 return float64_val(a) >> 63; 487 } 488 489 /* 490 * Classify a floating point number. Everything above float_class_qnan 491 * is a NaN so cls >= float_class_qnan is any NaN. 492 */ 493 494 typedef enum __attribute__ ((__packed__)) { 495 float_class_unclassified, 496 float_class_zero, 497 float_class_normal, 498 float_class_inf, 499 float_class_qnan, /* all NaNs from here */ 500 float_class_snan, 501 } FloatClass; 502 503 /* Simple helpers for checking if, or what kind of, NaN we have */ 504 static inline __attribute__((unused)) bool is_nan(FloatClass c) 505 { 506 return unlikely(c >= float_class_qnan); 507 } 508 509 static inline __attribute__((unused)) bool is_snan(FloatClass c) 510 { 511 return c == float_class_snan; 512 } 513 514 static inline __attribute__((unused)) bool is_qnan(FloatClass c) 515 { 516 return c == float_class_qnan; 517 } 518 519 /* 520 * Structure holding all of the decomposed parts of a float. The 521 * exponent is unbiased and the fraction is normalized. All 522 * calculations are done with a 64 bit fraction and then rounded as 523 * appropriate for the final format. 
524 * 525 * Thanks to the packed FloatClass a decent compiler should be able to 526 * fit the whole structure into registers and avoid using the stack 527 * for parameter passing. 528 */ 529 530 typedef struct { 531 uint64_t frac; 532 int32_t exp; 533 FloatClass cls; 534 bool sign; 535 } FloatParts; 536 537 #define DECOMPOSED_BINARY_POINT (64 - 2) 538 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) 539 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1) 540 541 /* Structure holding all of the relevant parameters for a format. 542 * exp_size: the size of the exponent field 543 * exp_bias: the offset applied to the exponent field 544 * exp_max: the maximum normalised exponent 545 * frac_size: the size of the fraction field 546 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 547 * The following are computed based the size of fraction 548 * frac_lsb: least significant bit of fraction 549 * frac_lsbm1: the bit below the least significant bit (for rounding) 550 * round_mask/roundeven_mask: masks used for rounding 551 * The following optional modifiers are available: 552 * arm_althp: handle ARM Alternative Half Precision 553 */ 554 typedef struct { 555 int exp_size; 556 int exp_bias; 557 int exp_max; 558 int frac_size; 559 int frac_shift; 560 uint64_t frac_lsb; 561 uint64_t frac_lsbm1; 562 uint64_t round_mask; 563 uint64_t roundeven_mask; 564 bool arm_althp; 565 } FloatFmt; 566 567 /* Expand fields based on the size of exponent and fraction */ 568 #define FLOAT_PARAMS(E, F) \ 569 .exp_size = E, \ 570 .exp_bias = ((1 << E) - 1) >> 1, \ 571 .exp_max = (1 << E) - 1, \ 572 .frac_size = F, \ 573 .frac_shift = DECOMPOSED_BINARY_POINT - F, \ 574 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \ 575 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \ 576 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \ 577 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1 578 579 static const FloatFmt 
float16_params = { 580 FLOAT_PARAMS(5, 10) 581 }; 582 583 static const FloatFmt float16_params_ahp = { 584 FLOAT_PARAMS(5, 10), 585 .arm_althp = true 586 }; 587 588 static const FloatFmt float32_params = { 589 FLOAT_PARAMS(8, 23) 590 }; 591 592 static const FloatFmt float64_params = { 593 FLOAT_PARAMS(11, 52) 594 }; 595 596 /* Unpack a float to parts, but do not canonicalize. */ 597 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) 598 { 599 const int sign_pos = fmt.frac_size + fmt.exp_size; 600 601 return (FloatParts) { 602 .cls = float_class_unclassified, 603 .sign = extract64(raw, sign_pos, 1), 604 .exp = extract64(raw, fmt.frac_size, fmt.exp_size), 605 .frac = extract64(raw, 0, fmt.frac_size), 606 }; 607 } 608 609 static inline FloatParts float16_unpack_raw(float16 f) 610 { 611 return unpack_raw(float16_params, f); 612 } 613 614 static inline FloatParts float32_unpack_raw(float32 f) 615 { 616 return unpack_raw(float32_params, f); 617 } 618 619 static inline FloatParts float64_unpack_raw(float64 f) 620 { 621 return unpack_raw(float64_params, f); 622 } 623 624 /* Pack a float from parts, but do not canonicalize. 
*/ 625 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p) 626 { 627 const int sign_pos = fmt.frac_size + fmt.exp_size; 628 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp); 629 return deposit64(ret, sign_pos, 1, p.sign); 630 } 631 632 static inline float16 float16_pack_raw(FloatParts p) 633 { 634 return make_float16(pack_raw(float16_params, p)); 635 } 636 637 static inline float32 float32_pack_raw(FloatParts p) 638 { 639 return make_float32(pack_raw(float32_params, p)); 640 } 641 642 static inline float64 float64_pack_raw(FloatParts p) 643 { 644 return make_float64(pack_raw(float64_params, p)); 645 } 646 647 /*---------------------------------------------------------------------------- 648 | Functions and definitions to determine: (1) whether tininess for underflow 649 | is detected before or after rounding by default, (2) what (if anything) 650 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 651 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 652 | are propagated from function inputs to output. These details are target- 653 | specific. 654 *----------------------------------------------------------------------------*/ 655 #include "softfloat-specialize.h" 656 657 /* Canonicalize EXP and FRAC, setting CLS. */ 658 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm, 659 float_status *status) 660 { 661 if (part.exp == parm->exp_max && !parm->arm_althp) { 662 if (part.frac == 0) { 663 part.cls = float_class_inf; 664 } else { 665 part.frac <<= parm->frac_shift; 666 part.cls = (parts_is_snan_frac(part.frac, status) 667 ? 
float_class_snan : float_class_qnan); 668 } 669 } else if (part.exp == 0) { 670 if (likely(part.frac == 0)) { 671 part.cls = float_class_zero; 672 } else if (status->flush_inputs_to_zero) { 673 float_raise(float_flag_input_denormal, status); 674 part.cls = float_class_zero; 675 part.frac = 0; 676 } else { 677 int shift = clz64(part.frac) - 1; 678 part.cls = float_class_normal; 679 part.exp = parm->frac_shift - parm->exp_bias - shift + 1; 680 part.frac <<= shift; 681 } 682 } else { 683 part.cls = float_class_normal; 684 part.exp -= parm->exp_bias; 685 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift); 686 } 687 return part; 688 } 689 690 /* Round and uncanonicalize a floating-point number by parts. There 691 * are FRAC_SHIFT bits that may require rounding at the bottom of the 692 * fraction; these bits will be removed. The exponent will be biased 693 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0]. 694 */ 695 696 static FloatParts round_canonical(FloatParts p, float_status *s, 697 const FloatFmt *parm) 698 { 699 const uint64_t frac_lsbm1 = parm->frac_lsbm1; 700 const uint64_t round_mask = parm->round_mask; 701 const uint64_t roundeven_mask = parm->roundeven_mask; 702 const int exp_max = parm->exp_max; 703 const int frac_shift = parm->frac_shift; 704 uint64_t frac, inc; 705 int exp, flags = 0; 706 bool overflow_norm; 707 708 frac = p.frac; 709 exp = p.exp; 710 711 switch (p.cls) { 712 case float_class_normal: 713 switch (s->float_rounding_mode) { 714 case float_round_nearest_even: 715 overflow_norm = false; 716 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 717 break; 718 case float_round_ties_away: 719 overflow_norm = false; 720 inc = frac_lsbm1; 721 break; 722 case float_round_to_zero: 723 overflow_norm = true; 724 inc = 0; 725 break; 726 case float_round_up: 727 inc = p.sign ? 0 : round_mask; 728 overflow_norm = p.sign; 729 break; 730 case float_round_down: 731 inc = p.sign ? 
round_mask : 0; 732 overflow_norm = !p.sign; 733 break; 734 default: 735 g_assert_not_reached(); 736 } 737 738 exp += parm->exp_bias; 739 if (likely(exp > 0)) { 740 if (frac & round_mask) { 741 flags |= float_flag_inexact; 742 frac += inc; 743 if (frac & DECOMPOSED_OVERFLOW_BIT) { 744 frac >>= 1; 745 exp++; 746 } 747 } 748 frac >>= frac_shift; 749 750 if (parm->arm_althp) { 751 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */ 752 if (unlikely(exp > exp_max)) { 753 /* Overflow. Return the maximum normal. */ 754 flags = float_flag_invalid; 755 exp = exp_max; 756 frac = -1; 757 } 758 } else if (unlikely(exp >= exp_max)) { 759 flags |= float_flag_overflow | float_flag_inexact; 760 if (overflow_norm) { 761 exp = exp_max - 1; 762 frac = -1; 763 } else { 764 p.cls = float_class_inf; 765 goto do_inf; 766 } 767 } 768 } else if (s->flush_to_zero) { 769 flags |= float_flag_output_denormal; 770 p.cls = float_class_zero; 771 goto do_zero; 772 } else { 773 bool is_tiny = (s->float_detect_tininess 774 == float_tininess_before_rounding) 775 || (exp < 0) 776 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT); 777 778 shift64RightJamming(frac, 1 - exp, &frac); 779 if (frac & round_mask) { 780 /* Need to recompute round-to-even. */ 781 if (s->float_rounding_mode == float_round_nearest_even) { 782 inc = ((frac & roundeven_mask) != frac_lsbm1 783 ? frac_lsbm1 : 0); 784 } 785 flags |= float_flag_inexact; 786 frac += inc; 787 } 788 789 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 
1 : 0); 790 frac >>= frac_shift; 791 792 if (is_tiny && (flags & float_flag_inexact)) { 793 flags |= float_flag_underflow; 794 } 795 if (exp == 0 && frac == 0) { 796 p.cls = float_class_zero; 797 } 798 } 799 break; 800 801 case float_class_zero: 802 do_zero: 803 exp = 0; 804 frac = 0; 805 break; 806 807 case float_class_inf: 808 do_inf: 809 assert(!parm->arm_althp); 810 exp = exp_max; 811 frac = 0; 812 break; 813 814 case float_class_qnan: 815 case float_class_snan: 816 assert(!parm->arm_althp); 817 exp = exp_max; 818 frac >>= parm->frac_shift; 819 break; 820 821 default: 822 g_assert_not_reached(); 823 } 824 825 float_raise(flags, s); 826 p.exp = exp; 827 p.frac = frac; 828 return p; 829 } 830 831 /* Explicit FloatFmt version */ 832 static FloatParts float16a_unpack_canonical(float16 f, float_status *s, 833 const FloatFmt *params) 834 { 835 return sf_canonicalize(float16_unpack_raw(f), params, s); 836 } 837 838 static FloatParts float16_unpack_canonical(float16 f, float_status *s) 839 { 840 return float16a_unpack_canonical(f, s, &float16_params); 841 } 842 843 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s, 844 const FloatFmt *params) 845 { 846 return float16_pack_raw(round_canonical(p, s, params)); 847 } 848 849 static float16 float16_round_pack_canonical(FloatParts p, float_status *s) 850 { 851 return float16a_round_pack_canonical(p, s, &float16_params); 852 } 853 854 static FloatParts float32_unpack_canonical(float32 f, float_status *s) 855 { 856 return sf_canonicalize(float32_unpack_raw(f), &float32_params, s); 857 } 858 859 static float32 float32_round_pack_canonical(FloatParts p, float_status *s) 860 { 861 return float32_pack_raw(round_canonical(p, s, &float32_params)); 862 } 863 864 static FloatParts float64_unpack_canonical(float64 f, float_status *s) 865 { 866 return sf_canonicalize(float64_unpack_raw(f), &float64_params, s); 867 } 868 869 static float64 float64_round_pack_canonical(FloatParts p, float_status *s) 870 { 871 
return float64_pack_raw(round_canonical(p, s, &float64_params)); 872 } 873 874 static FloatParts return_nan(FloatParts a, float_status *s) 875 { 876 switch (a.cls) { 877 case float_class_snan: 878 s->float_exception_flags |= float_flag_invalid; 879 a = parts_silence_nan(a, s); 880 /* fall through */ 881 case float_class_qnan: 882 if (s->default_nan_mode) { 883 return parts_default_nan(s); 884 } 885 break; 886 887 default: 888 g_assert_not_reached(); 889 } 890 return a; 891 } 892 893 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s) 894 { 895 if (is_snan(a.cls) || is_snan(b.cls)) { 896 s->float_exception_flags |= float_flag_invalid; 897 } 898 899 if (s->default_nan_mode) { 900 return parts_default_nan(s); 901 } else { 902 if (pickNaN(a.cls, b.cls, 903 a.frac > b.frac || 904 (a.frac == b.frac && a.sign < b.sign))) { 905 a = b; 906 } 907 if (is_snan(a.cls)) { 908 return parts_silence_nan(a, s); 909 } 910 } 911 return a; 912 } 913 914 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c, 915 bool inf_zero, float_status *s) 916 { 917 int which; 918 919 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) { 920 s->float_exception_flags |= float_flag_invalid; 921 } 922 923 which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s); 924 925 if (s->default_nan_mode) { 926 /* Note that this check is after pickNaNMulAdd so that function 927 * has an opportunity to set the Invalid flag. 928 */ 929 which = 3; 930 } 931 932 switch (which) { 933 case 0: 934 break; 935 case 1: 936 a = b; 937 break; 938 case 2: 939 a = c; 940 break; 941 case 3: 942 return parts_default_nan(s); 943 default: 944 g_assert_not_reached(); 945 } 946 947 if (is_snan(a.cls)) { 948 return parts_silence_nan(a, s); 949 } 950 return a; 951 } 952 953 /* 954 * Returns the result of adding or subtracting the values of the 955 * floating-point values `a' and `b'. 
The operation is performed 956 * according to the IEC/IEEE Standard for Binary Floating-Point 957 * Arithmetic. 958 */ 959 960 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract, 961 float_status *s) 962 { 963 bool a_sign = a.sign; 964 bool b_sign = b.sign ^ subtract; 965 966 if (a_sign != b_sign) { 967 /* Subtraction */ 968 969 if (a.cls == float_class_normal && b.cls == float_class_normal) { 970 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) { 971 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 972 a.frac = a.frac - b.frac; 973 } else { 974 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 975 a.frac = b.frac - a.frac; 976 a.exp = b.exp; 977 a_sign ^= 1; 978 } 979 980 if (a.frac == 0) { 981 a.cls = float_class_zero; 982 a.sign = s->float_rounding_mode == float_round_down; 983 } else { 984 int shift = clz64(a.frac) - 1; 985 a.frac = a.frac << shift; 986 a.exp = a.exp - shift; 987 a.sign = a_sign; 988 } 989 return a; 990 } 991 if (is_nan(a.cls) || is_nan(b.cls)) { 992 return pick_nan(a, b, s); 993 } 994 if (a.cls == float_class_inf) { 995 if (b.cls == float_class_inf) { 996 float_raise(float_flag_invalid, s); 997 return parts_default_nan(s); 998 } 999 return a; 1000 } 1001 if (a.cls == float_class_zero && b.cls == float_class_zero) { 1002 a.sign = s->float_rounding_mode == float_round_down; 1003 return a; 1004 } 1005 if (a.cls == float_class_zero || b.cls == float_class_inf) { 1006 b.sign = a_sign ^ 1; 1007 return b; 1008 } 1009 if (b.cls == float_class_zero) { 1010 return a; 1011 } 1012 } else { 1013 /* Addition */ 1014 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1015 if (a.exp > b.exp) { 1016 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 1017 } else if (a.exp < b.exp) { 1018 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 1019 a.exp = b.exp; 1020 } 1021 a.frac += b.frac; 1022 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 1023 shift64RightJamming(a.frac, 1, &a.frac); 1024 a.exp += 1; 1025 } 
1026 return a; 1027 } 1028 if (is_nan(a.cls) || is_nan(b.cls)) { 1029 return pick_nan(a, b, s); 1030 } 1031 if (a.cls == float_class_inf || b.cls == float_class_zero) { 1032 return a; 1033 } 1034 if (b.cls == float_class_inf || a.cls == float_class_zero) { 1035 b.sign = b_sign; 1036 return b; 1037 } 1038 } 1039 g_assert_not_reached(); 1040 } 1041 1042 /* 1043 * Returns the result of adding or subtracting the floating-point 1044 * values `a' and `b'. The operation is performed according to the 1045 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1046 */ 1047 1048 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status) 1049 { 1050 FloatParts pa = float16_unpack_canonical(a, status); 1051 FloatParts pb = float16_unpack_canonical(b, status); 1052 FloatParts pr = addsub_floats(pa, pb, false, status); 1053 1054 return float16_round_pack_canonical(pr, status); 1055 } 1056 1057 float32 QEMU_FLATTEN float32_add(float32 a, float32 b, float_status *status) 1058 { 1059 FloatParts pa = float32_unpack_canonical(a, status); 1060 FloatParts pb = float32_unpack_canonical(b, status); 1061 FloatParts pr = addsub_floats(pa, pb, false, status); 1062 1063 return float32_round_pack_canonical(pr, status); 1064 } 1065 1066 float64 QEMU_FLATTEN float64_add(float64 a, float64 b, float_status *status) 1067 { 1068 FloatParts pa = float64_unpack_canonical(a, status); 1069 FloatParts pb = float64_unpack_canonical(b, status); 1070 FloatParts pr = addsub_floats(pa, pb, false, status); 1071 1072 return float64_round_pack_canonical(pr, status); 1073 } 1074 1075 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status) 1076 { 1077 FloatParts pa = float16_unpack_canonical(a, status); 1078 FloatParts pb = float16_unpack_canonical(b, status); 1079 FloatParts pr = addsub_floats(pa, pb, true, status); 1080 1081 return float16_round_pack_canonical(pr, status); 1082 } 1083 1084 float32 QEMU_FLATTEN float32_sub(float32 a, float32 b, float_status *status) 1085 
{ 1086 FloatParts pa = float32_unpack_canonical(a, status); 1087 FloatParts pb = float32_unpack_canonical(b, status); 1088 FloatParts pr = addsub_floats(pa, pb, true, status); 1089 1090 return float32_round_pack_canonical(pr, status); 1091 } 1092 1093 float64 QEMU_FLATTEN float64_sub(float64 a, float64 b, float_status *status) 1094 { 1095 FloatParts pa = float64_unpack_canonical(a, status); 1096 FloatParts pb = float64_unpack_canonical(b, status); 1097 FloatParts pr = addsub_floats(pa, pb, true, status); 1098 1099 return float64_round_pack_canonical(pr, status); 1100 } 1101 1102 /* 1103 * Returns the result of multiplying the floating-point values `a' and 1104 * `b'. The operation is performed according to the IEC/IEEE Standard 1105 * for Binary Floating-Point Arithmetic. 1106 */ 1107 1108 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s) 1109 { 1110 bool sign = a.sign ^ b.sign; 1111 1112 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1113 uint64_t hi, lo; 1114 int exp = a.exp + b.exp; 1115 1116 mul64To128(a.frac, b.frac, &hi, &lo); 1117 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1118 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1119 shift64RightJamming(lo, 1, &lo); 1120 exp += 1; 1121 } 1122 1123 /* Re-use a */ 1124 a.exp = exp; 1125 a.sign = sign; 1126 a.frac = lo; 1127 return a; 1128 } 1129 /* handle all the NaN cases */ 1130 if (is_nan(a.cls) || is_nan(b.cls)) { 1131 return pick_nan(a, b, s); 1132 } 1133 /* Inf * Zero == NaN */ 1134 if ((a.cls == float_class_inf && b.cls == float_class_zero) || 1135 (a.cls == float_class_zero && b.cls == float_class_inf)) { 1136 s->float_exception_flags |= float_flag_invalid; 1137 return parts_default_nan(s); 1138 } 1139 /* Multiply by 0 or Inf */ 1140 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1141 a.sign = sign; 1142 return a; 1143 } 1144 if (b.cls == float_class_inf || b.cls == float_class_zero) { 1145 b.sign = sign; 1146 return b; 1147 } 1148 
g_assert_not_reached(); 1149 } 1150 1151 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status) 1152 { 1153 FloatParts pa = float16_unpack_canonical(a, status); 1154 FloatParts pb = float16_unpack_canonical(b, status); 1155 FloatParts pr = mul_floats(pa, pb, status); 1156 1157 return float16_round_pack_canonical(pr, status); 1158 } 1159 1160 float32 QEMU_FLATTEN float32_mul(float32 a, float32 b, float_status *status) 1161 { 1162 FloatParts pa = float32_unpack_canonical(a, status); 1163 FloatParts pb = float32_unpack_canonical(b, status); 1164 FloatParts pr = mul_floats(pa, pb, status); 1165 1166 return float32_round_pack_canonical(pr, status); 1167 } 1168 1169 float64 QEMU_FLATTEN float64_mul(float64 a, float64 b, float_status *status) 1170 { 1171 FloatParts pa = float64_unpack_canonical(a, status); 1172 FloatParts pb = float64_unpack_canonical(b, status); 1173 FloatParts pr = mul_floats(pa, pb, status); 1174 1175 return float64_round_pack_canonical(pr, status); 1176 } 1177 1178 /* 1179 * Returns the result of multiplying the floating-point values `a' and 1180 * `b' then adding 'c', with no intermediate rounding step after the 1181 * multiplication. The operation is performed according to the 1182 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008. 1183 * The flags argument allows the caller to select negation of the 1184 * addend, the intermediate product, or the final result. (The 1185 * difference between this and having the caller do a separate 1186 * negation is that negating externally will flip the sign bit on 1187 * NaNs.) 
1188 */ 1189 1190 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c, 1191 int flags, float_status *s) 1192 { 1193 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) == 1194 ((1 << float_class_inf) | (1 << float_class_zero)); 1195 bool p_sign; 1196 bool sign_flip = flags & float_muladd_negate_result; 1197 FloatClass p_class; 1198 uint64_t hi, lo; 1199 int p_exp; 1200 1201 /* It is implementation-defined whether the cases of (0,inf,qnan) 1202 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 1203 * they return if they do), so we have to hand this information 1204 * off to the target-specific pick-a-NaN routine. 1205 */ 1206 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) { 1207 return pick_nan_muladd(a, b, c, inf_zero, s); 1208 } 1209 1210 if (inf_zero) { 1211 s->float_exception_flags |= float_flag_invalid; 1212 return parts_default_nan(s); 1213 } 1214 1215 if (flags & float_muladd_negate_c) { 1216 c.sign ^= 1; 1217 } 1218 1219 p_sign = a.sign ^ b.sign; 1220 1221 if (flags & float_muladd_negate_product) { 1222 p_sign ^= 1; 1223 } 1224 1225 if (a.cls == float_class_inf || b.cls == float_class_inf) { 1226 p_class = float_class_inf; 1227 } else if (a.cls == float_class_zero || b.cls == float_class_zero) { 1228 p_class = float_class_zero; 1229 } else { 1230 p_class = float_class_normal; 1231 } 1232 1233 if (c.cls == float_class_inf) { 1234 if (p_class == float_class_inf && p_sign != c.sign) { 1235 s->float_exception_flags |= float_flag_invalid; 1236 return parts_default_nan(s); 1237 } else { 1238 a.cls = float_class_inf; 1239 a.sign = c.sign ^ sign_flip; 1240 return a; 1241 } 1242 } 1243 1244 if (p_class == float_class_inf) { 1245 a.cls = float_class_inf; 1246 a.sign = p_sign ^ sign_flip; 1247 return a; 1248 } 1249 1250 if (p_class == float_class_zero) { 1251 if (c.cls == float_class_zero) { 1252 if (p_sign != c.sign) { 1253 p_sign = s->float_rounding_mode == float_round_down; 1254 } 1255 c.sign = p_sign; 1256 } else if (flags & 
float_muladd_halve_result) { 1257 c.exp -= 1; 1258 } 1259 c.sign ^= sign_flip; 1260 return c; 1261 } 1262 1263 /* a & b should be normals now... */ 1264 assert(a.cls == float_class_normal && 1265 b.cls == float_class_normal); 1266 1267 p_exp = a.exp + b.exp; 1268 1269 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit 1270 * result. 1271 */ 1272 mul64To128(a.frac, b.frac, &hi, &lo); 1273 /* binary point now at bit 124 */ 1274 1275 /* check for overflow */ 1276 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) { 1277 shift128RightJamming(hi, lo, 1, &hi, &lo); 1278 p_exp += 1; 1279 } 1280 1281 /* + add/sub */ 1282 if (c.cls == float_class_zero) { 1283 /* move binary point back to 62 */ 1284 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1285 } else { 1286 int exp_diff = p_exp - c.exp; 1287 if (p_sign == c.sign) { 1288 /* Addition */ 1289 if (exp_diff <= 0) { 1290 shift128RightJamming(hi, lo, 1291 DECOMPOSED_BINARY_POINT - exp_diff, 1292 &hi, &lo); 1293 lo += c.frac; 1294 p_exp = c.exp; 1295 } else { 1296 uint64_t c_hi, c_lo; 1297 /* shift c to the same binary point as the product (124) */ 1298 c_hi = c.frac >> 2; 1299 c_lo = 0; 1300 shift128RightJamming(c_hi, c_lo, 1301 exp_diff, 1302 &c_hi, &c_lo); 1303 add128(hi, lo, c_hi, c_lo, &hi, &lo); 1304 /* move binary point back to 62 */ 1305 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1306 } 1307 1308 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1309 shift64RightJamming(lo, 1, &lo); 1310 p_exp += 1; 1311 } 1312 1313 } else { 1314 /* Subtraction */ 1315 uint64_t c_hi, c_lo; 1316 /* make C binary point match product at bit 124 */ 1317 c_hi = c.frac >> 2; 1318 c_lo = 0; 1319 1320 if (exp_diff <= 0) { 1321 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo); 1322 if (exp_diff == 0 1323 && 1324 (hi > c_hi || (hi == c_hi && lo >= c_lo))) { 1325 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1326 } else { 1327 sub128(c_hi, c_lo, hi, lo, &hi, &lo); 1328 p_sign ^= 1; 1329 p_exp = c.exp; 1330 } 
1331 } else { 1332 shift128RightJamming(c_hi, c_lo, 1333 exp_diff, 1334 &c_hi, &c_lo); 1335 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1336 } 1337 1338 if (hi == 0 && lo == 0) { 1339 a.cls = float_class_zero; 1340 a.sign = s->float_rounding_mode == float_round_down; 1341 a.sign ^= sign_flip; 1342 return a; 1343 } else { 1344 int shift; 1345 if (hi != 0) { 1346 shift = clz64(hi); 1347 } else { 1348 shift = clz64(lo) + 64; 1349 } 1350 /* Normalizing to a binary point of 124 is the 1351 correct adjust for the exponent. However since we're 1352 shifting, we might as well put the binary point back 1353 at 62 where we really want it. Therefore shift as 1354 if we're leaving 1 bit at the top of the word, but 1355 adjust the exponent as if we're leaving 3 bits. */ 1356 shift -= 1; 1357 if (shift >= 64) { 1358 lo = lo << (shift - 64); 1359 } else { 1360 hi = (hi << shift) | (lo >> (64 - shift)); 1361 lo = hi | ((lo << shift) != 0); 1362 } 1363 p_exp -= shift - 2; 1364 } 1365 } 1366 } 1367 1368 if (flags & float_muladd_halve_result) { 1369 p_exp -= 1; 1370 } 1371 1372 /* finally prepare our result */ 1373 a.cls = float_class_normal; 1374 a.sign = p_sign ^ sign_flip; 1375 a.exp = p_exp; 1376 a.frac = lo; 1377 1378 return a; 1379 } 1380 1381 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c, 1382 int flags, float_status *status) 1383 { 1384 FloatParts pa = float16_unpack_canonical(a, status); 1385 FloatParts pb = float16_unpack_canonical(b, status); 1386 FloatParts pc = float16_unpack_canonical(c, status); 1387 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1388 1389 return float16_round_pack_canonical(pr, status); 1390 } 1391 1392 float32 QEMU_FLATTEN float32_muladd(float32 a, float32 b, float32 c, 1393 int flags, float_status *status) 1394 { 1395 FloatParts pa = float32_unpack_canonical(a, status); 1396 FloatParts pb = float32_unpack_canonical(b, status); 1397 FloatParts pc = float32_unpack_canonical(c, status); 1398 FloatParts pr = muladd_floats(pa, 
pb, pc, flags, status); 1399 1400 return float32_round_pack_canonical(pr, status); 1401 } 1402 1403 float64 QEMU_FLATTEN float64_muladd(float64 a, float64 b, float64 c, 1404 int flags, float_status *status) 1405 { 1406 FloatParts pa = float64_unpack_canonical(a, status); 1407 FloatParts pb = float64_unpack_canonical(b, status); 1408 FloatParts pc = float64_unpack_canonical(c, status); 1409 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1410 1411 return float64_round_pack_canonical(pr, status); 1412 } 1413 1414 /* 1415 * Returns the result of dividing the floating-point value `a' by the 1416 * corresponding value `b'. The operation is performed according to 1417 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1418 */ 1419 1420 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s) 1421 { 1422 bool sign = a.sign ^ b.sign; 1423 1424 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1425 uint64_t n0, n1, q, r; 1426 int exp = a.exp - b.exp; 1427 1428 /* 1429 * We want a 2*N / N-bit division to produce exactly an N-bit 1430 * result, so that we do not lose any precision and so that we 1431 * do not have to renormalize afterward. If A.frac < B.frac, 1432 * then division would produce an (N-1)-bit result; shift A left 1433 * by one to produce the an N-bit result, and decrement the 1434 * exponent to match. 1435 * 1436 * The udiv_qrnnd algorithm that we're using requires normalization, 1437 * i.e. the msb of the denominator must be set. Since we know that 1438 * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left 1439 * by one (more), and the remainder must be shifted right by one. 1440 */ 1441 if (a.frac < b.frac) { 1442 exp -= 1; 1443 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0); 1444 } else { 1445 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0); 1446 } 1447 q = udiv_qrnnd(&r, n1, n0, b.frac << 1); 1448 1449 /* 1450 * Set lsb if there is a remainder, to set inexact. 
1451 * As mentioned above, to find the actual value of the remainder we 1452 * would need to shift right, but (1) we are only concerned about 1453 * non-zero-ness, and (2) the remainder will always be even because 1454 * both inputs to the division primitive are even. 1455 */ 1456 a.frac = q | (r != 0); 1457 a.sign = sign; 1458 a.exp = exp; 1459 return a; 1460 } 1461 /* handle all the NaN cases */ 1462 if (is_nan(a.cls) || is_nan(b.cls)) { 1463 return pick_nan(a, b, s); 1464 } 1465 /* 0/0 or Inf/Inf */ 1466 if (a.cls == b.cls 1467 && 1468 (a.cls == float_class_inf || a.cls == float_class_zero)) { 1469 s->float_exception_flags |= float_flag_invalid; 1470 return parts_default_nan(s); 1471 } 1472 /* Inf / x or 0 / x */ 1473 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1474 a.sign = sign; 1475 return a; 1476 } 1477 /* Div 0 => Inf */ 1478 if (b.cls == float_class_zero) { 1479 s->float_exception_flags |= float_flag_divbyzero; 1480 a.cls = float_class_inf; 1481 a.sign = sign; 1482 return a; 1483 } 1484 /* Div by Inf */ 1485 if (b.cls == float_class_inf) { 1486 a.cls = float_class_zero; 1487 a.sign = sign; 1488 return a; 1489 } 1490 g_assert_not_reached(); 1491 } 1492 1493 float16 float16_div(float16 a, float16 b, float_status *status) 1494 { 1495 FloatParts pa = float16_unpack_canonical(a, status); 1496 FloatParts pb = float16_unpack_canonical(b, status); 1497 FloatParts pr = div_floats(pa, pb, status); 1498 1499 return float16_round_pack_canonical(pr, status); 1500 } 1501 1502 float32 float32_div(float32 a, float32 b, float_status *status) 1503 { 1504 FloatParts pa = float32_unpack_canonical(a, status); 1505 FloatParts pb = float32_unpack_canonical(b, status); 1506 FloatParts pr = div_floats(pa, pb, status); 1507 1508 return float32_round_pack_canonical(pr, status); 1509 } 1510 1511 float64 float64_div(float64 a, float64 b, float_status *status) 1512 { 1513 FloatParts pa = float64_unpack_canonical(a, status); 1514 FloatParts pb = 
float64_unpack_canonical(b, status); 1515 FloatParts pr = div_floats(pa, pb, status); 1516 1517 return float64_round_pack_canonical(pr, status); 1518 } 1519 1520 /* 1521 * Float to Float conversions 1522 * 1523 * Returns the result of converting one float format to another. The 1524 * conversion is performed according to the IEC/IEEE Standard for 1525 * Binary Floating-Point Arithmetic. 1526 * 1527 * The float_to_float helper only needs to take care of raising 1528 * invalid exceptions and handling the conversion on NaNs. 1529 */ 1530 1531 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf, 1532 float_status *s) 1533 { 1534 if (dstf->arm_althp) { 1535 switch (a.cls) { 1536 case float_class_qnan: 1537 case float_class_snan: 1538 /* There is no NaN in the destination format. Raise Invalid 1539 * and return a zero with the sign of the input NaN. 1540 */ 1541 s->float_exception_flags |= float_flag_invalid; 1542 a.cls = float_class_zero; 1543 a.frac = 0; 1544 a.exp = 0; 1545 break; 1546 1547 case float_class_inf: 1548 /* There is no Inf in the destination format. Raise Invalid 1549 * and return the maximum normal with the correct sign. 1550 */ 1551 s->float_exception_flags |= float_flag_invalid; 1552 a.cls = float_class_normal; 1553 a.exp = dstf->exp_max; 1554 a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift; 1555 break; 1556 1557 default: 1558 break; 1559 } 1560 } else if (is_nan(a.cls)) { 1561 if (is_snan(a.cls)) { 1562 s->float_exception_flags |= float_flag_invalid; 1563 a = parts_silence_nan(a, s); 1564 } 1565 if (s->default_nan_mode) { 1566 return parts_default_nan(s); 1567 } 1568 } 1569 return a; 1570 } 1571 1572 float32 float16_to_float32(float16 a, bool ieee, float_status *s) 1573 { 1574 const FloatFmt *fmt16 = ieee ? 
&float16_params : &float16_params_ahp; 1575 FloatParts p = float16a_unpack_canonical(a, s, fmt16); 1576 FloatParts pr = float_to_float(p, &float32_params, s); 1577 return float32_round_pack_canonical(pr, s); 1578 } 1579 1580 float64 float16_to_float64(float16 a, bool ieee, float_status *s) 1581 { 1582 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1583 FloatParts p = float16a_unpack_canonical(a, s, fmt16); 1584 FloatParts pr = float_to_float(p, &float64_params, s); 1585 return float64_round_pack_canonical(pr, s); 1586 } 1587 1588 float16 float32_to_float16(float32 a, bool ieee, float_status *s) 1589 { 1590 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1591 FloatParts p = float32_unpack_canonical(a, s); 1592 FloatParts pr = float_to_float(p, fmt16, s); 1593 return float16a_round_pack_canonical(pr, s, fmt16); 1594 } 1595 1596 float64 float32_to_float64(float32 a, float_status *s) 1597 { 1598 FloatParts p = float32_unpack_canonical(a, s); 1599 FloatParts pr = float_to_float(p, &float64_params, s); 1600 return float64_round_pack_canonical(pr, s); 1601 } 1602 1603 float16 float64_to_float16(float64 a, bool ieee, float_status *s) 1604 { 1605 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1606 FloatParts p = float64_unpack_canonical(a, s); 1607 FloatParts pr = float_to_float(p, fmt16, s); 1608 return float16a_round_pack_canonical(pr, s, fmt16); 1609 } 1610 1611 float32 float64_to_float32(float64 a, float_status *s) 1612 { 1613 FloatParts p = float64_unpack_canonical(a, s); 1614 FloatParts pr = float_to_float(p, &float32_params, s); 1615 return float32_round_pack_canonical(pr, s); 1616 } 1617 1618 /* 1619 * Rounds the floating-point value `a' to an integer, and returns the 1620 * result as a floating-point value. The operation is performed 1621 * according to the IEC/IEEE Standard for Binary Floating-Point 1622 * Arithmetic. 
1623 */ 1624 1625 static FloatParts round_to_int(FloatParts a, int rmode, 1626 int scale, float_status *s) 1627 { 1628 switch (a.cls) { 1629 case float_class_qnan: 1630 case float_class_snan: 1631 return return_nan(a, s); 1632 1633 case float_class_zero: 1634 case float_class_inf: 1635 /* already "integral" */ 1636 break; 1637 1638 case float_class_normal: 1639 scale = MIN(MAX(scale, -0x10000), 0x10000); 1640 a.exp += scale; 1641 1642 if (a.exp >= DECOMPOSED_BINARY_POINT) { 1643 /* already integral */ 1644 break; 1645 } 1646 if (a.exp < 0) { 1647 bool one; 1648 /* all fractional */ 1649 s->float_exception_flags |= float_flag_inexact; 1650 switch (rmode) { 1651 case float_round_nearest_even: 1652 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT; 1653 break; 1654 case float_round_ties_away: 1655 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT; 1656 break; 1657 case float_round_to_zero: 1658 one = false; 1659 break; 1660 case float_round_up: 1661 one = !a.sign; 1662 break; 1663 case float_round_down: 1664 one = a.sign; 1665 break; 1666 default: 1667 g_assert_not_reached(); 1668 } 1669 1670 if (one) { 1671 a.frac = DECOMPOSED_IMPLICIT_BIT; 1672 a.exp = 0; 1673 } else { 1674 a.cls = float_class_zero; 1675 } 1676 } else { 1677 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp; 1678 uint64_t frac_lsbm1 = frac_lsb >> 1; 1679 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb; 1680 uint64_t rnd_mask = rnd_even_mask >> 1; 1681 uint64_t inc; 1682 1683 switch (rmode) { 1684 case float_round_nearest_even: 1685 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 1686 break; 1687 case float_round_ties_away: 1688 inc = frac_lsbm1; 1689 break; 1690 case float_round_to_zero: 1691 inc = 0; 1692 break; 1693 case float_round_up: 1694 inc = a.sign ? 0 : rnd_mask; 1695 break; 1696 case float_round_down: 1697 inc = a.sign ? 
rnd_mask : 0; 1698 break; 1699 default: 1700 g_assert_not_reached(); 1701 } 1702 1703 if (a.frac & rnd_mask) { 1704 s->float_exception_flags |= float_flag_inexact; 1705 a.frac += inc; 1706 a.frac &= ~rnd_mask; 1707 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 1708 a.frac >>= 1; 1709 a.exp++; 1710 } 1711 } 1712 } 1713 break; 1714 default: 1715 g_assert_not_reached(); 1716 } 1717 return a; 1718 } 1719 1720 float16 float16_round_to_int(float16 a, float_status *s) 1721 { 1722 FloatParts pa = float16_unpack_canonical(a, s); 1723 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 1724 return float16_round_pack_canonical(pr, s); 1725 } 1726 1727 float32 float32_round_to_int(float32 a, float_status *s) 1728 { 1729 FloatParts pa = float32_unpack_canonical(a, s); 1730 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 1731 return float32_round_pack_canonical(pr, s); 1732 } 1733 1734 float64 float64_round_to_int(float64 a, float_status *s) 1735 { 1736 FloatParts pa = float64_unpack_canonical(a, s); 1737 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 1738 return float64_round_pack_canonical(pr, s); 1739 } 1740 1741 /* 1742 * Returns the result of converting the floating-point value `a' to 1743 * the two's complement integer format. The conversion is performed 1744 * according to the IEC/IEEE Standard for Binary Floating-Point 1745 * Arithmetic---which means in particular that the conversion is 1746 * rounded according to the current rounding mode. If `a' is a NaN, 1747 * the largest positive integer is returned. Otherwise, if the 1748 * conversion overflows, the largest integer with the same sign as `a' 1749 * is returned. 
1750 */ 1751 1752 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale, 1753 int64_t min, int64_t max, 1754 float_status *s) 1755 { 1756 uint64_t r; 1757 int orig_flags = get_float_exception_flags(s); 1758 FloatParts p = round_to_int(in, rmode, scale, s); 1759 1760 switch (p.cls) { 1761 case float_class_snan: 1762 case float_class_qnan: 1763 s->float_exception_flags = orig_flags | float_flag_invalid; 1764 return max; 1765 case float_class_inf: 1766 s->float_exception_flags = orig_flags | float_flag_invalid; 1767 return p.sign ? min : max; 1768 case float_class_zero: 1769 return 0; 1770 case float_class_normal: 1771 if (p.exp < DECOMPOSED_BINARY_POINT) { 1772 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 1773 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 1774 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 1775 } else { 1776 r = UINT64_MAX; 1777 } 1778 if (p.sign) { 1779 if (r <= -(uint64_t) min) { 1780 return -r; 1781 } else { 1782 s->float_exception_flags = orig_flags | float_flag_invalid; 1783 return min; 1784 } 1785 } else { 1786 if (r <= max) { 1787 return r; 1788 } else { 1789 s->float_exception_flags = orig_flags | float_flag_invalid; 1790 return max; 1791 } 1792 } 1793 default: 1794 g_assert_not_reached(); 1795 } 1796 } 1797 1798 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale, 1799 float_status *s) 1800 { 1801 return round_to_int_and_pack(float16_unpack_canonical(a, s), 1802 rmode, scale, INT16_MIN, INT16_MAX, s); 1803 } 1804 1805 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale, 1806 float_status *s) 1807 { 1808 return round_to_int_and_pack(float16_unpack_canonical(a, s), 1809 rmode, scale, INT32_MIN, INT32_MAX, s); 1810 } 1811 1812 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale, 1813 float_status *s) 1814 { 1815 return round_to_int_and_pack(float16_unpack_canonical(a, s), 1816 rmode, scale, INT64_MIN, INT64_MAX, s); 1817 } 1818 1819 int16_t float32_to_int16_scalbn(float32 a, int 
rmode, int scale, 1820 float_status *s) 1821 { 1822 return round_to_int_and_pack(float32_unpack_canonical(a, s), 1823 rmode, scale, INT16_MIN, INT16_MAX, s); 1824 } 1825 1826 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale, 1827 float_status *s) 1828 { 1829 return round_to_int_and_pack(float32_unpack_canonical(a, s), 1830 rmode, scale, INT32_MIN, INT32_MAX, s); 1831 } 1832 1833 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale, 1834 float_status *s) 1835 { 1836 return round_to_int_and_pack(float32_unpack_canonical(a, s), 1837 rmode, scale, INT64_MIN, INT64_MAX, s); 1838 } 1839 1840 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale, 1841 float_status *s) 1842 { 1843 return round_to_int_and_pack(float64_unpack_canonical(a, s), 1844 rmode, scale, INT16_MIN, INT16_MAX, s); 1845 } 1846 1847 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale, 1848 float_status *s) 1849 { 1850 return round_to_int_and_pack(float64_unpack_canonical(a, s), 1851 rmode, scale, INT32_MIN, INT32_MAX, s); 1852 } 1853 1854 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale, 1855 float_status *s) 1856 { 1857 return round_to_int_and_pack(float64_unpack_canonical(a, s), 1858 rmode, scale, INT64_MIN, INT64_MAX, s); 1859 } 1860 1861 int16_t float16_to_int16(float16 a, float_status *s) 1862 { 1863 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 1864 } 1865 1866 int32_t float16_to_int32(float16 a, float_status *s) 1867 { 1868 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 1869 } 1870 1871 int64_t float16_to_int64(float16 a, float_status *s) 1872 { 1873 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 1874 } 1875 1876 int16_t float32_to_int16(float32 a, float_status *s) 1877 { 1878 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 1879 } 1880 1881 int32_t float32_to_int32(float32 a, float_status *s) 1882 { 1883 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, 
s); 1884 } 1885 1886 int64_t float32_to_int64(float32 a, float_status *s) 1887 { 1888 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 1889 } 1890 1891 int16_t float64_to_int16(float64 a, float_status *s) 1892 { 1893 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 1894 } 1895 1896 int32_t float64_to_int32(float64 a, float_status *s) 1897 { 1898 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 1899 } 1900 1901 int64_t float64_to_int64(float64 a, float_status *s) 1902 { 1903 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 1904 } 1905 1906 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s) 1907 { 1908 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s); 1909 } 1910 1911 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s) 1912 { 1913 return float16_to_int32_scalbn(a, float_round_to_zero, 0, s); 1914 } 1915 1916 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s) 1917 { 1918 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s); 1919 } 1920 1921 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s) 1922 { 1923 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s); 1924 } 1925 1926 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s) 1927 { 1928 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s); 1929 } 1930 1931 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s) 1932 { 1933 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s); 1934 } 1935 1936 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s) 1937 { 1938 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s); 1939 } 1940 1941 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s) 1942 { 1943 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s); 1944 } 1945 1946 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s) 1947 { 1948 return float64_to_int64_scalbn(a, 
float_round_to_zero, 0, s); 1949 } 1950 1951 /* 1952 * Returns the result of converting the floating-point value `a' to 1953 * the unsigned integer format. The conversion is performed according 1954 * to the IEC/IEEE Standard for Binary Floating-Point 1955 * Arithmetic---which means in particular that the conversion is 1956 * rounded according to the current rounding mode. If `a' is a NaN, 1957 * the largest unsigned integer is returned. Otherwise, if the 1958 * conversion overflows, the largest unsigned integer is returned. If 1959 * the 'a' is negative, the result is rounded and zero is returned; 1960 * values that do not round to zero will raise the inexact exception 1961 * flag. 1962 */ 1963 1964 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale, 1965 uint64_t max, float_status *s) 1966 { 1967 int orig_flags = get_float_exception_flags(s); 1968 FloatParts p = round_to_int(in, rmode, scale, s); 1969 uint64_t r; 1970 1971 switch (p.cls) { 1972 case float_class_snan: 1973 case float_class_qnan: 1974 s->float_exception_flags = orig_flags | float_flag_invalid; 1975 return max; 1976 case float_class_inf: 1977 s->float_exception_flags = orig_flags | float_flag_invalid; 1978 return p.sign ? 0 : max; 1979 case float_class_zero: 1980 return 0; 1981 case float_class_normal: 1982 if (p.sign) { 1983 s->float_exception_flags = orig_flags | float_flag_invalid; 1984 return 0; 1985 } 1986 1987 if (p.exp < DECOMPOSED_BINARY_POINT) { 1988 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 1989 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 1990 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 1991 } else { 1992 s->float_exception_flags = orig_flags | float_flag_invalid; 1993 return max; 1994 } 1995 1996 /* For uint64 this will never trip, but if p.exp is too large 1997 * to shift a decomposed fraction we shall have exited via the 1998 * 3rd leg above. 
1999 */ 2000 if (r > max) { 2001 s->float_exception_flags = orig_flags | float_flag_invalid; 2002 return max; 2003 } 2004 return r; 2005 default: 2006 g_assert_not_reached(); 2007 } 2008 } 2009 2010 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale, 2011 float_status *s) 2012 { 2013 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2014 rmode, scale, UINT16_MAX, s); 2015 } 2016 2017 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale, 2018 float_status *s) 2019 { 2020 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2021 rmode, scale, UINT32_MAX, s); 2022 } 2023 2024 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale, 2025 float_status *s) 2026 { 2027 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2028 rmode, scale, UINT64_MAX, s); 2029 } 2030 2031 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale, 2032 float_status *s) 2033 { 2034 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2035 rmode, scale, UINT16_MAX, s); 2036 } 2037 2038 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale, 2039 float_status *s) 2040 { 2041 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2042 rmode, scale, UINT32_MAX, s); 2043 } 2044 2045 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale, 2046 float_status *s) 2047 { 2048 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2049 rmode, scale, UINT64_MAX, s); 2050 } 2051 2052 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale, 2053 float_status *s) 2054 { 2055 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2056 rmode, scale, UINT16_MAX, s); 2057 } 2058 2059 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale, 2060 float_status *s) 2061 { 2062 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2063 rmode, scale, UINT32_MAX, s); 2064 } 2065 2066 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale, 2067 
float_status *s) 2068 { 2069 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2070 rmode, scale, UINT64_MAX, s); 2071 } 2072 2073 uint16_t float16_to_uint16(float16 a, float_status *s) 2074 { 2075 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2076 } 2077 2078 uint32_t float16_to_uint32(float16 a, float_status *s) 2079 { 2080 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2081 } 2082 2083 uint64_t float16_to_uint64(float16 a, float_status *s) 2084 { 2085 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2086 } 2087 2088 uint16_t float32_to_uint16(float32 a, float_status *s) 2089 { 2090 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2091 } 2092 2093 uint32_t float32_to_uint32(float32 a, float_status *s) 2094 { 2095 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2096 } 2097 2098 uint64_t float32_to_uint64(float32 a, float_status *s) 2099 { 2100 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2101 } 2102 2103 uint16_t float64_to_uint16(float64 a, float_status *s) 2104 { 2105 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2106 } 2107 2108 uint32_t float64_to_uint32(float64 a, float_status *s) 2109 { 2110 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2111 } 2112 2113 uint64_t float64_to_uint64(float64 a, float_status *s) 2114 { 2115 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2116 } 2117 2118 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s) 2119 { 2120 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2121 } 2122 2123 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s) 2124 { 2125 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2126 } 2127 2128 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s) 2129 { 2130 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2131 } 2132 2133 uint16_t 
float32_to_uint16_round_to_zero(float32 a, float_status *s) 2134 { 2135 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2136 } 2137 2138 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s) 2139 { 2140 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2141 } 2142 2143 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s) 2144 { 2145 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2146 } 2147 2148 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s) 2149 { 2150 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2151 } 2152 2153 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s) 2154 { 2155 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2156 } 2157 2158 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s) 2159 { 2160 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2161 } 2162 2163 /* 2164 * Integer to float conversions 2165 * 2166 * Returns the result of converting the two's complement integer `a' 2167 * to the floating-point format. The conversion is performed according 2168 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2169 */ 2170 2171 static FloatParts int_to_float(int64_t a, int scale, float_status *status) 2172 { 2173 FloatParts r = { .sign = false }; 2174 2175 if (a == 0) { 2176 r.cls = float_class_zero; 2177 } else { 2178 uint64_t f = a; 2179 int shift; 2180 2181 r.cls = float_class_normal; 2182 if (a < 0) { 2183 f = -f; 2184 r.sign = true; 2185 } 2186 shift = clz64(f) - 1; 2187 scale = MIN(MAX(scale, -0x10000), 0x10000); 2188 2189 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2190 r.frac = (shift < 0 ? 
DECOMPOSED_IMPLICIT_BIT : f << shift); 2191 } 2192 2193 return r; 2194 } 2195 2196 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status) 2197 { 2198 FloatParts pa = int_to_float(a, scale, status); 2199 return float16_round_pack_canonical(pa, status); 2200 } 2201 2202 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status) 2203 { 2204 return int64_to_float16_scalbn(a, scale, status); 2205 } 2206 2207 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status) 2208 { 2209 return int64_to_float16_scalbn(a, scale, status); 2210 } 2211 2212 float16 int64_to_float16(int64_t a, float_status *status) 2213 { 2214 return int64_to_float16_scalbn(a, 0, status); 2215 } 2216 2217 float16 int32_to_float16(int32_t a, float_status *status) 2218 { 2219 return int64_to_float16_scalbn(a, 0, status); 2220 } 2221 2222 float16 int16_to_float16(int16_t a, float_status *status) 2223 { 2224 return int64_to_float16_scalbn(a, 0, status); 2225 } 2226 2227 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status) 2228 { 2229 FloatParts pa = int_to_float(a, scale, status); 2230 return float32_round_pack_canonical(pa, status); 2231 } 2232 2233 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status) 2234 { 2235 return int64_to_float32_scalbn(a, scale, status); 2236 } 2237 2238 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status) 2239 { 2240 return int64_to_float32_scalbn(a, scale, status); 2241 } 2242 2243 float32 int64_to_float32(int64_t a, float_status *status) 2244 { 2245 return int64_to_float32_scalbn(a, 0, status); 2246 } 2247 2248 float32 int32_to_float32(int32_t a, float_status *status) 2249 { 2250 return int64_to_float32_scalbn(a, 0, status); 2251 } 2252 2253 float32 int16_to_float32(int16_t a, float_status *status) 2254 { 2255 return int64_to_float32_scalbn(a, 0, status); 2256 } 2257 2258 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status) 2259 { 
2260 FloatParts pa = int_to_float(a, scale, status); 2261 return float64_round_pack_canonical(pa, status); 2262 } 2263 2264 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status) 2265 { 2266 return int64_to_float64_scalbn(a, scale, status); 2267 } 2268 2269 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status) 2270 { 2271 return int64_to_float64_scalbn(a, scale, status); 2272 } 2273 2274 float64 int64_to_float64(int64_t a, float_status *status) 2275 { 2276 return int64_to_float64_scalbn(a, 0, status); 2277 } 2278 2279 float64 int32_to_float64(int32_t a, float_status *status) 2280 { 2281 return int64_to_float64_scalbn(a, 0, status); 2282 } 2283 2284 float64 int16_to_float64(int16_t a, float_status *status) 2285 { 2286 return int64_to_float64_scalbn(a, 0, status); 2287 } 2288 2289 2290 /* 2291 * Unsigned Integer to float conversions 2292 * 2293 * Returns the result of converting the unsigned integer `a' to the 2294 * floating-point format. The conversion is performed according to the 2295 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2296 */ 2297 2298 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status) 2299 { 2300 FloatParts r = { .sign = false }; 2301 2302 if (a == 0) { 2303 r.cls = float_class_zero; 2304 } else { 2305 scale = MIN(MAX(scale, -0x10000), 0x10000); 2306 r.cls = float_class_normal; 2307 if ((int64_t)a < 0) { 2308 r.exp = DECOMPOSED_BINARY_POINT + 1 + scale; 2309 shift64RightJamming(a, 1, &a); 2310 r.frac = a; 2311 } else { 2312 int shift = clz64(a) - 1; 2313 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2314 r.frac = a << shift; 2315 } 2316 } 2317 2318 return r; 2319 } 2320 2321 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status) 2322 { 2323 FloatParts pa = uint_to_float(a, scale, status); 2324 return float16_round_pack_canonical(pa, status); 2325 } 2326 2327 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status) 2328 { 2329 return uint64_to_float16_scalbn(a, scale, status); 2330 } 2331 2332 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status) 2333 { 2334 return uint64_to_float16_scalbn(a, scale, status); 2335 } 2336 2337 float16 uint64_to_float16(uint64_t a, float_status *status) 2338 { 2339 return uint64_to_float16_scalbn(a, 0, status); 2340 } 2341 2342 float16 uint32_to_float16(uint32_t a, float_status *status) 2343 { 2344 return uint64_to_float16_scalbn(a, 0, status); 2345 } 2346 2347 float16 uint16_to_float16(uint16_t a, float_status *status) 2348 { 2349 return uint64_to_float16_scalbn(a, 0, status); 2350 } 2351 2352 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status) 2353 { 2354 FloatParts pa = uint_to_float(a, scale, status); 2355 return float32_round_pack_canonical(pa, status); 2356 } 2357 2358 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status) 2359 { 2360 return uint64_to_float32_scalbn(a, scale, status); 2361 } 2362 2363 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status) 2364 { 2365 return 
uint64_to_float32_scalbn(a, scale, status); 2366 } 2367 2368 float32 uint64_to_float32(uint64_t a, float_status *status) 2369 { 2370 return uint64_to_float32_scalbn(a, 0, status); 2371 } 2372 2373 float32 uint32_to_float32(uint32_t a, float_status *status) 2374 { 2375 return uint64_to_float32_scalbn(a, 0, status); 2376 } 2377 2378 float32 uint16_to_float32(uint16_t a, float_status *status) 2379 { 2380 return uint64_to_float32_scalbn(a, 0, status); 2381 } 2382 2383 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status) 2384 { 2385 FloatParts pa = uint_to_float(a, scale, status); 2386 return float64_round_pack_canonical(pa, status); 2387 } 2388 2389 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status) 2390 { 2391 return uint64_to_float64_scalbn(a, scale, status); 2392 } 2393 2394 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status) 2395 { 2396 return uint64_to_float64_scalbn(a, scale, status); 2397 } 2398 2399 float64 uint64_to_float64(uint64_t a, float_status *status) 2400 { 2401 return uint64_to_float64_scalbn(a, 0, status); 2402 } 2403 2404 float64 uint32_to_float64(uint32_t a, float_status *status) 2405 { 2406 return uint64_to_float64_scalbn(a, 0, status); 2407 } 2408 2409 float64 uint16_to_float64(uint16_t a, float_status *status) 2410 { 2411 return uint64_to_float64_scalbn(a, 0, status); 2412 } 2413 2414 /* Float Min/Max */ 2415 /* min() and max() functions. These can't be implemented as 2416 * 'compare and pick one input' because that would mishandle 2417 * NaNs and +0 vs -0. 2418 * 2419 * minnum() and maxnum() functions. These are similar to the min() 2420 * and max() functions but if one of the arguments is a QNaN and 2421 * the other is numerical then the numerical argument is returned. 2422 * SNaNs will get quietened before being returned. 2423 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 2424 * and maxNum() operations. 
min() and max() are the typical min/max 2425 * semantics provided by many CPUs which predate that specification. 2426 * 2427 * minnummag() and maxnummag() functions correspond to minNumMag() 2428 * and minNumMag() from the IEEE-754 2008. 2429 */ 2430 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin, 2431 bool ieee, bool ismag, float_status *s) 2432 { 2433 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 2434 if (ieee) { 2435 /* Takes two floating-point values `a' and `b', one of 2436 * which is a NaN, and returns the appropriate NaN 2437 * result. If either `a' or `b' is a signaling NaN, 2438 * the invalid exception is raised. 2439 */ 2440 if (is_snan(a.cls) || is_snan(b.cls)) { 2441 return pick_nan(a, b, s); 2442 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 2443 return b; 2444 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 2445 return a; 2446 } 2447 } 2448 return pick_nan(a, b, s); 2449 } else { 2450 int a_exp, b_exp; 2451 2452 switch (a.cls) { 2453 case float_class_normal: 2454 a_exp = a.exp; 2455 break; 2456 case float_class_inf: 2457 a_exp = INT_MAX; 2458 break; 2459 case float_class_zero: 2460 a_exp = INT_MIN; 2461 break; 2462 default: 2463 g_assert_not_reached(); 2464 break; 2465 } 2466 switch (b.cls) { 2467 case float_class_normal: 2468 b_exp = b.exp; 2469 break; 2470 case float_class_inf: 2471 b_exp = INT_MAX; 2472 break; 2473 case float_class_zero: 2474 b_exp = INT_MIN; 2475 break; 2476 default: 2477 g_assert_not_reached(); 2478 break; 2479 } 2480 2481 if (ismag && (a_exp != b_exp || a.frac != b.frac)) { 2482 bool a_less = a_exp < b_exp; 2483 if (a_exp == b_exp) { 2484 a_less = a.frac < b.frac; 2485 } 2486 return a_less ^ ismin ? b : a; 2487 } 2488 2489 if (a.sign == b.sign) { 2490 bool a_less = a_exp < b_exp; 2491 if (a_exp == b_exp) { 2492 a_less = a.frac < b.frac; 2493 } 2494 return a.sign ^ a_less ^ ismin ? b : a; 2495 } else { 2496 return a.sign ^ ismin ? 
b : a; 2497 } 2498 } 2499 } 2500 2501 #define MINMAX(sz, name, ismin, isiee, ismag) \ 2502 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 2503 float_status *s) \ 2504 { \ 2505 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2506 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2507 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 2508 \ 2509 return float ## sz ## _round_pack_canonical(pr, s); \ 2510 } 2511 2512 MINMAX(16, min, true, false, false) 2513 MINMAX(16, minnum, true, true, false) 2514 MINMAX(16, minnummag, true, true, true) 2515 MINMAX(16, max, false, false, false) 2516 MINMAX(16, maxnum, false, true, false) 2517 MINMAX(16, maxnummag, false, true, true) 2518 2519 MINMAX(32, min, true, false, false) 2520 MINMAX(32, minnum, true, true, false) 2521 MINMAX(32, minnummag, true, true, true) 2522 MINMAX(32, max, false, false, false) 2523 MINMAX(32, maxnum, false, true, false) 2524 MINMAX(32, maxnummag, false, true, true) 2525 2526 MINMAX(64, min, true, false, false) 2527 MINMAX(64, minnum, true, true, false) 2528 MINMAX(64, minnummag, true, true, true) 2529 MINMAX(64, max, false, false, false) 2530 MINMAX(64, maxnum, false, true, false) 2531 MINMAX(64, maxnummag, false, true, true) 2532 2533 #undef MINMAX 2534 2535 /* Floating point compare */ 2536 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet, 2537 float_status *s) 2538 { 2539 if (is_nan(a.cls) || is_nan(b.cls)) { 2540 if (!is_quiet || 2541 a.cls == float_class_snan || 2542 b.cls == float_class_snan) { 2543 s->float_exception_flags |= float_flag_invalid; 2544 } 2545 return float_relation_unordered; 2546 } 2547 2548 if (a.cls == float_class_zero) { 2549 if (b.cls == float_class_zero) { 2550 return float_relation_equal; 2551 } 2552 return b.sign ? float_relation_greater : float_relation_less; 2553 } else if (b.cls == float_class_zero) { 2554 return a.sign ? 
float_relation_less : float_relation_greater; 2555 } 2556 2557 /* The only really important thing about infinity is its sign. If 2558 * both are infinities the sign marks the smallest of the two. 2559 */ 2560 if (a.cls == float_class_inf) { 2561 if ((b.cls == float_class_inf) && (a.sign == b.sign)) { 2562 return float_relation_equal; 2563 } 2564 return a.sign ? float_relation_less : float_relation_greater; 2565 } else if (b.cls == float_class_inf) { 2566 return b.sign ? float_relation_greater : float_relation_less; 2567 } 2568 2569 if (a.sign != b.sign) { 2570 return a.sign ? float_relation_less : float_relation_greater; 2571 } 2572 2573 if (a.exp == b.exp) { 2574 if (a.frac == b.frac) { 2575 return float_relation_equal; 2576 } 2577 if (a.sign) { 2578 return a.frac > b.frac ? 2579 float_relation_less : float_relation_greater; 2580 } else { 2581 return a.frac > b.frac ? 2582 float_relation_greater : float_relation_less; 2583 } 2584 } else { 2585 if (a.sign) { 2586 return a.exp > b.exp ? float_relation_less : float_relation_greater; 2587 } else { 2588 return a.exp > b.exp ? float_relation_greater : float_relation_less; 2589 } 2590 } 2591 } 2592 2593 #define COMPARE(sz) \ 2594 int float ## sz ## _compare(float ## sz a, float ## sz b, \ 2595 float_status *s) \ 2596 { \ 2597 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2598 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2599 return compare_floats(pa, pb, false, s); \ 2600 } \ 2601 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \ 2602 float_status *s) \ 2603 { \ 2604 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2605 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2606 return compare_floats(pa, pb, true, s); \ 2607 } 2608 2609 COMPARE(16) 2610 COMPARE(32) 2611 COMPARE(64) 2612 2613 #undef COMPARE 2614 2615 /* Multiply A by 2 raised to the power N. 
*/ 2616 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s) 2617 { 2618 if (unlikely(is_nan(a.cls))) { 2619 return return_nan(a, s); 2620 } 2621 if (a.cls == float_class_normal) { 2622 /* The largest float type (even though not supported by FloatParts) 2623 * is float128, which has a 15 bit exponent. Bounding N to 16 bits 2624 * still allows rounding to infinity, without allowing overflow 2625 * within the int32_t that backs FloatParts.exp. 2626 */ 2627 n = MIN(MAX(n, -0x10000), 0x10000); 2628 a.exp += n; 2629 } 2630 return a; 2631 } 2632 2633 float16 float16_scalbn(float16 a, int n, float_status *status) 2634 { 2635 FloatParts pa = float16_unpack_canonical(a, status); 2636 FloatParts pr = scalbn_decomposed(pa, n, status); 2637 return float16_round_pack_canonical(pr, status); 2638 } 2639 2640 float32 float32_scalbn(float32 a, int n, float_status *status) 2641 { 2642 FloatParts pa = float32_unpack_canonical(a, status); 2643 FloatParts pr = scalbn_decomposed(pa, n, status); 2644 return float32_round_pack_canonical(pr, status); 2645 } 2646 2647 float64 float64_scalbn(float64 a, int n, float_status *status) 2648 { 2649 FloatParts pa = float64_unpack_canonical(a, status); 2650 FloatParts pr = scalbn_decomposed(pa, n, status); 2651 return float64_round_pack_canonical(pr, status); 2652 } 2653 2654 /* 2655 * Square Root 2656 * 2657 * The old softfloat code did an approximation step before zeroing in 2658 * on the final result. However for simpleness we just compute the 2659 * square root by iterating down from the implicit bit to enough extra 2660 * bits to ensure we get a correctly rounded result. 2661 * 2662 * This does mean however the calculation is slower than before, 2663 * especially for 64 bit floats. 
2664 */ 2665 2666 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p) 2667 { 2668 uint64_t a_frac, r_frac, s_frac; 2669 int bit, last_bit; 2670 2671 if (is_nan(a.cls)) { 2672 return return_nan(a, s); 2673 } 2674 if (a.cls == float_class_zero) { 2675 return a; /* sqrt(+-0) = +-0 */ 2676 } 2677 if (a.sign) { 2678 s->float_exception_flags |= float_flag_invalid; 2679 return parts_default_nan(s); 2680 } 2681 if (a.cls == float_class_inf) { 2682 return a; /* sqrt(+inf) = +inf */ 2683 } 2684 2685 assert(a.cls == float_class_normal); 2686 2687 /* We need two overflow bits at the top. Adding room for that is a 2688 * right shift. If the exponent is odd, we can discard the low bit 2689 * by multiplying the fraction by 2; that's a left shift. Combine 2690 * those and we shift right if the exponent is even. 2691 */ 2692 a_frac = a.frac; 2693 if (!(a.exp & 1)) { 2694 a_frac >>= 1; 2695 } 2696 a.exp >>= 1; 2697 2698 /* Bit-by-bit computation of sqrt. */ 2699 r_frac = 0; 2700 s_frac = 0; 2701 2702 /* Iterate from implicit bit down to the 3 extra bits to compute a 2703 * properly rounded result. Remember we've inserted one more bit 2704 * at the top, so these positions are one less. 2705 */ 2706 bit = DECOMPOSED_BINARY_POINT - 1; 2707 last_bit = MAX(p->frac_shift - 4, 0); 2708 do { 2709 uint64_t q = 1ULL << bit; 2710 uint64_t t_frac = s_frac + q; 2711 if (t_frac <= a_frac) { 2712 s_frac = t_frac + q; 2713 a_frac -= t_frac; 2714 r_frac += q; 2715 } 2716 a_frac <<= 1; 2717 } while (--bit >= last_bit); 2718 2719 /* Undo the right shift done above. If there is any remaining 2720 * fraction, the result is inexact. Set the sticky bit. 
2721 */ 2722 a.frac = (r_frac << 1) + (a_frac != 0); 2723 2724 return a; 2725 } 2726 2727 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status) 2728 { 2729 FloatParts pa = float16_unpack_canonical(a, status); 2730 FloatParts pr = sqrt_float(pa, status, &float16_params); 2731 return float16_round_pack_canonical(pr, status); 2732 } 2733 2734 float32 QEMU_FLATTEN float32_sqrt(float32 a, float_status *status) 2735 { 2736 FloatParts pa = float32_unpack_canonical(a, status); 2737 FloatParts pr = sqrt_float(pa, status, &float32_params); 2738 return float32_round_pack_canonical(pr, status); 2739 } 2740 2741 float64 QEMU_FLATTEN float64_sqrt(float64 a, float_status *status) 2742 { 2743 FloatParts pa = float64_unpack_canonical(a, status); 2744 FloatParts pr = sqrt_float(pa, status, &float64_params); 2745 return float64_round_pack_canonical(pr, status); 2746 } 2747 2748 /*---------------------------------------------------------------------------- 2749 | The pattern for a default generated NaN. 2750 *----------------------------------------------------------------------------*/ 2751 2752 float16 float16_default_nan(float_status *status) 2753 { 2754 FloatParts p = parts_default_nan(status); 2755 p.frac >>= float16_params.frac_shift; 2756 return float16_pack_raw(p); 2757 } 2758 2759 float32 float32_default_nan(float_status *status) 2760 { 2761 FloatParts p = parts_default_nan(status); 2762 p.frac >>= float32_params.frac_shift; 2763 return float32_pack_raw(p); 2764 } 2765 2766 float64 float64_default_nan(float_status *status) 2767 { 2768 FloatParts p = parts_default_nan(status); 2769 p.frac >>= float64_params.frac_shift; 2770 return float64_pack_raw(p); 2771 } 2772 2773 float128 float128_default_nan(float_status *status) 2774 { 2775 FloatParts p = parts_default_nan(status); 2776 float128 r; 2777 2778 /* Extrapolate from the choices made by parts_default_nan to fill 2779 * in the quad-floating format. 
If the low bit is set, assume we 2780 * want to set all non-snan bits. 2781 */ 2782 r.low = -(p.frac & 1); 2783 r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48); 2784 r.high |= LIT64(0x7FFF000000000000); 2785 r.high |= (uint64_t)p.sign << 63; 2786 2787 return r; 2788 } 2789 2790 /*---------------------------------------------------------------------------- 2791 | Returns a quiet NaN from a signalling NaN for the floating point value `a'. 2792 *----------------------------------------------------------------------------*/ 2793 2794 float16 float16_silence_nan(float16 a, float_status *status) 2795 { 2796 FloatParts p = float16_unpack_raw(a); 2797 p.frac <<= float16_params.frac_shift; 2798 p = parts_silence_nan(p, status); 2799 p.frac >>= float16_params.frac_shift; 2800 return float16_pack_raw(p); 2801 } 2802 2803 float32 float32_silence_nan(float32 a, float_status *status) 2804 { 2805 FloatParts p = float32_unpack_raw(a); 2806 p.frac <<= float32_params.frac_shift; 2807 p = parts_silence_nan(p, status); 2808 p.frac >>= float32_params.frac_shift; 2809 return float32_pack_raw(p); 2810 } 2811 2812 float64 float64_silence_nan(float64 a, float_status *status) 2813 { 2814 FloatParts p = float64_unpack_raw(a); 2815 p.frac <<= float64_params.frac_shift; 2816 p = parts_silence_nan(p, status); 2817 p.frac >>= float64_params.frac_shift; 2818 return float64_pack_raw(p); 2819 } 2820 2821 /*---------------------------------------------------------------------------- 2822 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 2823 | and 7, and returns the properly rounded 32-bit integer corresponding to the 2824 | input. If `zSign' is 1, the input is negated before being converted to an 2825 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 2826 | is simply rounded to an integer, with the inexact exception raised if the 2827 | input cannot be represented exactly as an integer. 
However, if the fixed- 2828 | point input is too large, the invalid exception is raised and the largest 2829 | positive or negative integer is returned. 2830 *----------------------------------------------------------------------------*/ 2831 2832 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status) 2833 { 2834 int8_t roundingMode; 2835 flag roundNearestEven; 2836 int8_t roundIncrement, roundBits; 2837 int32_t z; 2838 2839 roundingMode = status->float_rounding_mode; 2840 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2841 switch (roundingMode) { 2842 case float_round_nearest_even: 2843 case float_round_ties_away: 2844 roundIncrement = 0x40; 2845 break; 2846 case float_round_to_zero: 2847 roundIncrement = 0; 2848 break; 2849 case float_round_up: 2850 roundIncrement = zSign ? 0 : 0x7f; 2851 break; 2852 case float_round_down: 2853 roundIncrement = zSign ? 0x7f : 0; 2854 break; 2855 default: 2856 abort(); 2857 } 2858 roundBits = absZ & 0x7F; 2859 absZ = ( absZ + roundIncrement )>>7; 2860 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 2861 z = absZ; 2862 if ( zSign ) z = - z; 2863 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 2864 float_raise(float_flag_invalid, status); 2865 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 2866 } 2867 if (roundBits) { 2868 status->float_exception_flags |= float_flag_inexact; 2869 } 2870 return z; 2871 2872 } 2873 2874 /*---------------------------------------------------------------------------- 2875 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 2876 | `absZ1', with binary point between bits 63 and 64 (between the input words), 2877 | and returns the properly rounded 64-bit integer corresponding to the input. 2878 | If `zSign' is 1, the input is negated before being converted to an integer. 
2879 | Ordinarily, the fixed-point input is simply rounded to an integer, with 2880 | the inexact exception raised if the input cannot be represented exactly as 2881 | an integer. However, if the fixed-point input is too large, the invalid 2882 | exception is raised and the largest positive or negative integer is 2883 | returned. 2884 *----------------------------------------------------------------------------*/ 2885 2886 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 2887 float_status *status) 2888 { 2889 int8_t roundingMode; 2890 flag roundNearestEven, increment; 2891 int64_t z; 2892 2893 roundingMode = status->float_rounding_mode; 2894 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2895 switch (roundingMode) { 2896 case float_round_nearest_even: 2897 case float_round_ties_away: 2898 increment = ((int64_t) absZ1 < 0); 2899 break; 2900 case float_round_to_zero: 2901 increment = 0; 2902 break; 2903 case float_round_up: 2904 increment = !zSign && absZ1; 2905 break; 2906 case float_round_down: 2907 increment = zSign && absZ1; 2908 break; 2909 default: 2910 abort(); 2911 } 2912 if ( increment ) { 2913 ++absZ0; 2914 if ( absZ0 == 0 ) goto overflow; 2915 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 2916 } 2917 z = absZ0; 2918 if ( zSign ) z = - z; 2919 if ( z && ( ( z < 0 ) ^ zSign ) ) { 2920 overflow: 2921 float_raise(float_flag_invalid, status); 2922 return 2923 zSign ? 
(int64_t) LIT64( 0x8000000000000000 ) 2924 : LIT64( 0x7FFFFFFFFFFFFFFF ); 2925 } 2926 if (absZ1) { 2927 status->float_exception_flags |= float_flag_inexact; 2928 } 2929 return z; 2930 2931 } 2932 2933 /*---------------------------------------------------------------------------- 2934 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 2935 | `absZ1', with binary point between bits 63 and 64 (between the input words), 2936 | and returns the properly rounded 64-bit unsigned integer corresponding to the 2937 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 2938 | with the inexact exception raised if the input cannot be represented exactly 2939 | as an integer. However, if the fixed-point input is too large, the invalid 2940 | exception is raised and the largest unsigned integer is returned. 2941 *----------------------------------------------------------------------------*/ 2942 2943 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0, 2944 uint64_t absZ1, float_status *status) 2945 { 2946 int8_t roundingMode; 2947 flag roundNearestEven, increment; 2948 2949 roundingMode = status->float_rounding_mode; 2950 roundNearestEven = (roundingMode == float_round_nearest_even); 2951 switch (roundingMode) { 2952 case float_round_nearest_even: 2953 case float_round_ties_away: 2954 increment = ((int64_t)absZ1 < 0); 2955 break; 2956 case float_round_to_zero: 2957 increment = 0; 2958 break; 2959 case float_round_up: 2960 increment = !zSign && absZ1; 2961 break; 2962 case float_round_down: 2963 increment = zSign && absZ1; 2964 break; 2965 default: 2966 abort(); 2967 } 2968 if (increment) { 2969 ++absZ0; 2970 if (absZ0 == 0) { 2971 float_raise(float_flag_invalid, status); 2972 return LIT64(0xFFFFFFFFFFFFFFFF); 2973 } 2974 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 2975 } 2976 2977 if (zSign && absZ0) { 2978 float_raise(float_flag_invalid, status); 2979 return 0; 2980 } 2981 2982 if (absZ1) { 2983 
status->float_exception_flags |= float_flag_inexact; 2984 } 2985 return absZ0; 2986 } 2987 2988 /*---------------------------------------------------------------------------- 2989 | If `a' is denormal and we are in flush-to-zero mode then set the 2990 | input-denormal exception and return zero. Otherwise just return the value. 2991 *----------------------------------------------------------------------------*/ 2992 float32 float32_squash_input_denormal(float32 a, float_status *status) 2993 { 2994 if (status->flush_inputs_to_zero) { 2995 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 2996 float_raise(float_flag_input_denormal, status); 2997 return make_float32(float32_val(a) & 0x80000000); 2998 } 2999 } 3000 return a; 3001 } 3002 3003 /*---------------------------------------------------------------------------- 3004 | Normalizes the subnormal single-precision floating-point value represented 3005 | by the denormalized significand `aSig'. The normalized exponent and 3006 | significand are stored at the locations pointed to by `zExpPtr' and 3007 | `zSigPtr', respectively. 3008 *----------------------------------------------------------------------------*/ 3009 3010 static void 3011 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 3012 { 3013 int8_t shiftCount; 3014 3015 shiftCount = clz32(aSig) - 8; 3016 *zSigPtr = aSig<<shiftCount; 3017 *zExpPtr = 1 - shiftCount; 3018 3019 } 3020 3021 /*---------------------------------------------------------------------------- 3022 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3023 | and significand `zSig', and returns the proper single-precision floating- 3024 | point value corresponding to the abstract input. Ordinarily, the abstract 3025 | value is simply rounded and packed into the single-precision format, with 3026 | the inexact exception raised if the abstract input cannot be represented 3027 | exactly. 
However, if the abstract value is too large, the overflow and 3028 | inexact exceptions are raised and an infinity or maximal finite value is 3029 | returned. If the abstract value is too small, the input value is rounded to 3030 | a subnormal number, and the underflow and inexact exceptions are raised if 3031 | the abstract input cannot be represented exactly as a subnormal single- 3032 | precision floating-point number. 3033 | The input significand `zSig' has its binary point between bits 30 3034 | and 29, which is 7 bits to the left of the usual location. This shifted 3035 | significand must be normalized or smaller. If `zSig' is not normalized, 3036 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3037 | and it must not require rounding. In the usual case that `zSig' is 3038 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3039 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3040 | Binary Floating-Point Arithmetic. 3041 *----------------------------------------------------------------------------*/ 3042 3043 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 3044 float_status *status) 3045 { 3046 int8_t roundingMode; 3047 flag roundNearestEven; 3048 int8_t roundIncrement, roundBits; 3049 flag isTiny; 3050 3051 roundingMode = status->float_rounding_mode; 3052 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3053 switch (roundingMode) { 3054 case float_round_nearest_even: 3055 case float_round_ties_away: 3056 roundIncrement = 0x40; 3057 break; 3058 case float_round_to_zero: 3059 roundIncrement = 0; 3060 break; 3061 case float_round_up: 3062 roundIncrement = zSign ? 0 : 0x7f; 3063 break; 3064 case float_round_down: 3065 roundIncrement = zSign ? 
0x7f : 0; 3066 break; 3067 default: 3068 abort(); 3069 break; 3070 } 3071 roundBits = zSig & 0x7F; 3072 if ( 0xFD <= (uint16_t) zExp ) { 3073 if ( ( 0xFD < zExp ) 3074 || ( ( zExp == 0xFD ) 3075 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 3076 ) { 3077 float_raise(float_flag_overflow | float_flag_inexact, status); 3078 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 3079 } 3080 if ( zExp < 0 ) { 3081 if (status->flush_to_zero) { 3082 float_raise(float_flag_output_denormal, status); 3083 return packFloat32(zSign, 0, 0); 3084 } 3085 isTiny = 3086 (status->float_detect_tininess 3087 == float_tininess_before_rounding) 3088 || ( zExp < -1 ) 3089 || ( zSig + roundIncrement < 0x80000000 ); 3090 shift32RightJamming( zSig, - zExp, &zSig ); 3091 zExp = 0; 3092 roundBits = zSig & 0x7F; 3093 if (isTiny && roundBits) { 3094 float_raise(float_flag_underflow, status); 3095 } 3096 } 3097 } 3098 if (roundBits) { 3099 status->float_exception_flags |= float_flag_inexact; 3100 } 3101 zSig = ( zSig + roundIncrement )>>7; 3102 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 3103 if ( zSig == 0 ) zExp = 0; 3104 return packFloat32( zSign, zExp, zSig ); 3105 3106 } 3107 3108 /*---------------------------------------------------------------------------- 3109 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3110 | and significand `zSig', and returns the proper single-precision floating- 3111 | point value corresponding to the abstract input. This routine is just like 3112 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 3113 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 3114 | floating-point exponent. 
3115 *----------------------------------------------------------------------------*/ 3116 3117 static float32 3118 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 3119 float_status *status) 3120 { 3121 int8_t shiftCount; 3122 3123 shiftCount = clz32(zSig) - 1; 3124 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 3125 status); 3126 3127 } 3128 3129 /*---------------------------------------------------------------------------- 3130 | If `a' is denormal and we are in flush-to-zero mode then set the 3131 | input-denormal exception and return zero. Otherwise just return the value. 3132 *----------------------------------------------------------------------------*/ 3133 float64 float64_squash_input_denormal(float64 a, float_status *status) 3134 { 3135 if (status->flush_inputs_to_zero) { 3136 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 3137 float_raise(float_flag_input_denormal, status); 3138 return make_float64(float64_val(a) & (1ULL << 63)); 3139 } 3140 } 3141 return a; 3142 } 3143 3144 /*---------------------------------------------------------------------------- 3145 | Normalizes the subnormal double-precision floating-point value represented 3146 | by the denormalized significand `aSig'. The normalized exponent and 3147 | significand are stored at the locations pointed to by `zExpPtr' and 3148 | `zSigPtr', respectively. 3149 *----------------------------------------------------------------------------*/ 3150 3151 static void 3152 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 3153 { 3154 int8_t shiftCount; 3155 3156 shiftCount = clz64(aSig) - 11; 3157 *zSigPtr = aSig<<shiftCount; 3158 *zExpPtr = 1 - shiftCount; 3159 3160 } 3161 3162 /*---------------------------------------------------------------------------- 3163 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3164 | double-precision floating-point value, returning the result. 
After being 3165 | shifted into the proper positions, the three fields are simply added 3166 | together to form the result. This means that any integer portion of `zSig' 3167 | will be added into the exponent. Since a properly normalized significand 3168 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3169 | than the desired result exponent whenever `zSig' is a complete, normalized 3170 | significand. 3171 *----------------------------------------------------------------------------*/ 3172 3173 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 3174 { 3175 3176 return make_float64( 3177 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 3178 3179 } 3180 3181 /*---------------------------------------------------------------------------- 3182 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3183 | and significand `zSig', and returns the proper double-precision floating- 3184 | point value corresponding to the abstract input. Ordinarily, the abstract 3185 | value is simply rounded and packed into the double-precision format, with 3186 | the inexact exception raised if the abstract input cannot be represented 3187 | exactly. However, if the abstract value is too large, the overflow and 3188 | inexact exceptions are raised and an infinity or maximal finite value is 3189 | returned. If the abstract value is too small, the input value is rounded to 3190 | a subnormal number, and the underflow and inexact exceptions are raised if 3191 | the abstract input cannot be represented exactly as a subnormal double- 3192 | precision floating-point number. 3193 | The input significand `zSig' has its binary point between bits 62 3194 | and 61, which is 10 bits to the left of the usual location. This shifted 3195 | significand must be normalized or smaller. 
If `zSig' is not normalized, 3196 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3197 | and it must not require rounding. In the usual case that `zSig' is 3198 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3199 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3200 | Binary Floating-Point Arithmetic. 3201 *----------------------------------------------------------------------------*/ 3202 3203 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 3204 float_status *status) 3205 { 3206 int8_t roundingMode; 3207 flag roundNearestEven; 3208 int roundIncrement, roundBits; 3209 flag isTiny; 3210 3211 roundingMode = status->float_rounding_mode; 3212 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3213 switch (roundingMode) { 3214 case float_round_nearest_even: 3215 case float_round_ties_away: 3216 roundIncrement = 0x200; 3217 break; 3218 case float_round_to_zero: 3219 roundIncrement = 0; 3220 break; 3221 case float_round_up: 3222 roundIncrement = zSign ? 0 : 0x3ff; 3223 break; 3224 case float_round_down: 3225 roundIncrement = zSign ? 0x3ff : 0; 3226 break; 3227 case float_round_to_odd: 3228 roundIncrement = (zSig & 0x400) ? 
0 : 0x3ff; 3229 break; 3230 default: 3231 abort(); 3232 } 3233 roundBits = zSig & 0x3FF; 3234 if ( 0x7FD <= (uint16_t) zExp ) { 3235 if ( ( 0x7FD < zExp ) 3236 || ( ( zExp == 0x7FD ) 3237 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 3238 ) { 3239 bool overflow_to_inf = roundingMode != float_round_to_odd && 3240 roundIncrement != 0; 3241 float_raise(float_flag_overflow | float_flag_inexact, status); 3242 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 3243 } 3244 if ( zExp < 0 ) { 3245 if (status->flush_to_zero) { 3246 float_raise(float_flag_output_denormal, status); 3247 return packFloat64(zSign, 0, 0); 3248 } 3249 isTiny = 3250 (status->float_detect_tininess 3251 == float_tininess_before_rounding) 3252 || ( zExp < -1 ) 3253 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 3254 shift64RightJamming( zSig, - zExp, &zSig ); 3255 zExp = 0; 3256 roundBits = zSig & 0x3FF; 3257 if (isTiny && roundBits) { 3258 float_raise(float_flag_underflow, status); 3259 } 3260 if (roundingMode == float_round_to_odd) { 3261 /* 3262 * For round-to-odd case, the roundIncrement depends on 3263 * zSig which just changed. 3264 */ 3265 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 3266 } 3267 } 3268 } 3269 if (roundBits) { 3270 status->float_exception_flags |= float_flag_inexact; 3271 } 3272 zSig = ( zSig + roundIncrement )>>10; 3273 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 3274 if ( zSig == 0 ) zExp = 0; 3275 return packFloat64( zSign, zExp, zSig ); 3276 3277 } 3278 3279 /*---------------------------------------------------------------------------- 3280 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3281 | and significand `zSig', and returns the proper double-precision floating- 3282 | point value corresponding to the abstract input. This routine is just like 3283 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 
3284 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 3285 | floating-point exponent. 3286 *----------------------------------------------------------------------------*/ 3287 3288 static float64 3289 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 3290 float_status *status) 3291 { 3292 int8_t shiftCount; 3293 3294 shiftCount = clz64(zSig) - 1; 3295 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 3296 status); 3297 3298 } 3299 3300 /*---------------------------------------------------------------------------- 3301 | Normalizes the subnormal extended double-precision floating-point value 3302 | represented by the denormalized significand `aSig'. The normalized exponent 3303 | and significand are stored at the locations pointed to by `zExpPtr' and 3304 | `zSigPtr', respectively. 3305 *----------------------------------------------------------------------------*/ 3306 3307 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 3308 uint64_t *zSigPtr) 3309 { 3310 int8_t shiftCount; 3311 3312 shiftCount = clz64(aSig); 3313 *zSigPtr = aSig<<shiftCount; 3314 *zExpPtr = 1 - shiftCount; 3315 } 3316 3317 /*---------------------------------------------------------------------------- 3318 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3319 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 3320 | and returns the proper extended double-precision floating-point value 3321 | corresponding to the abstract input. Ordinarily, the abstract value is 3322 | rounded and packed into the extended double-precision format, with the 3323 | inexact exception raised if the abstract input cannot be represented 3324 | exactly. However, if the abstract value is too large, the overflow and 3325 | inexact exceptions are raised and an infinity or maximal finite value is 3326 | returned. 
If the abstract value is too small, the input value is rounded to 3327 | a subnormal number, and the underflow and inexact exceptions are raised if 3328 | the abstract input cannot be represented exactly as a subnormal extended 3329 | double-precision floating-point number. 3330 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 3331 | number of bits as single or double precision, respectively. Otherwise, the 3332 | result is rounded to the full precision of the extended double-precision 3333 | format. 3334 | The input significand must be normalized or smaller. If the input 3335 | significand is not normalized, `zExp' must be 0; in that case, the result 3336 | returned is a subnormal number, and it must not require rounding. The 3337 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 3338 | Floating-Point Arithmetic. 3339 *----------------------------------------------------------------------------*/ 3340 3341 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign, 3342 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 3343 float_status *status) 3344 { 3345 int8_t roundingMode; 3346 flag roundNearestEven, increment, isTiny; 3347 int64_t roundIncrement, roundMask, roundBits; 3348 3349 roundingMode = status->float_rounding_mode; 3350 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3351 if ( roundingPrecision == 80 ) goto precision80; 3352 if ( roundingPrecision == 64 ) { 3353 roundIncrement = LIT64( 0x0000000000000400 ); 3354 roundMask = LIT64( 0x00000000000007FF ); 3355 } 3356 else if ( roundingPrecision == 32 ) { 3357 roundIncrement = LIT64( 0x0000008000000000 ); 3358 roundMask = LIT64( 0x000000FFFFFFFFFF ); 3359 } 3360 else { 3361 goto precision80; 3362 } 3363 zSig0 |= ( zSig1 != 0 ); 3364 switch (roundingMode) { 3365 case float_round_nearest_even: 3366 case float_round_ties_away: 3367 break; 3368 case float_round_to_zero: 3369 roundIncrement = 0; 3370 break; 3371 case float_round_up: 3372 
roundIncrement = zSign ? 0 : roundMask; 3373 break; 3374 case float_round_down: 3375 roundIncrement = zSign ? roundMask : 0; 3376 break; 3377 default: 3378 abort(); 3379 } 3380 roundBits = zSig0 & roundMask; 3381 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 3382 if ( ( 0x7FFE < zExp ) 3383 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 3384 ) { 3385 goto overflow; 3386 } 3387 if ( zExp <= 0 ) { 3388 if (status->flush_to_zero) { 3389 float_raise(float_flag_output_denormal, status); 3390 return packFloatx80(zSign, 0, 0); 3391 } 3392 isTiny = 3393 (status->float_detect_tininess 3394 == float_tininess_before_rounding) 3395 || ( zExp < 0 ) 3396 || ( zSig0 <= zSig0 + roundIncrement ); 3397 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 3398 zExp = 0; 3399 roundBits = zSig0 & roundMask; 3400 if (isTiny && roundBits) { 3401 float_raise(float_flag_underflow, status); 3402 } 3403 if (roundBits) { 3404 status->float_exception_flags |= float_flag_inexact; 3405 } 3406 zSig0 += roundIncrement; 3407 if ( (int64_t) zSig0 < 0 ) zExp = 1; 3408 roundIncrement = roundMask + 1; 3409 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 3410 roundMask |= roundIncrement; 3411 } 3412 zSig0 &= ~ roundMask; 3413 return packFloatx80( zSign, zExp, zSig0 ); 3414 } 3415 } 3416 if (roundBits) { 3417 status->float_exception_flags |= float_flag_inexact; 3418 } 3419 zSig0 += roundIncrement; 3420 if ( zSig0 < roundIncrement ) { 3421 ++zExp; 3422 zSig0 = LIT64( 0x8000000000000000 ); 3423 } 3424 roundIncrement = roundMask + 1; 3425 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 3426 roundMask |= roundIncrement; 3427 } 3428 zSig0 &= ~ roundMask; 3429 if ( zSig0 == 0 ) zExp = 0; 3430 return packFloatx80( zSign, zExp, zSig0 ); 3431 precision80: 3432 switch (roundingMode) { 3433 case float_round_nearest_even: 3434 case float_round_ties_away: 3435 increment = ((int64_t)zSig1 < 0); 3436 break; 3437 case float_round_to_zero: 3438 increment = 0; 3439 break; 3440 case 
float_round_up: 3441 increment = !zSign && zSig1; 3442 break; 3443 case float_round_down: 3444 increment = zSign && zSig1; 3445 break; 3446 default: 3447 abort(); 3448 } 3449 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 3450 if ( ( 0x7FFE < zExp ) 3451 || ( ( zExp == 0x7FFE ) 3452 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 3453 && increment 3454 ) 3455 ) { 3456 roundMask = 0; 3457 overflow: 3458 float_raise(float_flag_overflow | float_flag_inexact, status); 3459 if ( ( roundingMode == float_round_to_zero ) 3460 || ( zSign && ( roundingMode == float_round_up ) ) 3461 || ( ! zSign && ( roundingMode == float_round_down ) ) 3462 ) { 3463 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 3464 } 3465 return packFloatx80(zSign, 3466 floatx80_infinity_high, 3467 floatx80_infinity_low); 3468 } 3469 if ( zExp <= 0 ) { 3470 isTiny = 3471 (status->float_detect_tininess 3472 == float_tininess_before_rounding) 3473 || ( zExp < 0 ) 3474 || ! increment 3475 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 3476 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 3477 zExp = 0; 3478 if (isTiny && zSig1) { 3479 float_raise(float_flag_underflow, status); 3480 } 3481 if (zSig1) { 3482 status->float_exception_flags |= float_flag_inexact; 3483 } 3484 switch (roundingMode) { 3485 case float_round_nearest_even: 3486 case float_round_ties_away: 3487 increment = ((int64_t)zSig1 < 0); 3488 break; 3489 case float_round_to_zero: 3490 increment = 0; 3491 break; 3492 case float_round_up: 3493 increment = !zSign && zSig1; 3494 break; 3495 case float_round_down: 3496 increment = zSign && zSig1; 3497 break; 3498 default: 3499 abort(); 3500 } 3501 if ( increment ) { 3502 ++zSig0; 3503 zSig0 &= 3504 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 3505 if ( (int64_t) zSig0 < 0 ) zExp = 1; 3506 } 3507 return packFloatx80( zSign, zExp, zSig0 ); 3508 } 3509 } 3510 if (zSig1) { 3511 status->float_exception_flags |= float_flag_inexact; 3512 } 3513 if ( increment ) { 3514 ++zSig0; 3515 
if ( zSig0 == 0 ) { 3516 ++zExp; 3517 zSig0 = LIT64( 0x8000000000000000 ); 3518 } 3519 else { 3520 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 3521 } 3522 } 3523 else { 3524 if ( zSig0 == 0 ) zExp = 0; 3525 } 3526 return packFloatx80( zSign, zExp, zSig0 ); 3527 3528 } 3529 3530 /*---------------------------------------------------------------------------- 3531 | Takes an abstract floating-point value having sign `zSign', exponent 3532 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 3533 | and returns the proper extended double-precision floating-point value 3534 | corresponding to the abstract input. This routine is just like 3535 | `roundAndPackFloatx80' except that the input significand does not have to be 3536 | normalized. 3537 *----------------------------------------------------------------------------*/ 3538 3539 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 3540 flag zSign, int32_t zExp, 3541 uint64_t zSig0, uint64_t zSig1, 3542 float_status *status) 3543 { 3544 int8_t shiftCount; 3545 3546 if ( zSig0 == 0 ) { 3547 zSig0 = zSig1; 3548 zSig1 = 0; 3549 zExp -= 64; 3550 } 3551 shiftCount = clz64(zSig0); 3552 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 3553 zExp -= shiftCount; 3554 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 3555 zSig0, zSig1, status); 3556 3557 } 3558 3559 /*---------------------------------------------------------------------------- 3560 | Returns the least-significant 64 fraction bits of the quadruple-precision 3561 | floating-point value `a'. 3562 *----------------------------------------------------------------------------*/ 3563 3564 static inline uint64_t extractFloat128Frac1( float128 a ) 3565 { 3566 3567 return a.low; 3568 3569 } 3570 3571 /*---------------------------------------------------------------------------- 3572 | Returns the most-significant 48 fraction bits of the quadruple-precision 3573 | floating-point value `a'. 
3574 *----------------------------------------------------------------------------*/ 3575 3576 static inline uint64_t extractFloat128Frac0( float128 a ) 3577 { 3578 3579 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 3580 3581 } 3582 3583 /*---------------------------------------------------------------------------- 3584 | Returns the exponent bits of the quadruple-precision floating-point value 3585 | `a'. 3586 *----------------------------------------------------------------------------*/ 3587 3588 static inline int32_t extractFloat128Exp( float128 a ) 3589 { 3590 3591 return ( a.high>>48 ) & 0x7FFF; 3592 3593 } 3594 3595 /*---------------------------------------------------------------------------- 3596 | Returns the sign bit of the quadruple-precision floating-point value `a'. 3597 *----------------------------------------------------------------------------*/ 3598 3599 static inline flag extractFloat128Sign( float128 a ) 3600 { 3601 3602 return a.high>>63; 3603 3604 } 3605 3606 /*---------------------------------------------------------------------------- 3607 | Normalizes the subnormal quadruple-precision floating-point value 3608 | represented by the denormalized significand formed by the concatenation of 3609 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 3610 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 3611 | significand are stored at the location pointed to by `zSig0Ptr', and the 3612 | least significant 64 bits of the normalized significand are stored at the 3613 | location pointed to by `zSig1Ptr'. 
3614 *----------------------------------------------------------------------------*/ 3615 3616 static void 3617 normalizeFloat128Subnormal( 3618 uint64_t aSig0, 3619 uint64_t aSig1, 3620 int32_t *zExpPtr, 3621 uint64_t *zSig0Ptr, 3622 uint64_t *zSig1Ptr 3623 ) 3624 { 3625 int8_t shiftCount; 3626 3627 if ( aSig0 == 0 ) { 3628 shiftCount = clz64(aSig1) - 15; 3629 if ( shiftCount < 0 ) { 3630 *zSig0Ptr = aSig1>>( - shiftCount ); 3631 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 3632 } 3633 else { 3634 *zSig0Ptr = aSig1<<shiftCount; 3635 *zSig1Ptr = 0; 3636 } 3637 *zExpPtr = - shiftCount - 63; 3638 } 3639 else { 3640 shiftCount = clz64(aSig0) - 15; 3641 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 3642 *zExpPtr = 1 - shiftCount; 3643 } 3644 3645 } 3646 3647 /*---------------------------------------------------------------------------- 3648 | Packs the sign `zSign', the exponent `zExp', and the significand formed 3649 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 3650 | floating-point value, returning the result. After being shifted into the 3651 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 3652 | added together to form the most significant 32 bits of the result. This 3653 | means that any integer portion of `zSig0' will be added into the exponent. 3654 | Since a properly normalized significand will have an integer portion equal 3655 | to 1, the `zExp' input should be 1 less than the desired result exponent 3656 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 3657 | significand. 
3658 *----------------------------------------------------------------------------*/ 3659 3660 static inline float128 3661 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 3662 { 3663 float128 z; 3664 3665 z.low = zSig1; 3666 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 3667 return z; 3668 3669 } 3670 3671 /*---------------------------------------------------------------------------- 3672 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3673 | and extended significand formed by the concatenation of `zSig0', `zSig1', 3674 | and `zSig2', and returns the proper quadruple-precision floating-point value 3675 | corresponding to the abstract input. Ordinarily, the abstract value is 3676 | simply rounded and packed into the quadruple-precision format, with the 3677 | inexact exception raised if the abstract input cannot be represented 3678 | exactly. However, if the abstract value is too large, the overflow and 3679 | inexact exceptions are raised and an infinity or maximal finite value is 3680 | returned. If the abstract value is too small, the input value is rounded to 3681 | a subnormal number, and the underflow and inexact exceptions are raised if 3682 | the abstract input cannot be represented exactly as a subnormal quadruple- 3683 | precision floating-point number. 3684 | The input significand must be normalized or smaller. If the input 3685 | significand is not normalized, `zExp' must be 0; in that case, the result 3686 | returned is a subnormal number, and it must not require rounding. In the 3687 | usual case that the input significand is normalized, `zExp' must be 1 less 3688 | than the ``true'' floating-point exponent. The handling of underflow and 3689 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3690 *----------------------------------------------------------------------------*/ 3691 3692 static float128 roundAndPackFloat128(flag zSign, int32_t zExp, 3693 uint64_t zSig0, uint64_t zSig1, 3694 uint64_t zSig2, float_status *status) 3695 { 3696 int8_t roundingMode; 3697 flag roundNearestEven, increment, isTiny; 3698 3699 roundingMode = status->float_rounding_mode; 3700 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3701 switch (roundingMode) { 3702 case float_round_nearest_even: 3703 case float_round_ties_away: 3704 increment = ((int64_t)zSig2 < 0); 3705 break; 3706 case float_round_to_zero: 3707 increment = 0; 3708 break; 3709 case float_round_up: 3710 increment = !zSign && zSig2; 3711 break; 3712 case float_round_down: 3713 increment = zSign && zSig2; 3714 break; 3715 case float_round_to_odd: 3716 increment = !(zSig1 & 0x1) && zSig2; 3717 break; 3718 default: 3719 abort(); 3720 } 3721 if ( 0x7FFD <= (uint32_t) zExp ) { 3722 if ( ( 0x7FFD < zExp ) 3723 || ( ( zExp == 0x7FFD ) 3724 && eq128( 3725 LIT64( 0x0001FFFFFFFFFFFF ), 3726 LIT64( 0xFFFFFFFFFFFFFFFF ), 3727 zSig0, 3728 zSig1 3729 ) 3730 && increment 3731 ) 3732 ) { 3733 float_raise(float_flag_overflow | float_flag_inexact, status); 3734 if ( ( roundingMode == float_round_to_zero ) 3735 || ( zSign && ( roundingMode == float_round_up ) ) 3736 || ( ! zSign && ( roundingMode == float_round_down ) ) 3737 || (roundingMode == float_round_to_odd) 3738 ) { 3739 return 3740 packFloat128( 3741 zSign, 3742 0x7FFE, 3743 LIT64( 0x0000FFFFFFFFFFFF ), 3744 LIT64( 0xFFFFFFFFFFFFFFFF ) 3745 ); 3746 } 3747 return packFloat128( zSign, 0x7FFF, 0, 0 ); 3748 } 3749 if ( zExp < 0 ) { 3750 if (status->flush_to_zero) { 3751 float_raise(float_flag_output_denormal, status); 3752 return packFloat128(zSign, 0, 0, 0); 3753 } 3754 isTiny = 3755 (status->float_detect_tininess 3756 == float_tininess_before_rounding) 3757 || ( zExp < -1 ) 3758 || ! 
increment 3759 || lt128( 3760 zSig0, 3761 zSig1, 3762 LIT64( 0x0001FFFFFFFFFFFF ), 3763 LIT64( 0xFFFFFFFFFFFFFFFF ) 3764 ); 3765 shift128ExtraRightJamming( 3766 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 3767 zExp = 0; 3768 if (isTiny && zSig2) { 3769 float_raise(float_flag_underflow, status); 3770 } 3771 switch (roundingMode) { 3772 case float_round_nearest_even: 3773 case float_round_ties_away: 3774 increment = ((int64_t)zSig2 < 0); 3775 break; 3776 case float_round_to_zero: 3777 increment = 0; 3778 break; 3779 case float_round_up: 3780 increment = !zSign && zSig2; 3781 break; 3782 case float_round_down: 3783 increment = zSign && zSig2; 3784 break; 3785 case float_round_to_odd: 3786 increment = !(zSig1 & 0x1) && zSig2; 3787 break; 3788 default: 3789 abort(); 3790 } 3791 } 3792 } 3793 if (zSig2) { 3794 status->float_exception_flags |= float_flag_inexact; 3795 } 3796 if ( increment ) { 3797 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 3798 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 3799 } 3800 else { 3801 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 3802 } 3803 return packFloat128( zSign, zExp, zSig0, zSig1 ); 3804 3805 } 3806 3807 /*---------------------------------------------------------------------------- 3808 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3809 | and significand formed by the concatenation of `zSig0' and `zSig1', and 3810 | returns the proper quadruple-precision floating-point value corresponding 3811 | to the abstract input. This routine is just like `roundAndPackFloat128' 3812 | except that the input significand has fewer bits and does not have to be 3813 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 3814 | point exponent. 
3815 *----------------------------------------------------------------------------*/ 3816 3817 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 3818 uint64_t zSig0, uint64_t zSig1, 3819 float_status *status) 3820 { 3821 int8_t shiftCount; 3822 uint64_t zSig2; 3823 3824 if ( zSig0 == 0 ) { 3825 zSig0 = zSig1; 3826 zSig1 = 0; 3827 zExp -= 64; 3828 } 3829 shiftCount = clz64(zSig0) - 15; 3830 if ( 0 <= shiftCount ) { 3831 zSig2 = 0; 3832 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 3833 } 3834 else { 3835 shift128ExtraRightJamming( 3836 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 3837 } 3838 zExp -= shiftCount; 3839 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 3840 3841 } 3842 3843 3844 /*---------------------------------------------------------------------------- 3845 | Returns the result of converting the 32-bit two's complement integer `a' 3846 | to the extended double-precision floating-point format. The conversion 3847 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3848 | Arithmetic. 3849 *----------------------------------------------------------------------------*/ 3850 3851 floatx80 int32_to_floatx80(int32_t a, float_status *status) 3852 { 3853 flag zSign; 3854 uint32_t absA; 3855 int8_t shiftCount; 3856 uint64_t zSig; 3857 3858 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 3859 zSign = ( a < 0 ); 3860 absA = zSign ? - a : a; 3861 shiftCount = clz32(absA) + 32; 3862 zSig = absA; 3863 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 3864 3865 } 3866 3867 /*---------------------------------------------------------------------------- 3868 | Returns the result of converting the 32-bit two's complement integer `a' to 3869 | the quadruple-precision floating-point format. The conversion is performed 3870 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3871 *----------------------------------------------------------------------------*/ 3872 3873 float128 int32_to_float128(int32_t a, float_status *status) 3874 { 3875 flag zSign; 3876 uint32_t absA; 3877 int8_t shiftCount; 3878 uint64_t zSig0; 3879 3880 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 3881 zSign = ( a < 0 ); 3882 absA = zSign ? - a : a; 3883 shiftCount = clz32(absA) + 17; 3884 zSig0 = absA; 3885 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 3886 3887 } 3888 3889 /*---------------------------------------------------------------------------- 3890 | Returns the result of converting the 64-bit two's complement integer `a' 3891 | to the extended double-precision floating-point format. The conversion 3892 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3893 | Arithmetic. 3894 *----------------------------------------------------------------------------*/ 3895 3896 floatx80 int64_to_floatx80(int64_t a, float_status *status) 3897 { 3898 flag zSign; 3899 uint64_t absA; 3900 int8_t shiftCount; 3901 3902 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 3903 zSign = ( a < 0 ); 3904 absA = zSign ? - a : a; 3905 shiftCount = clz64(absA); 3906 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 3907 3908 } 3909 3910 /*---------------------------------------------------------------------------- 3911 | Returns the result of converting the 64-bit two's complement integer `a' to 3912 | the quadruple-precision floating-point format. The conversion is performed 3913 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3914 *----------------------------------------------------------------------------*/ 3915 3916 float128 int64_to_float128(int64_t a, float_status *status) 3917 { 3918 flag zSign; 3919 uint64_t absA; 3920 int8_t shiftCount; 3921 int32_t zExp; 3922 uint64_t zSig0, zSig1; 3923 3924 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 3925 zSign = ( a < 0 ); 3926 absA = zSign ? - a : a; 3927 shiftCount = clz64(absA) + 49; 3928 zExp = 0x406E - shiftCount; 3929 if ( 64 <= shiftCount ) { 3930 zSig1 = 0; 3931 zSig0 = absA; 3932 shiftCount -= 64; 3933 } 3934 else { 3935 zSig1 = absA; 3936 zSig0 = 0; 3937 } 3938 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 3939 return packFloat128( zSign, zExp, zSig0, zSig1 ); 3940 3941 } 3942 3943 /*---------------------------------------------------------------------------- 3944 | Returns the result of converting the 64-bit unsigned integer `a' 3945 | to the quadruple-precision floating-point format. The conversion is performed 3946 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3947 *----------------------------------------------------------------------------*/ 3948 3949 float128 uint64_to_float128(uint64_t a, float_status *status) 3950 { 3951 if (a == 0) { 3952 return float128_zero; 3953 } 3954 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status); 3955 } 3956 3957 /*---------------------------------------------------------------------------- 3958 | Returns the result of converting the single-precision floating-point value 3959 | `a' to the extended double-precision floating-point format. The conversion 3960 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3961 | Arithmetic. 
3962 *----------------------------------------------------------------------------*/ 3963 3964 floatx80 float32_to_floatx80(float32 a, float_status *status) 3965 { 3966 flag aSign; 3967 int aExp; 3968 uint32_t aSig; 3969 3970 a = float32_squash_input_denormal(a, status); 3971 aSig = extractFloat32Frac( a ); 3972 aExp = extractFloat32Exp( a ); 3973 aSign = extractFloat32Sign( a ); 3974 if ( aExp == 0xFF ) { 3975 if (aSig) { 3976 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 3977 } 3978 return packFloatx80(aSign, 3979 floatx80_infinity_high, 3980 floatx80_infinity_low); 3981 } 3982 if ( aExp == 0 ) { 3983 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 3984 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3985 } 3986 aSig |= 0x00800000; 3987 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 3988 3989 } 3990 3991 /*---------------------------------------------------------------------------- 3992 | Returns the result of converting the single-precision floating-point value 3993 | `a' to the double-precision floating-point format. The conversion is 3994 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3995 | Arithmetic. 
3996 *----------------------------------------------------------------------------*/ 3997 3998 float128 float32_to_float128(float32 a, float_status *status) 3999 { 4000 flag aSign; 4001 int aExp; 4002 uint32_t aSig; 4003 4004 a = float32_squash_input_denormal(a, status); 4005 aSig = extractFloat32Frac( a ); 4006 aExp = extractFloat32Exp( a ); 4007 aSign = extractFloat32Sign( a ); 4008 if ( aExp == 0xFF ) { 4009 if (aSig) { 4010 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 4011 } 4012 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4013 } 4014 if ( aExp == 0 ) { 4015 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4016 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4017 --aExp; 4018 } 4019 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 4020 4021 } 4022 4023 /*---------------------------------------------------------------------------- 4024 | Returns the remainder of the single-precision floating-point value `a' 4025 | with respect to the corresponding value `b'. The operation is performed 4026 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4027 *----------------------------------------------------------------------------*/ 4028 4029 float32 float32_rem(float32 a, float32 b, float_status *status) 4030 { 4031 flag aSign, zSign; 4032 int aExp, bExp, expDiff; 4033 uint32_t aSig, bSig; 4034 uint32_t q; 4035 uint64_t aSig64, bSig64, q64; 4036 uint32_t alternateASig; 4037 int32_t sigMean; 4038 a = float32_squash_input_denormal(a, status); 4039 b = float32_squash_input_denormal(b, status); 4040 4041 aSig = extractFloat32Frac( a ); 4042 aExp = extractFloat32Exp( a ); 4043 aSign = extractFloat32Sign( a ); 4044 bSig = extractFloat32Frac( b ); 4045 bExp = extractFloat32Exp( b ); 4046 if ( aExp == 0xFF ) { 4047 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 4048 return propagateFloat32NaN(a, b, status); 4049 } 4050 float_raise(float_flag_invalid, status); 4051 return float32_default_nan(status); 4052 } 4053 if ( bExp == 0xFF ) { 4054 if (bSig) { 4055 return propagateFloat32NaN(a, b, status); 4056 } 4057 return a; 4058 } 4059 if ( bExp == 0 ) { 4060 if ( bSig == 0 ) { 4061 float_raise(float_flag_invalid, status); 4062 return float32_default_nan(status); 4063 } 4064 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 4065 } 4066 if ( aExp == 0 ) { 4067 if ( aSig == 0 ) return a; 4068 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4069 } 4070 expDiff = aExp - bExp; 4071 aSig |= 0x00800000; 4072 bSig |= 0x00800000; 4073 if ( expDiff < 32 ) { 4074 aSig <<= 8; 4075 bSig <<= 8; 4076 if ( expDiff < 0 ) { 4077 if ( expDiff < -1 ) return a; 4078 aSig >>= 1; 4079 } 4080 q = ( bSig <= aSig ); 4081 if ( q ) aSig -= bSig; 4082 if ( 0 < expDiff ) { 4083 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 4084 q >>= 32 - expDiff; 4085 bSig >>= 2; 4086 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4087 } 4088 else { 4089 aSig >>= 2; 4090 bSig >>= 2; 4091 } 4092 } 4093 else { 4094 if ( bSig <= aSig ) aSig -= bSig; 4095 aSig64 = ( (uint64_t) aSig )<<40; 4096 bSig64 = ( (uint64_t) bSig )<<40; 4097 expDiff -= 64; 4098 while ( 0 < expDiff ) { 
4099 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 4100 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 4101 aSig64 = - ( ( bSig * q64 )<<38 ); 4102 expDiff -= 62; 4103 } 4104 expDiff += 64; 4105 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 4106 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 4107 q = q64>>( 64 - expDiff ); 4108 bSig <<= 6; 4109 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 4110 } 4111 do { 4112 alternateASig = aSig; 4113 ++q; 4114 aSig -= bSig; 4115 } while ( 0 <= (int32_t) aSig ); 4116 sigMean = aSig + alternateASig; 4117 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4118 aSig = alternateASig; 4119 } 4120 zSign = ( (int32_t) aSig < 0 ); 4121 if ( zSign ) aSig = - aSig; 4122 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 4123 } 4124 4125 4126 4127 /*---------------------------------------------------------------------------- 4128 | Returns the binary exponential of the single-precision floating-point value 4129 | `a'. The operation is performed according to the IEC/IEEE Standard for 4130 | Binary Floating-Point Arithmetic. 4131 | 4132 | Uses the following identities: 4133 | 4134 | 1. ------------------------------------------------------------------------- 4135 | x x*ln(2) 4136 | 2 = e 4137 | 4138 | 2. ------------------------------------------------------------------------- 4139 | 2 3 4 5 n 4140 | x x x x x x x 4141 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 4142 | 1! 2! 3! 4! 5! n! 
4143 *----------------------------------------------------------------------------*/ 4144 4145 static const float64 float32_exp2_coefficients[15] = 4146 { 4147 const_float64( 0x3ff0000000000000ll ), /* 1 */ 4148 const_float64( 0x3fe0000000000000ll ), /* 2 */ 4149 const_float64( 0x3fc5555555555555ll ), /* 3 */ 4150 const_float64( 0x3fa5555555555555ll ), /* 4 */ 4151 const_float64( 0x3f81111111111111ll ), /* 5 */ 4152 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 4153 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 4154 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 4155 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 4156 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 4157 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 4158 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 4159 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 4160 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 4161 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 4162 }; 4163 4164 float32 float32_exp2(float32 a, float_status *status) 4165 { 4166 flag aSign; 4167 int aExp; 4168 uint32_t aSig; 4169 float64 r, x, xn; 4170 int i; 4171 a = float32_squash_input_denormal(a, status); 4172 4173 aSig = extractFloat32Frac( a ); 4174 aExp = extractFloat32Exp( a ); 4175 aSign = extractFloat32Sign( a ); 4176 4177 if ( aExp == 0xFF) { 4178 if (aSig) { 4179 return propagateFloat32NaN(a, float32_zero, status); 4180 } 4181 return (aSign) ? 
float32_zero : a; 4182 } 4183 if (aExp == 0) { 4184 if (aSig == 0) return float32_one; 4185 } 4186 4187 float_raise(float_flag_inexact, status); 4188 4189 /* ******************************* */ 4190 /* using float64 for approximation */ 4191 /* ******************************* */ 4192 x = float32_to_float64(a, status); 4193 x = float64_mul(x, float64_ln2, status); 4194 4195 xn = x; 4196 r = float64_one; 4197 for (i = 0 ; i < 15 ; i++) { 4198 float64 f; 4199 4200 f = float64_mul(xn, float32_exp2_coefficients[i], status); 4201 r = float64_add(r, f, status); 4202 4203 xn = float64_mul(xn, x, status); 4204 } 4205 4206 return float64_to_float32(r, status); 4207 } 4208 4209 /*---------------------------------------------------------------------------- 4210 | Returns the binary log of the single-precision floating-point value `a'. 4211 | The operation is performed according to the IEC/IEEE Standard for Binary 4212 | Floating-Point Arithmetic. 4213 *----------------------------------------------------------------------------*/ 4214 float32 float32_log2(float32 a, float_status *status) 4215 { 4216 flag aSign, zSign; 4217 int aExp; 4218 uint32_t aSig, zSig, i; 4219 4220 a = float32_squash_input_denormal(a, status); 4221 aSig = extractFloat32Frac( a ); 4222 aExp = extractFloat32Exp( a ); 4223 aSign = extractFloat32Sign( a ); 4224 4225 if ( aExp == 0 ) { 4226 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 4227 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4228 } 4229 if ( aSign ) { 4230 float_raise(float_flag_invalid, status); 4231 return float32_default_nan(status); 4232 } 4233 if ( aExp == 0xFF ) { 4234 if (aSig) { 4235 return propagateFloat32NaN(a, float32_zero, status); 4236 } 4237 return a; 4238 } 4239 4240 aExp -= 0x7F; 4241 aSig |= 0x00800000; 4242 zSign = aExp < 0; 4243 zSig = aExp << 23; 4244 4245 for (i = 1 << 22; i > 0; i >>= 1) { 4246 aSig = ( (uint64_t)aSig * aSig ) >> 23; 4247 if ( aSig & 0x01000000 ) { 4248 aSig >>= 1; 4249 zSig |= i; 4250 } 4251 } 4252 
4253 if ( zSign ) 4254 zSig = -zSig; 4255 4256 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 4257 } 4258 4259 /*---------------------------------------------------------------------------- 4260 | Returns 1 if the single-precision floating-point value `a' is equal to 4261 | the corresponding value `b', and 0 otherwise. The invalid exception is 4262 | raised if either operand is a NaN. Otherwise, the comparison is performed 4263 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4264 *----------------------------------------------------------------------------*/ 4265 4266 int float32_eq(float32 a, float32 b, float_status *status) 4267 { 4268 uint32_t av, bv; 4269 a = float32_squash_input_denormal(a, status); 4270 b = float32_squash_input_denormal(b, status); 4271 4272 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4273 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4274 ) { 4275 float_raise(float_flag_invalid, status); 4276 return 0; 4277 } 4278 av = float32_val(a); 4279 bv = float32_val(b); 4280 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4281 } 4282 4283 /*---------------------------------------------------------------------------- 4284 | Returns 1 if the single-precision floating-point value `a' is less than 4285 | or equal to the corresponding value `b', and 0 otherwise. The invalid 4286 | exception is raised if either operand is a NaN. The comparison is performed 4287 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4288 *----------------------------------------------------------------------------*/ 4289 4290 int float32_le(float32 a, float32 b, float_status *status) 4291 { 4292 flag aSign, bSign; 4293 uint32_t av, bv; 4294 a = float32_squash_input_denormal(a, status); 4295 b = float32_squash_input_denormal(b, status); 4296 4297 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4298 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4299 ) { 4300 float_raise(float_flag_invalid, status); 4301 return 0; 4302 } 4303 aSign = extractFloat32Sign( a ); 4304 bSign = extractFloat32Sign( b ); 4305 av = float32_val(a); 4306 bv = float32_val(b); 4307 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4308 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4309 4310 } 4311 4312 /*---------------------------------------------------------------------------- 4313 | Returns 1 if the single-precision floating-point value `a' is less than 4314 | the corresponding value `b', and 0 otherwise. The invalid exception is 4315 | raised if either operand is a NaN. The comparison is performed according 4316 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4317 *----------------------------------------------------------------------------*/ 4318 4319 int float32_lt(float32 a, float32 b, float_status *status) 4320 { 4321 flag aSign, bSign; 4322 uint32_t av, bv; 4323 a = float32_squash_input_denormal(a, status); 4324 b = float32_squash_input_denormal(b, status); 4325 4326 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4327 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4328 ) { 4329 float_raise(float_flag_invalid, status); 4330 return 0; 4331 } 4332 aSign = extractFloat32Sign( a ); 4333 bSign = extractFloat32Sign( b ); 4334 av = float32_val(a); 4335 bv = float32_val(b); 4336 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 4337 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4338 4339 } 4340 4341 /*---------------------------------------------------------------------------- 4342 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 4343 | be compared, and 0 otherwise. The invalid exception is raised if either 4344 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4345 | Standard for Binary Floating-Point Arithmetic. 4346 *----------------------------------------------------------------------------*/ 4347 4348 int float32_unordered(float32 a, float32 b, float_status *status) 4349 { 4350 a = float32_squash_input_denormal(a, status); 4351 b = float32_squash_input_denormal(b, status); 4352 4353 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4354 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4355 ) { 4356 float_raise(float_flag_invalid, status); 4357 return 1; 4358 } 4359 return 0; 4360 } 4361 4362 /*---------------------------------------------------------------------------- 4363 | Returns 1 if the single-precision floating-point value `a' is equal to 4364 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4365 | exception. 
The comparison is performed according to the IEC/IEEE Standard 4366 | for Binary Floating-Point Arithmetic. 4367 *----------------------------------------------------------------------------*/ 4368 4369 int float32_eq_quiet(float32 a, float32 b, float_status *status) 4370 { 4371 a = float32_squash_input_denormal(a, status); 4372 b = float32_squash_input_denormal(b, status); 4373 4374 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4375 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4376 ) { 4377 if (float32_is_signaling_nan(a, status) 4378 || float32_is_signaling_nan(b, status)) { 4379 float_raise(float_flag_invalid, status); 4380 } 4381 return 0; 4382 } 4383 return ( float32_val(a) == float32_val(b) ) || 4384 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 4385 } 4386 4387 /*---------------------------------------------------------------------------- 4388 | Returns 1 if the single-precision floating-point value `a' is less than or 4389 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4390 | cause an exception. Otherwise, the comparison is performed according to the 4391 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4392 *----------------------------------------------------------------------------*/ 4393 4394 int float32_le_quiet(float32 a, float32 b, float_status *status) 4395 { 4396 flag aSign, bSign; 4397 uint32_t av, bv; 4398 a = float32_squash_input_denormal(a, status); 4399 b = float32_squash_input_denormal(b, status); 4400 4401 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4402 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4403 ) { 4404 if (float32_is_signaling_nan(a, status) 4405 || float32_is_signaling_nan(b, status)) { 4406 float_raise(float_flag_invalid, status); 4407 } 4408 return 0; 4409 } 4410 aSign = extractFloat32Sign( a ); 4411 bSign = extractFloat32Sign( b ); 4412 av = float32_val(a); 4413 bv = float32_val(b); 4414 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4415 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4416 4417 } 4418 4419 /*---------------------------------------------------------------------------- 4420 | Returns 1 if the single-precision floating-point value `a' is less than 4421 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4422 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4423 | Standard for Binary Floating-Point Arithmetic. 
4424 *----------------------------------------------------------------------------*/ 4425 4426 int float32_lt_quiet(float32 a, float32 b, float_status *status) 4427 { 4428 flag aSign, bSign; 4429 uint32_t av, bv; 4430 a = float32_squash_input_denormal(a, status); 4431 b = float32_squash_input_denormal(b, status); 4432 4433 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4434 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4435 ) { 4436 if (float32_is_signaling_nan(a, status) 4437 || float32_is_signaling_nan(b, status)) { 4438 float_raise(float_flag_invalid, status); 4439 } 4440 return 0; 4441 } 4442 aSign = extractFloat32Sign( a ); 4443 bSign = extractFloat32Sign( b ); 4444 av = float32_val(a); 4445 bv = float32_val(b); 4446 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 4447 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4448 4449 } 4450 4451 /*---------------------------------------------------------------------------- 4452 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 4453 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4454 | comparison is performed according to the IEC/IEEE Standard for Binary 4455 | Floating-Point Arithmetic. 
4456 *----------------------------------------------------------------------------*/ 4457 4458 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 4459 { 4460 a = float32_squash_input_denormal(a, status); 4461 b = float32_squash_input_denormal(b, status); 4462 4463 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4464 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4465 ) { 4466 if (float32_is_signaling_nan(a, status) 4467 || float32_is_signaling_nan(b, status)) { 4468 float_raise(float_flag_invalid, status); 4469 } 4470 return 1; 4471 } 4472 return 0; 4473 } 4474 4475 /*---------------------------------------------------------------------------- 4476 | If `a' is denormal and we are in flush-to-zero mode then set the 4477 | input-denormal exception and return zero. Otherwise just return the value. 4478 *----------------------------------------------------------------------------*/ 4479 float16 float16_squash_input_denormal(float16 a, float_status *status) 4480 { 4481 if (status->flush_inputs_to_zero) { 4482 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) { 4483 float_raise(float_flag_input_denormal, status); 4484 return make_float16(float16_val(a) & 0x8000); 4485 } 4486 } 4487 return a; 4488 } 4489 4490 /*---------------------------------------------------------------------------- 4491 | Returns the result of converting the double-precision floating-point value 4492 | `a' to the extended double-precision floating-point format. The conversion 4493 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4494 | Arithmetic. 
4495 *----------------------------------------------------------------------------*/ 4496 4497 floatx80 float64_to_floatx80(float64 a, float_status *status) 4498 { 4499 flag aSign; 4500 int aExp; 4501 uint64_t aSig; 4502 4503 a = float64_squash_input_denormal(a, status); 4504 aSig = extractFloat64Frac( a ); 4505 aExp = extractFloat64Exp( a ); 4506 aSign = extractFloat64Sign( a ); 4507 if ( aExp == 0x7FF ) { 4508 if (aSig) { 4509 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 4510 } 4511 return packFloatx80(aSign, 4512 floatx80_infinity_high, 4513 floatx80_infinity_low); 4514 } 4515 if ( aExp == 0 ) { 4516 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4517 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4518 } 4519 return 4520 packFloatx80( 4521 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 4522 4523 } 4524 4525 /*---------------------------------------------------------------------------- 4526 | Returns the result of converting the double-precision floating-point value 4527 | `a' to the quadruple-precision floating-point format. The conversion is 4528 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4529 | Arithmetic. 
4530 *----------------------------------------------------------------------------*/ 4531 4532 float128 float64_to_float128(float64 a, float_status *status) 4533 { 4534 flag aSign; 4535 int aExp; 4536 uint64_t aSig, zSig0, zSig1; 4537 4538 a = float64_squash_input_denormal(a, status); 4539 aSig = extractFloat64Frac( a ); 4540 aExp = extractFloat64Exp( a ); 4541 aSign = extractFloat64Sign( a ); 4542 if ( aExp == 0x7FF ) { 4543 if (aSig) { 4544 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 4545 } 4546 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4547 } 4548 if ( aExp == 0 ) { 4549 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4550 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4551 --aExp; 4552 } 4553 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 4554 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 4555 4556 } 4557 4558 4559 /*---------------------------------------------------------------------------- 4560 | Returns the remainder of the double-precision floating-point value `a' 4561 | with respect to the corresponding value `b'. The operation is performed 4562 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4563 *----------------------------------------------------------------------------*/ 4564 4565 float64 float64_rem(float64 a, float64 b, float_status *status) 4566 { 4567 flag aSign, zSign; 4568 int aExp, bExp, expDiff; 4569 uint64_t aSig, bSig; 4570 uint64_t q, alternateASig; 4571 int64_t sigMean; 4572 4573 a = float64_squash_input_denormal(a, status); 4574 b = float64_squash_input_denormal(b, status); 4575 aSig = extractFloat64Frac( a ); 4576 aExp = extractFloat64Exp( a ); 4577 aSign = extractFloat64Sign( a ); 4578 bSig = extractFloat64Frac( b ); 4579 bExp = extractFloat64Exp( b ); 4580 if ( aExp == 0x7FF ) { 4581 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4582 return propagateFloat64NaN(a, b, status); 4583 } 4584 float_raise(float_flag_invalid, status); 4585 return float64_default_nan(status); 4586 } 4587 if ( bExp == 0x7FF ) { 4588 if (bSig) { 4589 return propagateFloat64NaN(a, b, status); 4590 } 4591 return a; 4592 } 4593 if ( bExp == 0 ) { 4594 if ( bSig == 0 ) { 4595 float_raise(float_flag_invalid, status); 4596 return float64_default_nan(status); 4597 } 4598 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4599 } 4600 if ( aExp == 0 ) { 4601 if ( aSig == 0 ) return a; 4602 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4603 } 4604 expDiff = aExp - bExp; 4605 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 4606 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4607 if ( expDiff < 0 ) { 4608 if ( expDiff < -1 ) return a; 4609 aSig >>= 1; 4610 } 4611 q = ( bSig <= aSig ); 4612 if ( q ) aSig -= bSig; 4613 expDiff -= 64; 4614 while ( 0 < expDiff ) { 4615 q = estimateDiv128To64( aSig, 0, bSig ); 4616 q = ( 2 < q ) ? q - 2 : 0; 4617 aSig = - ( ( bSig>>2 ) * q ); 4618 expDiff -= 62; 4619 } 4620 expDiff += 64; 4621 if ( 0 < expDiff ) { 4622 q = estimateDiv128To64( aSig, 0, bSig ); 4623 q = ( 2 < q ) ? 
q - 2 : 0; 4624 q >>= 64 - expDiff; 4625 bSig >>= 2; 4626 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4627 } 4628 else { 4629 aSig >>= 2; 4630 bSig >>= 2; 4631 } 4632 do { 4633 alternateASig = aSig; 4634 ++q; 4635 aSig -= bSig; 4636 } while ( 0 <= (int64_t) aSig ); 4637 sigMean = aSig + alternateASig; 4638 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4639 aSig = alternateASig; 4640 } 4641 zSign = ( (int64_t) aSig < 0 ); 4642 if ( zSign ) aSig = - aSig; 4643 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 4644 4645 } 4646 4647 /*---------------------------------------------------------------------------- 4648 | Returns the binary log of the double-precision floating-point value `a'. 4649 | The operation is performed according to the IEC/IEEE Standard for Binary 4650 | Floating-Point Arithmetic. 4651 *----------------------------------------------------------------------------*/ 4652 float64 float64_log2(float64 a, float_status *status) 4653 { 4654 flag aSign, zSign; 4655 int aExp; 4656 uint64_t aSig, aSig0, aSig1, zSig, i; 4657 a = float64_squash_input_denormal(a, status); 4658 4659 aSig = extractFloat64Frac( a ); 4660 aExp = extractFloat64Exp( a ); 4661 aSign = extractFloat64Sign( a ); 4662 4663 if ( aExp == 0 ) { 4664 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4665 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4666 } 4667 if ( aSign ) { 4668 float_raise(float_flag_invalid, status); 4669 return float64_default_nan(status); 4670 } 4671 if ( aExp == 0x7FF ) { 4672 if (aSig) { 4673 return propagateFloat64NaN(a, float64_zero, status); 4674 } 4675 return a; 4676 } 4677 4678 aExp -= 0x3FF; 4679 aSig |= LIT64( 0x0010000000000000 ); 4680 zSign = aExp < 0; 4681 zSig = (uint64_t)aExp << 52; 4682 for (i = 1LL << 51; i > 0; i >>= 1) { 4683 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4684 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4685 if ( aSig & LIT64( 0x0020000000000000 ) ) { 4686 aSig >>= 1; 4687 zSig |= i; 4688 } 
4689 } 4690 4691 if ( zSign ) 4692 zSig = -zSig; 4693 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 4694 } 4695 4696 /*---------------------------------------------------------------------------- 4697 | Returns 1 if the double-precision floating-point value `a' is equal to the 4698 | corresponding value `b', and 0 otherwise. The invalid exception is raised 4699 | if either operand is a NaN. Otherwise, the comparison is performed 4700 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4701 *----------------------------------------------------------------------------*/ 4702 4703 int float64_eq(float64 a, float64 b, float_status *status) 4704 { 4705 uint64_t av, bv; 4706 a = float64_squash_input_denormal(a, status); 4707 b = float64_squash_input_denormal(b, status); 4708 4709 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4710 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4711 ) { 4712 float_raise(float_flag_invalid, status); 4713 return 0; 4714 } 4715 av = float64_val(a); 4716 bv = float64_val(b); 4717 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4718 4719 } 4720 4721 /*---------------------------------------------------------------------------- 4722 | Returns 1 if the double-precision floating-point value `a' is less than or 4723 | equal to the corresponding value `b', and 0 otherwise. The invalid 4724 | exception is raised if either operand is a NaN. The comparison is performed 4725 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4726 *----------------------------------------------------------------------------*/ 4727 4728 int float64_le(float64 a, float64 b, float_status *status) 4729 { 4730 flag aSign, bSign; 4731 uint64_t av, bv; 4732 a = float64_squash_input_denormal(a, status); 4733 b = float64_squash_input_denormal(b, status); 4734 4735 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4736 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4737 ) { 4738 float_raise(float_flag_invalid, status); 4739 return 0; 4740 } 4741 aSign = extractFloat64Sign( a ); 4742 bSign = extractFloat64Sign( b ); 4743 av = float64_val(a); 4744 bv = float64_val(b); 4745 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4746 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4747 4748 } 4749 4750 /*---------------------------------------------------------------------------- 4751 | Returns 1 if the double-precision floating-point value `a' is less than 4752 | the corresponding value `b', and 0 otherwise. The invalid exception is 4753 | raised if either operand is a NaN. The comparison is performed according 4754 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4755 *----------------------------------------------------------------------------*/ 4756 4757 int float64_lt(float64 a, float64 b, float_status *status) 4758 { 4759 flag aSign, bSign; 4760 uint64_t av, bv; 4761 4762 a = float64_squash_input_denormal(a, status); 4763 b = float64_squash_input_denormal(b, status); 4764 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4765 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4766 ) { 4767 float_raise(float_flag_invalid, status); 4768 return 0; 4769 } 4770 aSign = extractFloat64Sign( a ); 4771 bSign = extractFloat64Sign( b ); 4772 av = float64_val(a); 4773 bv = float64_val(b); 4774 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4775 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4776 4777 } 4778 4779 /*---------------------------------------------------------------------------- 4780 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4781 | be compared, and 0 otherwise. The invalid exception is raised if either 4782 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4783 | Standard for Binary Floating-Point Arithmetic. 4784 *----------------------------------------------------------------------------*/ 4785 4786 int float64_unordered(float64 a, float64 b, float_status *status) 4787 { 4788 a = float64_squash_input_denormal(a, status); 4789 b = float64_squash_input_denormal(b, status); 4790 4791 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4792 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4793 ) { 4794 float_raise(float_flag_invalid, status); 4795 return 1; 4796 } 4797 return 0; 4798 } 4799 4800 /*---------------------------------------------------------------------------- 4801 | Returns 1 if the double-precision floating-point value `a' is equal to the 4802 | corresponding value `b', and 0 otherwise. 
Quiet NaNs do not cause an 4803 | exception.The comparison is performed according to the IEC/IEEE Standard 4804 | for Binary Floating-Point Arithmetic. 4805 *----------------------------------------------------------------------------*/ 4806 4807 int float64_eq_quiet(float64 a, float64 b, float_status *status) 4808 { 4809 uint64_t av, bv; 4810 a = float64_squash_input_denormal(a, status); 4811 b = float64_squash_input_denormal(b, status); 4812 4813 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4814 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4815 ) { 4816 if (float64_is_signaling_nan(a, status) 4817 || float64_is_signaling_nan(b, status)) { 4818 float_raise(float_flag_invalid, status); 4819 } 4820 return 0; 4821 } 4822 av = float64_val(a); 4823 bv = float64_val(b); 4824 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4825 4826 } 4827 4828 /*---------------------------------------------------------------------------- 4829 | Returns 1 if the double-precision floating-point value `a' is less than or 4830 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4831 | cause an exception. Otherwise, the comparison is performed according to the 4832 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4833 *----------------------------------------------------------------------------*/ 4834 4835 int float64_le_quiet(float64 a, float64 b, float_status *status) 4836 { 4837 flag aSign, bSign; 4838 uint64_t av, bv; 4839 a = float64_squash_input_denormal(a, status); 4840 b = float64_squash_input_denormal(b, status); 4841 4842 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4843 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4844 ) { 4845 if (float64_is_signaling_nan(a, status) 4846 || float64_is_signaling_nan(b, status)) { 4847 float_raise(float_flag_invalid, status); 4848 } 4849 return 0; 4850 } 4851 aSign = extractFloat64Sign( a ); 4852 bSign = extractFloat64Sign( b ); 4853 av = float64_val(a); 4854 bv = float64_val(b); 4855 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4856 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4857 4858 } 4859 4860 /*---------------------------------------------------------------------------- 4861 | Returns 1 if the double-precision floating-point value `a' is less than 4862 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4863 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4864 | Standard for Binary Floating-Point Arithmetic. 
4865 *----------------------------------------------------------------------------*/ 4866 4867 int float64_lt_quiet(float64 a, float64 b, float_status *status) 4868 { 4869 flag aSign, bSign; 4870 uint64_t av, bv; 4871 a = float64_squash_input_denormal(a, status); 4872 b = float64_squash_input_denormal(b, status); 4873 4874 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4875 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4876 ) { 4877 if (float64_is_signaling_nan(a, status) 4878 || float64_is_signaling_nan(b, status)) { 4879 float_raise(float_flag_invalid, status); 4880 } 4881 return 0; 4882 } 4883 aSign = extractFloat64Sign( a ); 4884 bSign = extractFloat64Sign( b ); 4885 av = float64_val(a); 4886 bv = float64_val(b); 4887 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4888 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4889 4890 } 4891 4892 /*---------------------------------------------------------------------------- 4893 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4894 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4895 | comparison is performed according to the IEC/IEEE Standard for Binary 4896 | Floating-Point Arithmetic. 
4897 *----------------------------------------------------------------------------*/ 4898 4899 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 4900 { 4901 a = float64_squash_input_denormal(a, status); 4902 b = float64_squash_input_denormal(b, status); 4903 4904 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4905 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4906 ) { 4907 if (float64_is_signaling_nan(a, status) 4908 || float64_is_signaling_nan(b, status)) { 4909 float_raise(float_flag_invalid, status); 4910 } 4911 return 1; 4912 } 4913 return 0; 4914 } 4915 4916 /*---------------------------------------------------------------------------- 4917 | Returns the result of converting the extended double-precision floating- 4918 | point value `a' to the 32-bit two's complement integer format. The 4919 | conversion is performed according to the IEC/IEEE Standard for Binary 4920 | Floating-Point Arithmetic---which means in particular that the conversion 4921 | is rounded according to the current rounding mode. If `a' is a NaN, the 4922 | largest positive integer is returned. Otherwise, if the conversion 4923 | overflows, the largest integer with the same sign as `a' is returned. 
4924 *----------------------------------------------------------------------------*/ 4925 4926 int32_t floatx80_to_int32(floatx80 a, float_status *status) 4927 { 4928 flag aSign; 4929 int32_t aExp, shiftCount; 4930 uint64_t aSig; 4931 4932 if (floatx80_invalid_encoding(a)) { 4933 float_raise(float_flag_invalid, status); 4934 return 1 << 31; 4935 } 4936 aSig = extractFloatx80Frac( a ); 4937 aExp = extractFloatx80Exp( a ); 4938 aSign = extractFloatx80Sign( a ); 4939 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4940 shiftCount = 0x4037 - aExp; 4941 if ( shiftCount <= 0 ) shiftCount = 1; 4942 shift64RightJamming( aSig, shiftCount, &aSig ); 4943 return roundAndPackInt32(aSign, aSig, status); 4944 4945 } 4946 4947 /*---------------------------------------------------------------------------- 4948 | Returns the result of converting the extended double-precision floating- 4949 | point value `a' to the 32-bit two's complement integer format. The 4950 | conversion is performed according to the IEC/IEEE Standard for Binary 4951 | Floating-Point Arithmetic, except that the conversion is always rounded 4952 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4953 | Otherwise, if the conversion overflows, the largest integer with the same 4954 | sign as `a' is returned. 
4955 *----------------------------------------------------------------------------*/ 4956 4957 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 4958 { 4959 flag aSign; 4960 int32_t aExp, shiftCount; 4961 uint64_t aSig, savedASig; 4962 int32_t z; 4963 4964 if (floatx80_invalid_encoding(a)) { 4965 float_raise(float_flag_invalid, status); 4966 return 1 << 31; 4967 } 4968 aSig = extractFloatx80Frac( a ); 4969 aExp = extractFloatx80Exp( a ); 4970 aSign = extractFloatx80Sign( a ); 4971 if ( 0x401E < aExp ) { 4972 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4973 goto invalid; 4974 } 4975 else if ( aExp < 0x3FFF ) { 4976 if (aExp || aSig) { 4977 status->float_exception_flags |= float_flag_inexact; 4978 } 4979 return 0; 4980 } 4981 shiftCount = 0x403E - aExp; 4982 savedASig = aSig; 4983 aSig >>= shiftCount; 4984 z = aSig; 4985 if ( aSign ) z = - z; 4986 if ( ( z < 0 ) ^ aSign ) { 4987 invalid: 4988 float_raise(float_flag_invalid, status); 4989 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 4990 } 4991 if ( ( aSig<<shiftCount ) != savedASig ) { 4992 status->float_exception_flags |= float_flag_inexact; 4993 } 4994 return z; 4995 4996 } 4997 4998 /*---------------------------------------------------------------------------- 4999 | Returns the result of converting the extended double-precision floating- 5000 | point value `a' to the 64-bit two's complement integer format. The 5001 | conversion is performed according to the IEC/IEEE Standard for Binary 5002 | Floating-Point Arithmetic---which means in particular that the conversion 5003 | is rounded according to the current rounding mode. If `a' is a NaN, 5004 | the largest positive integer is returned. Otherwise, if the conversion 5005 | overflows, the largest integer with the same sign as `a' is returned. 
5006 *----------------------------------------------------------------------------*/ 5007 5008 int64_t floatx80_to_int64(floatx80 a, float_status *status) 5009 { 5010 flag aSign; 5011 int32_t aExp, shiftCount; 5012 uint64_t aSig, aSigExtra; 5013 5014 if (floatx80_invalid_encoding(a)) { 5015 float_raise(float_flag_invalid, status); 5016 return 1ULL << 63; 5017 } 5018 aSig = extractFloatx80Frac( a ); 5019 aExp = extractFloatx80Exp( a ); 5020 aSign = extractFloatx80Sign( a ); 5021 shiftCount = 0x403E - aExp; 5022 if ( shiftCount <= 0 ) { 5023 if ( shiftCount ) { 5024 float_raise(float_flag_invalid, status); 5025 if (!aSign || floatx80_is_any_nan(a)) { 5026 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5027 } 5028 return (int64_t) LIT64( 0x8000000000000000 ); 5029 } 5030 aSigExtra = 0; 5031 } 5032 else { 5033 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 5034 } 5035 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 5036 5037 } 5038 5039 /*---------------------------------------------------------------------------- 5040 | Returns the result of converting the extended double-precision floating- 5041 | point value `a' to the 64-bit two's complement integer format. The 5042 | conversion is performed according to the IEC/IEEE Standard for Binary 5043 | Floating-Point Arithmetic, except that the conversion is always rounded 5044 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5045 | Otherwise, if the conversion overflows, the largest integer with the same 5046 | sign as `a' is returned. 
5047 *----------------------------------------------------------------------------*/ 5048 5049 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 5050 { 5051 flag aSign; 5052 int32_t aExp, shiftCount; 5053 uint64_t aSig; 5054 int64_t z; 5055 5056 if (floatx80_invalid_encoding(a)) { 5057 float_raise(float_flag_invalid, status); 5058 return 1ULL << 63; 5059 } 5060 aSig = extractFloatx80Frac( a ); 5061 aExp = extractFloatx80Exp( a ); 5062 aSign = extractFloatx80Sign( a ); 5063 shiftCount = aExp - 0x403E; 5064 if ( 0 <= shiftCount ) { 5065 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 5066 if ( ( a.high != 0xC03E ) || aSig ) { 5067 float_raise(float_flag_invalid, status); 5068 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 5069 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5070 } 5071 } 5072 return (int64_t) LIT64( 0x8000000000000000 ); 5073 } 5074 else if ( aExp < 0x3FFF ) { 5075 if (aExp | aSig) { 5076 status->float_exception_flags |= float_flag_inexact; 5077 } 5078 return 0; 5079 } 5080 z = aSig>>( - shiftCount ); 5081 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 5082 status->float_exception_flags |= float_flag_inexact; 5083 } 5084 if ( aSign ) z = - z; 5085 return z; 5086 5087 } 5088 5089 /*---------------------------------------------------------------------------- 5090 | Returns the result of converting the extended double-precision floating- 5091 | point value `a' to the single-precision floating-point format. The 5092 | conversion is performed according to the IEC/IEEE Standard for Binary 5093 | Floating-Point Arithmetic. 
5094 *----------------------------------------------------------------------------*/ 5095 5096 float32 floatx80_to_float32(floatx80 a, float_status *status) 5097 { 5098 flag aSign; 5099 int32_t aExp; 5100 uint64_t aSig; 5101 5102 if (floatx80_invalid_encoding(a)) { 5103 float_raise(float_flag_invalid, status); 5104 return float32_default_nan(status); 5105 } 5106 aSig = extractFloatx80Frac( a ); 5107 aExp = extractFloatx80Exp( a ); 5108 aSign = extractFloatx80Sign( a ); 5109 if ( aExp == 0x7FFF ) { 5110 if ( (uint64_t) ( aSig<<1 ) ) { 5111 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 5112 } 5113 return packFloat32( aSign, 0xFF, 0 ); 5114 } 5115 shift64RightJamming( aSig, 33, &aSig ); 5116 if ( aExp || aSig ) aExp -= 0x3F81; 5117 return roundAndPackFloat32(aSign, aExp, aSig, status); 5118 5119 } 5120 5121 /*---------------------------------------------------------------------------- 5122 | Returns the result of converting the extended double-precision floating- 5123 | point value `a' to the double-precision floating-point format. The 5124 | conversion is performed according to the IEC/IEEE Standard for Binary 5125 | Floating-Point Arithmetic. 
5126 *----------------------------------------------------------------------------*/ 5127 5128 float64 floatx80_to_float64(floatx80 a, float_status *status) 5129 { 5130 flag aSign; 5131 int32_t aExp; 5132 uint64_t aSig, zSig; 5133 5134 if (floatx80_invalid_encoding(a)) { 5135 float_raise(float_flag_invalid, status); 5136 return float64_default_nan(status); 5137 } 5138 aSig = extractFloatx80Frac( a ); 5139 aExp = extractFloatx80Exp( a ); 5140 aSign = extractFloatx80Sign( a ); 5141 if ( aExp == 0x7FFF ) { 5142 if ( (uint64_t) ( aSig<<1 ) ) { 5143 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 5144 } 5145 return packFloat64( aSign, 0x7FF, 0 ); 5146 } 5147 shift64RightJamming( aSig, 1, &zSig ); 5148 if ( aExp || aSig ) aExp -= 0x3C01; 5149 return roundAndPackFloat64(aSign, aExp, zSig, status); 5150 5151 } 5152 5153 /*---------------------------------------------------------------------------- 5154 | Returns the result of converting the extended double-precision floating- 5155 | point value `a' to the quadruple-precision floating-point format. The 5156 | conversion is performed according to the IEC/IEEE Standard for Binary 5157 | Floating-Point Arithmetic. 
5158 *----------------------------------------------------------------------------*/ 5159 5160 float128 floatx80_to_float128(floatx80 a, float_status *status) 5161 { 5162 flag aSign; 5163 int aExp; 5164 uint64_t aSig, zSig0, zSig1; 5165 5166 if (floatx80_invalid_encoding(a)) { 5167 float_raise(float_flag_invalid, status); 5168 return float128_default_nan(status); 5169 } 5170 aSig = extractFloatx80Frac( a ); 5171 aExp = extractFloatx80Exp( a ); 5172 aSign = extractFloatx80Sign( a ); 5173 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5174 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 5175 } 5176 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5177 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5178 5179 } 5180 5181 /*---------------------------------------------------------------------------- 5182 | Rounds the extended double-precision floating-point value `a' 5183 | to the precision provided by floatx80_rounding_precision and returns the 5184 | result as an extended double-precision floating-point value. 5185 | The operation is performed according to the IEC/IEEE Standard for Binary 5186 | Floating-Point Arithmetic. 5187 *----------------------------------------------------------------------------*/ 5188 5189 floatx80 floatx80_round(floatx80 a, float_status *status) 5190 { 5191 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5192 extractFloatx80Sign(a), 5193 extractFloatx80Exp(a), 5194 extractFloatx80Frac(a), 0, status); 5195 } 5196 5197 /*---------------------------------------------------------------------------- 5198 | Rounds the extended double-precision floating-point value `a' to an integer, 5199 | and returns the result as an extended quadruple-precision floating-point 5200 | value. The operation is performed according to the IEC/IEEE Standard for 5201 | Binary Floating-Point Arithmetic. 
5202 *----------------------------------------------------------------------------*/ 5203 5204 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 5205 { 5206 flag aSign; 5207 int32_t aExp; 5208 uint64_t lastBitMask, roundBitsMask; 5209 floatx80 z; 5210 5211 if (floatx80_invalid_encoding(a)) { 5212 float_raise(float_flag_invalid, status); 5213 return floatx80_default_nan(status); 5214 } 5215 aExp = extractFloatx80Exp( a ); 5216 if ( 0x403E <= aExp ) { 5217 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 5218 return propagateFloatx80NaN(a, a, status); 5219 } 5220 return a; 5221 } 5222 if ( aExp < 0x3FFF ) { 5223 if ( ( aExp == 0 ) 5224 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 5225 return a; 5226 } 5227 status->float_exception_flags |= float_flag_inexact; 5228 aSign = extractFloatx80Sign( a ); 5229 switch (status->float_rounding_mode) { 5230 case float_round_nearest_even: 5231 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 5232 ) { 5233 return 5234 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5235 } 5236 break; 5237 case float_round_ties_away: 5238 if (aExp == 0x3FFE) { 5239 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 5240 } 5241 break; 5242 case float_round_down: 5243 return 5244 aSign ? 5245 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 5246 : packFloatx80( 0, 0, 0 ); 5247 case float_round_up: 5248 return 5249 aSign ? 
packFloatx80( 1, 0, 0 ) 5250 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5251 } 5252 return packFloatx80( aSign, 0, 0 ); 5253 } 5254 lastBitMask = 1; 5255 lastBitMask <<= 0x403E - aExp; 5256 roundBitsMask = lastBitMask - 1; 5257 z = a; 5258 switch (status->float_rounding_mode) { 5259 case float_round_nearest_even: 5260 z.low += lastBitMask>>1; 5261 if ((z.low & roundBitsMask) == 0) { 5262 z.low &= ~lastBitMask; 5263 } 5264 break; 5265 case float_round_ties_away: 5266 z.low += lastBitMask >> 1; 5267 break; 5268 case float_round_to_zero: 5269 break; 5270 case float_round_up: 5271 if (!extractFloatx80Sign(z)) { 5272 z.low += roundBitsMask; 5273 } 5274 break; 5275 case float_round_down: 5276 if (extractFloatx80Sign(z)) { 5277 z.low += roundBitsMask; 5278 } 5279 break; 5280 default: 5281 abort(); 5282 } 5283 z.low &= ~ roundBitsMask; 5284 if ( z.low == 0 ) { 5285 ++z.high; 5286 z.low = LIT64( 0x8000000000000000 ); 5287 } 5288 if (z.low != a.low) { 5289 status->float_exception_flags |= float_flag_inexact; 5290 } 5291 return z; 5292 5293 } 5294 5295 /*---------------------------------------------------------------------------- 5296 | Returns the result of adding the absolute values of the extended double- 5297 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 5298 | negated before being returned. `zSign' is ignored if the result is a NaN. 5299 | The addition is performed according to the IEC/IEEE Standard for Binary 5300 | Floating-Point Arithmetic. 
5301 *----------------------------------------------------------------------------*/ 5302 5303 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5304 float_status *status) 5305 { 5306 int32_t aExp, bExp, zExp; 5307 uint64_t aSig, bSig, zSig0, zSig1; 5308 int32_t expDiff; 5309 5310 aSig = extractFloatx80Frac( a ); 5311 aExp = extractFloatx80Exp( a ); 5312 bSig = extractFloatx80Frac( b ); 5313 bExp = extractFloatx80Exp( b ); 5314 expDiff = aExp - bExp; 5315 if ( 0 < expDiff ) { 5316 if ( aExp == 0x7FFF ) { 5317 if ((uint64_t)(aSig << 1)) { 5318 return propagateFloatx80NaN(a, b, status); 5319 } 5320 return a; 5321 } 5322 if ( bExp == 0 ) --expDiff; 5323 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5324 zExp = aExp; 5325 } 5326 else if ( expDiff < 0 ) { 5327 if ( bExp == 0x7FFF ) { 5328 if ((uint64_t)(bSig << 1)) { 5329 return propagateFloatx80NaN(a, b, status); 5330 } 5331 return packFloatx80(zSign, 5332 floatx80_infinity_high, 5333 floatx80_infinity_low); 5334 } 5335 if ( aExp == 0 ) ++expDiff; 5336 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5337 zExp = bExp; 5338 } 5339 else { 5340 if ( aExp == 0x7FFF ) { 5341 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5342 return propagateFloatx80NaN(a, b, status); 5343 } 5344 return a; 5345 } 5346 zSig1 = 0; 5347 zSig0 = aSig + bSig; 5348 if ( aExp == 0 ) { 5349 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 5350 goto roundAndPack; 5351 } 5352 zExp = aExp; 5353 goto shiftRight1; 5354 } 5355 zSig0 = aSig + bSig; 5356 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 5357 shiftRight1: 5358 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5359 zSig0 |= LIT64( 0x8000000000000000 ); 5360 ++zExp; 5361 roundAndPack: 5362 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5363 zSign, zExp, zSig0, zSig1, status); 5364 } 5365 5366 /*---------------------------------------------------------------------------- 5367 | Returns the result of subtracting the absolute 
values of the extended 5368 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 5369 | difference is negated before being returned. `zSign' is ignored if the 5370 | result is a NaN. The subtraction is performed according to the IEC/IEEE 5371 | Standard for Binary Floating-Point Arithmetic. 5372 *----------------------------------------------------------------------------*/ 5373 5374 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5375 float_status *status) 5376 { 5377 int32_t aExp, bExp, zExp; 5378 uint64_t aSig, bSig, zSig0, zSig1; 5379 int32_t expDiff; 5380 5381 aSig = extractFloatx80Frac( a ); 5382 aExp = extractFloatx80Exp( a ); 5383 bSig = extractFloatx80Frac( b ); 5384 bExp = extractFloatx80Exp( b ); 5385 expDiff = aExp - bExp; 5386 if ( 0 < expDiff ) goto aExpBigger; 5387 if ( expDiff < 0 ) goto bExpBigger; 5388 if ( aExp == 0x7FFF ) { 5389 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5390 return propagateFloatx80NaN(a, b, status); 5391 } 5392 float_raise(float_flag_invalid, status); 5393 return floatx80_default_nan(status); 5394 } 5395 if ( aExp == 0 ) { 5396 aExp = 1; 5397 bExp = 1; 5398 } 5399 zSig1 = 0; 5400 if ( bSig < aSig ) goto aBigger; 5401 if ( aSig < bSig ) goto bBigger; 5402 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 5403 bExpBigger: 5404 if ( bExp == 0x7FFF ) { 5405 if ((uint64_t)(bSig << 1)) { 5406 return propagateFloatx80NaN(a, b, status); 5407 } 5408 return packFloatx80(zSign ^ 1, floatx80_infinity_high, 5409 floatx80_infinity_low); 5410 } 5411 if ( aExp == 0 ) ++expDiff; 5412 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5413 bBigger: 5414 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 5415 zExp = bExp; 5416 zSign ^= 1; 5417 goto normalizeRoundAndPack; 5418 aExpBigger: 5419 if ( aExp == 0x7FFF ) { 5420 if ((uint64_t)(aSig << 1)) { 5421 return propagateFloatx80NaN(a, b, status); 5422 } 5423 return a; 5424 } 5425 if ( bExp == 0 ) --expDiff; 5426 
shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5427 aBigger: 5428 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 5429 zExp = aExp; 5430 normalizeRoundAndPack: 5431 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 5432 zSign, zExp, zSig0, zSig1, status); 5433 } 5434 5435 /*---------------------------------------------------------------------------- 5436 | Returns the result of adding the extended double-precision floating-point 5437 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5438 | Standard for Binary Floating-Point Arithmetic. 5439 *----------------------------------------------------------------------------*/ 5440 5441 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 5442 { 5443 flag aSign, bSign; 5444 5445 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5446 float_raise(float_flag_invalid, status); 5447 return floatx80_default_nan(status); 5448 } 5449 aSign = extractFloatx80Sign( a ); 5450 bSign = extractFloatx80Sign( b ); 5451 if ( aSign == bSign ) { 5452 return addFloatx80Sigs(a, b, aSign, status); 5453 } 5454 else { 5455 return subFloatx80Sigs(a, b, aSign, status); 5456 } 5457 5458 } 5459 5460 /*---------------------------------------------------------------------------- 5461 | Returns the result of subtracting the extended double-precision floating- 5462 | point values `a' and `b'. The operation is performed according to the 5463 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5464 *----------------------------------------------------------------------------*/ 5465 5466 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5467 { 5468 flag aSign, bSign; 5469 5470 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5471 float_raise(float_flag_invalid, status); 5472 return floatx80_default_nan(status); 5473 } 5474 aSign = extractFloatx80Sign( a ); 5475 bSign = extractFloatx80Sign( b ); 5476 if ( aSign == bSign ) { 5477 return subFloatx80Sigs(a, b, aSign, status); 5478 } 5479 else { 5480 return addFloatx80Sigs(a, b, aSign, status); 5481 } 5482 5483 } 5484 5485 /*---------------------------------------------------------------------------- 5486 | Returns the result of multiplying the extended double-precision floating- 5487 | point values `a' and `b'. The operation is performed according to the 5488 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5489 *----------------------------------------------------------------------------*/ 5490 5491 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5492 { 5493 flag aSign, bSign, zSign; 5494 int32_t aExp, bExp, zExp; 5495 uint64_t aSig, bSig, zSig0, zSig1; 5496 5497 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5498 float_raise(float_flag_invalid, status); 5499 return floatx80_default_nan(status); 5500 } 5501 aSig = extractFloatx80Frac( a ); 5502 aExp = extractFloatx80Exp( a ); 5503 aSign = extractFloatx80Sign( a ); 5504 bSig = extractFloatx80Frac( b ); 5505 bExp = extractFloatx80Exp( b ); 5506 bSign = extractFloatx80Sign( b ); 5507 zSign = aSign ^ bSign; 5508 if ( aExp == 0x7FFF ) { 5509 if ( (uint64_t) ( aSig<<1 ) 5510 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5511 return propagateFloatx80NaN(a, b, status); 5512 } 5513 if ( ( bExp | bSig ) == 0 ) goto invalid; 5514 return packFloatx80(zSign, floatx80_infinity_high, 5515 floatx80_infinity_low); 5516 } 5517 if ( bExp == 0x7FFF ) { 5518 if ((uint64_t)(bSig << 
1)) { 5519 return propagateFloatx80NaN(a, b, status); 5520 } 5521 if ( ( aExp | aSig ) == 0 ) { 5522 invalid: 5523 float_raise(float_flag_invalid, status); 5524 return floatx80_default_nan(status); 5525 } 5526 return packFloatx80(zSign, floatx80_infinity_high, 5527 floatx80_infinity_low); 5528 } 5529 if ( aExp == 0 ) { 5530 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5531 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5532 } 5533 if ( bExp == 0 ) { 5534 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5535 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5536 } 5537 zExp = aExp + bExp - 0x3FFE; 5538 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 5539 if ( 0 < (int64_t) zSig0 ) { 5540 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5541 --zExp; 5542 } 5543 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5544 zSign, zExp, zSig0, zSig1, status); 5545 } 5546 5547 /*---------------------------------------------------------------------------- 5548 | Returns the result of dividing the extended double-precision floating-point 5549 | value `a' by the corresponding value `b'. The operation is performed 5550 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5551 *----------------------------------------------------------------------------*/ 5552 5553 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 5554 { 5555 flag aSign, bSign, zSign; 5556 int32_t aExp, bExp, zExp; 5557 uint64_t aSig, bSig, zSig0, zSig1; 5558 uint64_t rem0, rem1, rem2, term0, term1, term2; 5559 5560 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5561 float_raise(float_flag_invalid, status); 5562 return floatx80_default_nan(status); 5563 } 5564 aSig = extractFloatx80Frac( a ); 5565 aExp = extractFloatx80Exp( a ); 5566 aSign = extractFloatx80Sign( a ); 5567 bSig = extractFloatx80Frac( b ); 5568 bExp = extractFloatx80Exp( b ); 5569 bSign = extractFloatx80Sign( b ); 5570 zSign = aSign ^ bSign; 5571 if ( aExp == 0x7FFF ) { 5572 if ((uint64_t)(aSig << 1)) { 5573 return propagateFloatx80NaN(a, b, status); 5574 } 5575 if ( bExp == 0x7FFF ) { 5576 if ((uint64_t)(bSig << 1)) { 5577 return propagateFloatx80NaN(a, b, status); 5578 } 5579 goto invalid; 5580 } 5581 return packFloatx80(zSign, floatx80_infinity_high, 5582 floatx80_infinity_low); 5583 } 5584 if ( bExp == 0x7FFF ) { 5585 if ((uint64_t)(bSig << 1)) { 5586 return propagateFloatx80NaN(a, b, status); 5587 } 5588 return packFloatx80( zSign, 0, 0 ); 5589 } 5590 if ( bExp == 0 ) { 5591 if ( bSig == 0 ) { 5592 if ( ( aExp | aSig ) == 0 ) { 5593 invalid: 5594 float_raise(float_flag_invalid, status); 5595 return floatx80_default_nan(status); 5596 } 5597 float_raise(float_flag_divbyzero, status); 5598 return packFloatx80(zSign, floatx80_infinity_high, 5599 floatx80_infinity_low); 5600 } 5601 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5602 } 5603 if ( aExp == 0 ) { 5604 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5605 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5606 } 5607 zExp = aExp - bExp + 0x3FFE; 5608 rem1 = 0; 5609 if ( bSig <= aSig ) { 5610 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 5611 ++zExp; 5612 } 5613 zSig0 = estimateDiv128To64( aSig, 
rem1, bSig ); 5614 mul64To128( bSig, zSig0, &term0, &term1 ); 5615 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 5616 while ( (int64_t) rem0 < 0 ) { 5617 --zSig0; 5618 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 5619 } 5620 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 5621 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 5622 mul64To128( bSig, zSig1, &term1, &term2 ); 5623 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5624 while ( (int64_t) rem1 < 0 ) { 5625 --zSig1; 5626 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5627 } 5628 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5629 } 5630 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5631 zSign, zExp, zSig0, zSig1, status); 5632 } 5633 5634 /*---------------------------------------------------------------------------- 5635 | Returns the remainder of the extended double-precision floating-point value 5636 | `a' with respect to the corresponding value `b'. The operation is performed 5637 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5638 *----------------------------------------------------------------------------*/ 5639 5640 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 5641 { 5642 flag aSign, zSign; 5643 int32_t aExp, bExp, expDiff; 5644 uint64_t aSig0, aSig1, bSig; 5645 uint64_t q, term0, term1, alternateASig0, alternateASig1; 5646 5647 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5648 float_raise(float_flag_invalid, status); 5649 return floatx80_default_nan(status); 5650 } 5651 aSig0 = extractFloatx80Frac( a ); 5652 aExp = extractFloatx80Exp( a ); 5653 aSign = extractFloatx80Sign( a ); 5654 bSig = extractFloatx80Frac( b ); 5655 bExp = extractFloatx80Exp( b ); 5656 if ( aExp == 0x7FFF ) { 5657 if ( (uint64_t) ( aSig0<<1 ) 5658 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5659 return propagateFloatx80NaN(a, b, status); 5660 } 5661 goto invalid; 5662 } 5663 if ( bExp == 0x7FFF ) { 5664 if ((uint64_t)(bSig << 1)) { 5665 return propagateFloatx80NaN(a, b, status); 5666 } 5667 return a; 5668 } 5669 if ( bExp == 0 ) { 5670 if ( bSig == 0 ) { 5671 invalid: 5672 float_raise(float_flag_invalid, status); 5673 return floatx80_default_nan(status); 5674 } 5675 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5676 } 5677 if ( aExp == 0 ) { 5678 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5679 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5680 } 5681 bSig |= LIT64( 0x8000000000000000 ); 5682 zSign = aSign; 5683 expDiff = aExp - bExp; 5684 aSig1 = 0; 5685 if ( expDiff < 0 ) { 5686 if ( expDiff < -1 ) return a; 5687 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 5688 expDiff = 0; 5689 } 5690 q = ( bSig <= aSig0 ); 5691 if ( q ) aSig0 -= bSig; 5692 expDiff -= 64; 5693 while ( 0 < expDiff ) { 5694 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5695 q = ( 2 < q ) ? 
q - 2 : 0; 5696 mul64To128( bSig, q, &term0, &term1 ); 5697 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5698 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 5699 expDiff -= 62; 5700 } 5701 expDiff += 64; 5702 if ( 0 < expDiff ) { 5703 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5704 q = ( 2 < q ) ? q - 2 : 0; 5705 q >>= 64 - expDiff; 5706 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 5707 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5708 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 5709 while ( le128( term0, term1, aSig0, aSig1 ) ) { 5710 ++q; 5711 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5712 } 5713 } 5714 else { 5715 term1 = 0; 5716 term0 = bSig; 5717 } 5718 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 5719 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5720 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5721 && ( q & 1 ) ) 5722 ) { 5723 aSig0 = alternateASig0; 5724 aSig1 = alternateASig1; 5725 zSign = ! zSign; 5726 } 5727 return 5728 normalizeRoundAndPackFloatx80( 5729 80, zSign, bExp + expDiff, aSig0, aSig1, status); 5730 5731 } 5732 5733 /*---------------------------------------------------------------------------- 5734 | Returns the square root of the extended double-precision floating-point 5735 | value `a'. The operation is performed according to the IEC/IEEE Standard 5736 | for Binary Floating-Point Arithmetic. 
5737 *----------------------------------------------------------------------------*/ 5738 5739 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 5740 { 5741 flag aSign; 5742 int32_t aExp, zExp; 5743 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 5744 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5745 5746 if (floatx80_invalid_encoding(a)) { 5747 float_raise(float_flag_invalid, status); 5748 return floatx80_default_nan(status); 5749 } 5750 aSig0 = extractFloatx80Frac( a ); 5751 aExp = extractFloatx80Exp( a ); 5752 aSign = extractFloatx80Sign( a ); 5753 if ( aExp == 0x7FFF ) { 5754 if ((uint64_t)(aSig0 << 1)) { 5755 return propagateFloatx80NaN(a, a, status); 5756 } 5757 if ( ! aSign ) return a; 5758 goto invalid; 5759 } 5760 if ( aSign ) { 5761 if ( ( aExp | aSig0 ) == 0 ) return a; 5762 invalid: 5763 float_raise(float_flag_invalid, status); 5764 return floatx80_default_nan(status); 5765 } 5766 if ( aExp == 0 ) { 5767 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 5768 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5769 } 5770 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 5771 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 5772 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 5773 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5774 doubleZSig0 = zSig0<<1; 5775 mul64To128( zSig0, zSig0, &term0, &term1 ); 5776 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5777 while ( (int64_t) rem0 < 0 ) { 5778 --zSig0; 5779 doubleZSig0 -= 2; 5780 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5781 } 5782 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5783 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 5784 if ( zSig1 == 0 ) zSig1 = 1; 5785 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5786 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5787 mul64To128( zSig1, zSig1, &term2, &term3 ); 5788 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5789 while ( (int64_t) rem1 < 0 ) { 
5790 --zSig1; 5791 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5792 term3 |= 1; 5793 term2 |= doubleZSig0; 5794 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5795 } 5796 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5797 } 5798 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 5799 zSig0 |= doubleZSig0; 5800 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5801 0, zExp, zSig0, zSig1, status); 5802 } 5803 5804 /*---------------------------------------------------------------------------- 5805 | Returns 1 if the extended double-precision floating-point value `a' is equal 5806 | to the corresponding value `b', and 0 otherwise. The invalid exception is 5807 | raised if either operand is a NaN. Otherwise, the comparison is performed 5808 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5809 *----------------------------------------------------------------------------*/ 5810 5811 int floatx80_eq(floatx80 a, floatx80 b, float_status *status) 5812 { 5813 5814 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5815 || (extractFloatx80Exp(a) == 0x7FFF 5816 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5817 || (extractFloatx80Exp(b) == 0x7FFF 5818 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5819 ) { 5820 float_raise(float_flag_invalid, status); 5821 return 0; 5822 } 5823 return 5824 ( a.low == b.low ) 5825 && ( ( a.high == b.high ) 5826 || ( ( a.low == 0 ) 5827 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5828 ); 5829 5830 } 5831 5832 /*---------------------------------------------------------------------------- 5833 | Returns 1 if the extended double-precision floating-point value `a' is 5834 | less than or equal to the corresponding value `b', and 0 otherwise. The 5835 | invalid exception is raised if either operand is a NaN. The comparison is 5836 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5837 | Arithmetic. 
5838 *----------------------------------------------------------------------------*/ 5839 5840 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 5841 { 5842 flag aSign, bSign; 5843 5844 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5845 || (extractFloatx80Exp(a) == 0x7FFF 5846 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5847 || (extractFloatx80Exp(b) == 0x7FFF 5848 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5849 ) { 5850 float_raise(float_flag_invalid, status); 5851 return 0; 5852 } 5853 aSign = extractFloatx80Sign( a ); 5854 bSign = extractFloatx80Sign( b ); 5855 if ( aSign != bSign ) { 5856 return 5857 aSign 5858 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5859 == 0 ); 5860 } 5861 return 5862 aSign ? le128( b.high, b.low, a.high, a.low ) 5863 : le128( a.high, a.low, b.high, b.low ); 5864 5865 } 5866 5867 /*---------------------------------------------------------------------------- 5868 | Returns 1 if the extended double-precision floating-point value `a' is 5869 | less than the corresponding value `b', and 0 otherwise. The invalid 5870 | exception is raised if either operand is a NaN. The comparison is performed 5871 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5872 *----------------------------------------------------------------------------*/ 5873 5874 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 5875 { 5876 flag aSign, bSign; 5877 5878 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5879 || (extractFloatx80Exp(a) == 0x7FFF 5880 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5881 || (extractFloatx80Exp(b) == 0x7FFF 5882 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5883 ) { 5884 float_raise(float_flag_invalid, status); 5885 return 0; 5886 } 5887 aSign = extractFloatx80Sign( a ); 5888 bSign = extractFloatx80Sign( b ); 5889 if ( aSign != bSign ) { 5890 return 5891 aSign 5892 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5893 != 0 ); 5894 } 5895 return 5896 aSign ? lt128( b.high, b.low, a.high, a.low ) 5897 : lt128( a.high, a.low, b.high, b.low ); 5898 5899 } 5900 5901 /*---------------------------------------------------------------------------- 5902 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5903 | cannot be compared, and 0 otherwise. The invalid exception is raised if 5904 | either operand is a NaN. The comparison is performed according to the 5905 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5906 *----------------------------------------------------------------------------*/ 5907 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 5908 { 5909 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5910 || (extractFloatx80Exp(a) == 0x7FFF 5911 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5912 || (extractFloatx80Exp(b) == 0x7FFF 5913 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5914 ) { 5915 float_raise(float_flag_invalid, status); 5916 return 1; 5917 } 5918 return 0; 5919 } 5920 5921 /*---------------------------------------------------------------------------- 5922 | Returns 1 if the extended double-precision floating-point value `a' is 5923 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5924 | cause an exception. The comparison is performed according to the IEC/IEEE 5925 | Standard for Binary Floating-Point Arithmetic. 5926 *----------------------------------------------------------------------------*/ 5927 5928 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 5929 { 5930 5931 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5932 float_raise(float_flag_invalid, status); 5933 return 0; 5934 } 5935 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5936 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5937 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5938 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5939 ) { 5940 if (floatx80_is_signaling_nan(a, status) 5941 || floatx80_is_signaling_nan(b, status)) { 5942 float_raise(float_flag_invalid, status); 5943 } 5944 return 0; 5945 } 5946 return 5947 ( a.low == b.low ) 5948 && ( ( a.high == b.high ) 5949 || ( ( a.low == 0 ) 5950 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5951 ); 5952 5953 } 5954 5955 /*---------------------------------------------------------------------------- 5956 | Returns 1 if the extended double-precision floating-point value `a' is less 5957 | than or equal to the corresponding value `b', and 0 
otherwise. Quiet NaNs 5958 | do not cause an exception. Otherwise, the comparison is performed according 5959 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5960 *----------------------------------------------------------------------------*/ 5961 5962 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status) 5963 { 5964 flag aSign, bSign; 5965 5966 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5967 float_raise(float_flag_invalid, status); 5968 return 0; 5969 } 5970 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5971 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5972 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5973 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5974 ) { 5975 if (floatx80_is_signaling_nan(a, status) 5976 || floatx80_is_signaling_nan(b, status)) { 5977 float_raise(float_flag_invalid, status); 5978 } 5979 return 0; 5980 } 5981 aSign = extractFloatx80Sign( a ); 5982 bSign = extractFloatx80Sign( b ); 5983 if ( aSign != bSign ) { 5984 return 5985 aSign 5986 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5987 == 0 ); 5988 } 5989 return 5990 aSign ? le128( b.high, b.low, a.high, a.low ) 5991 : le128( a.high, a.low, b.high, b.low ); 5992 5993 } 5994 5995 /*---------------------------------------------------------------------------- 5996 | Returns 1 if the extended double-precision floating-point value `a' is less 5997 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 5998 | an exception. Otherwise, the comparison is performed according to the 5999 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6000 *----------------------------------------------------------------------------*/ 6001 6002 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 6003 { 6004 flag aSign, bSign; 6005 6006 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6007 float_raise(float_flag_invalid, status); 6008 return 0; 6009 } 6010 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6011 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6012 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6013 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6014 ) { 6015 if (floatx80_is_signaling_nan(a, status) 6016 || floatx80_is_signaling_nan(b, status)) { 6017 float_raise(float_flag_invalid, status); 6018 } 6019 return 0; 6020 } 6021 aSign = extractFloatx80Sign( a ); 6022 bSign = extractFloatx80Sign( b ); 6023 if ( aSign != bSign ) { 6024 return 6025 aSign 6026 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6027 != 0 ); 6028 } 6029 return 6030 aSign ? lt128( b.high, b.low, a.high, a.low ) 6031 : lt128( a.high, a.low, b.high, b.low ); 6032 6033 } 6034 6035 /*---------------------------------------------------------------------------- 6036 | Returns 1 if the extended double-precision floating-point values `a' and `b' 6037 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 6038 | The comparison is performed according to the IEC/IEEE Standard for Binary 6039 | Floating-Point Arithmetic. 
6040 *----------------------------------------------------------------------------*/ 6041 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 6042 { 6043 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6044 float_raise(float_flag_invalid, status); 6045 return 1; 6046 } 6047 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6048 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6049 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6050 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6051 ) { 6052 if (floatx80_is_signaling_nan(a, status) 6053 || floatx80_is_signaling_nan(b, status)) { 6054 float_raise(float_flag_invalid, status); 6055 } 6056 return 1; 6057 } 6058 return 0; 6059 } 6060 6061 /*---------------------------------------------------------------------------- 6062 | Returns the result of converting the quadruple-precision floating-point 6063 | value `a' to the 32-bit two's complement integer format. The conversion 6064 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6065 | Arithmetic---which means in particular that the conversion is rounded 6066 | according to the current rounding mode. If `a' is a NaN, the largest 6067 | positive integer is returned. Otherwise, if the conversion overflows, the 6068 | largest integer with the same sign as `a' is returned. 
6069 *----------------------------------------------------------------------------*/ 6070 6071 int32_t float128_to_int32(float128 a, float_status *status) 6072 { 6073 flag aSign; 6074 int32_t aExp, shiftCount; 6075 uint64_t aSig0, aSig1; 6076 6077 aSig1 = extractFloat128Frac1( a ); 6078 aSig0 = extractFloat128Frac0( a ); 6079 aExp = extractFloat128Exp( a ); 6080 aSign = extractFloat128Sign( a ); 6081 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 6082 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6083 aSig0 |= ( aSig1 != 0 ); 6084 shiftCount = 0x4028 - aExp; 6085 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 6086 return roundAndPackInt32(aSign, aSig0, status); 6087 6088 } 6089 6090 /*---------------------------------------------------------------------------- 6091 | Returns the result of converting the quadruple-precision floating-point 6092 | value `a' to the 32-bit two's complement integer format. The conversion 6093 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6094 | Arithmetic, except that the conversion is always rounded toward zero. If 6095 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 6096 | conversion overflows, the largest integer with the same sign as `a' is 6097 | returned. 
6098 *----------------------------------------------------------------------------*/ 6099 6100 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 6101 { 6102 flag aSign; 6103 int32_t aExp, shiftCount; 6104 uint64_t aSig0, aSig1, savedASig; 6105 int32_t z; 6106 6107 aSig1 = extractFloat128Frac1( a ); 6108 aSig0 = extractFloat128Frac0( a ); 6109 aExp = extractFloat128Exp( a ); 6110 aSign = extractFloat128Sign( a ); 6111 aSig0 |= ( aSig1 != 0 ); 6112 if ( 0x401E < aExp ) { 6113 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 6114 goto invalid; 6115 } 6116 else if ( aExp < 0x3FFF ) { 6117 if (aExp || aSig0) { 6118 status->float_exception_flags |= float_flag_inexact; 6119 } 6120 return 0; 6121 } 6122 aSig0 |= LIT64( 0x0001000000000000 ); 6123 shiftCount = 0x402F - aExp; 6124 savedASig = aSig0; 6125 aSig0 >>= shiftCount; 6126 z = aSig0; 6127 if ( aSign ) z = - z; 6128 if ( ( z < 0 ) ^ aSign ) { 6129 invalid: 6130 float_raise(float_flag_invalid, status); 6131 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 6132 } 6133 if ( ( aSig0<<shiftCount ) != savedASig ) { 6134 status->float_exception_flags |= float_flag_inexact; 6135 } 6136 return z; 6137 6138 } 6139 6140 /*---------------------------------------------------------------------------- 6141 | Returns the result of converting the quadruple-precision floating-point 6142 | value `a' to the 64-bit two's complement integer format. The conversion 6143 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6144 | Arithmetic---which means in particular that the conversion is rounded 6145 | according to the current rounding mode. If `a' is a NaN, the largest 6146 | positive integer is returned. Otherwise, if the conversion overflows, the 6147 | largest integer with the same sign as `a' is returned. 
6148 *----------------------------------------------------------------------------*/ 6149 6150 int64_t float128_to_int64(float128 a, float_status *status) 6151 { 6152 flag aSign; 6153 int32_t aExp, shiftCount; 6154 uint64_t aSig0, aSig1; 6155 6156 aSig1 = extractFloat128Frac1( a ); 6157 aSig0 = extractFloat128Frac0( a ); 6158 aExp = extractFloat128Exp( a ); 6159 aSign = extractFloat128Sign( a ); 6160 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6161 shiftCount = 0x402F - aExp; 6162 if ( shiftCount <= 0 ) { 6163 if ( 0x403E < aExp ) { 6164 float_raise(float_flag_invalid, status); 6165 if ( ! aSign 6166 || ( ( aExp == 0x7FFF ) 6167 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 6168 ) 6169 ) { 6170 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6171 } 6172 return (int64_t) LIT64( 0x8000000000000000 ); 6173 } 6174 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 6175 } 6176 else { 6177 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 6178 } 6179 return roundAndPackInt64(aSign, aSig0, aSig1, status); 6180 6181 } 6182 6183 /*---------------------------------------------------------------------------- 6184 | Returns the result of converting the quadruple-precision floating-point 6185 | value `a' to the 64-bit two's complement integer format. The conversion 6186 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6187 | Arithmetic, except that the conversion is always rounded toward zero. 6188 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 6189 | the conversion overflows, the largest integer with the same sign as `a' is 6190 | returned. 
6191 *----------------------------------------------------------------------------*/ 6192 6193 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 6194 { 6195 flag aSign; 6196 int32_t aExp, shiftCount; 6197 uint64_t aSig0, aSig1; 6198 int64_t z; 6199 6200 aSig1 = extractFloat128Frac1( a ); 6201 aSig0 = extractFloat128Frac0( a ); 6202 aExp = extractFloat128Exp( a ); 6203 aSign = extractFloat128Sign( a ); 6204 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6205 shiftCount = aExp - 0x402F; 6206 if ( 0 < shiftCount ) { 6207 if ( 0x403E <= aExp ) { 6208 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 6209 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 6210 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 6211 if (aSig1) { 6212 status->float_exception_flags |= float_flag_inexact; 6213 } 6214 } 6215 else { 6216 float_raise(float_flag_invalid, status); 6217 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 6218 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6219 } 6220 } 6221 return (int64_t) LIT64( 0x8000000000000000 ); 6222 } 6223 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 6224 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 6225 status->float_exception_flags |= float_flag_inexact; 6226 } 6227 } 6228 else { 6229 if ( aExp < 0x3FFF ) { 6230 if ( aExp | aSig0 | aSig1 ) { 6231 status->float_exception_flags |= float_flag_inexact; 6232 } 6233 return 0; 6234 } 6235 z = aSig0>>( - shiftCount ); 6236 if ( aSig1 6237 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 6238 status->float_exception_flags |= float_flag_inexact; 6239 } 6240 } 6241 if ( aSign ) z = - z; 6242 return z; 6243 6244 } 6245 6246 /*---------------------------------------------------------------------------- 6247 | Returns the result of converting the quadruple-precision floating-point value 6248 | `a' to the 64-bit unsigned integer format. 
The conversion is 6249 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6250 | Arithmetic---which means in particular that the conversion is rounded 6251 | according to the current rounding mode. If `a' is a NaN, the largest 6252 | positive integer is returned. If the conversion overflows, the 6253 | largest unsigned integer is returned. If 'a' is negative, the value is 6254 | rounded and zero is returned; negative values that do not round to zero 6255 | will raise the inexact exception. 6256 *----------------------------------------------------------------------------*/ 6257 6258 uint64_t float128_to_uint64(float128 a, float_status *status) 6259 { 6260 flag aSign; 6261 int aExp; 6262 int shiftCount; 6263 uint64_t aSig0, aSig1; 6264 6265 aSig0 = extractFloat128Frac0(a); 6266 aSig1 = extractFloat128Frac1(a); 6267 aExp = extractFloat128Exp(a); 6268 aSign = extractFloat128Sign(a); 6269 if (aSign && (aExp > 0x3FFE)) { 6270 float_raise(float_flag_invalid, status); 6271 if (float128_is_any_nan(a)) { 6272 return LIT64(0xFFFFFFFFFFFFFFFF); 6273 } else { 6274 return 0; 6275 } 6276 } 6277 if (aExp) { 6278 aSig0 |= LIT64(0x0001000000000000); 6279 } 6280 shiftCount = 0x402F - aExp; 6281 if (shiftCount <= 0) { 6282 if (0x403E < aExp) { 6283 float_raise(float_flag_invalid, status); 6284 return LIT64(0xFFFFFFFFFFFFFFFF); 6285 } 6286 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 6287 } else { 6288 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 6289 } 6290 return roundAndPackUint64(aSign, aSig0, aSig1, status); 6291 } 6292 6293 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 6294 { 6295 uint64_t v; 6296 signed char current_rounding_mode = status->float_rounding_mode; 6297 6298 set_float_rounding_mode(float_round_to_zero, status); 6299 v = float128_to_uint64(a, status); 6300 set_float_rounding_mode(current_rounding_mode, status); 6301 6302 return v; 6303 } 6304 6305 
/*---------------------------------------------------------------------------- 6306 | Returns the result of converting the quadruple-precision floating-point 6307 | value `a' to the 32-bit unsigned integer format. The conversion 6308 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6309 | Arithmetic except that the conversion is always rounded toward zero. 6310 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6311 | if the conversion overflows, the largest unsigned integer is returned. 6312 | If 'a' is negative, the value is rounded and zero is returned; negative 6313 | values that do not round to zero will raise the inexact exception. 6314 *----------------------------------------------------------------------------*/ 6315 6316 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6317 { 6318 uint64_t v; 6319 uint32_t res; 6320 int old_exc_flags = get_float_exception_flags(status); 6321 6322 v = float128_to_uint64_round_to_zero(a, status); 6323 if (v > 0xffffffff) { 6324 res = 0xffffffff; 6325 } else { 6326 return v; 6327 } 6328 set_float_exception_flags(old_exc_flags, status); 6329 float_raise(float_flag_invalid, status); 6330 return res; 6331 } 6332 6333 /*---------------------------------------------------------------------------- 6334 | Returns the result of converting the quadruple-precision floating-point 6335 | value `a' to the single-precision floating-point format. The conversion 6336 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6337 | Arithmetic. 
6338 *----------------------------------------------------------------------------*/ 6339 6340 float32 float128_to_float32(float128 a, float_status *status) 6341 { 6342 flag aSign; 6343 int32_t aExp; 6344 uint64_t aSig0, aSig1; 6345 uint32_t zSig; 6346 6347 aSig1 = extractFloat128Frac1( a ); 6348 aSig0 = extractFloat128Frac0( a ); 6349 aExp = extractFloat128Exp( a ); 6350 aSign = extractFloat128Sign( a ); 6351 if ( aExp == 0x7FFF ) { 6352 if ( aSig0 | aSig1 ) { 6353 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6354 } 6355 return packFloat32( aSign, 0xFF, 0 ); 6356 } 6357 aSig0 |= ( aSig1 != 0 ); 6358 shift64RightJamming( aSig0, 18, &aSig0 ); 6359 zSig = aSig0; 6360 if ( aExp || zSig ) { 6361 zSig |= 0x40000000; 6362 aExp -= 0x3F81; 6363 } 6364 return roundAndPackFloat32(aSign, aExp, zSig, status); 6365 6366 } 6367 6368 /*---------------------------------------------------------------------------- 6369 | Returns the result of converting the quadruple-precision floating-point 6370 | value `a' to the double-precision floating-point format. The conversion 6371 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6372 | Arithmetic. 
6373 *----------------------------------------------------------------------------*/ 6374 6375 float64 float128_to_float64(float128 a, float_status *status) 6376 { 6377 flag aSign; 6378 int32_t aExp; 6379 uint64_t aSig0, aSig1; 6380 6381 aSig1 = extractFloat128Frac1( a ); 6382 aSig0 = extractFloat128Frac0( a ); 6383 aExp = extractFloat128Exp( a ); 6384 aSign = extractFloat128Sign( a ); 6385 if ( aExp == 0x7FFF ) { 6386 if ( aSig0 | aSig1 ) { 6387 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6388 } 6389 return packFloat64( aSign, 0x7FF, 0 ); 6390 } 6391 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6392 aSig0 |= ( aSig1 != 0 ); 6393 if ( aExp || aSig0 ) { 6394 aSig0 |= LIT64( 0x4000000000000000 ); 6395 aExp -= 0x3C01; 6396 } 6397 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6398 6399 } 6400 6401 /*---------------------------------------------------------------------------- 6402 | Returns the result of converting the quadruple-precision floating-point 6403 | value `a' to the extended double-precision floating-point format. The 6404 | conversion is performed according to the IEC/IEEE Standard for Binary 6405 | Floating-Point Arithmetic. 
6406 *----------------------------------------------------------------------------*/ 6407 6408 floatx80 float128_to_floatx80(float128 a, float_status *status) 6409 { 6410 flag aSign; 6411 int32_t aExp; 6412 uint64_t aSig0, aSig1; 6413 6414 aSig1 = extractFloat128Frac1( a ); 6415 aSig0 = extractFloat128Frac0( a ); 6416 aExp = extractFloat128Exp( a ); 6417 aSign = extractFloat128Sign( a ); 6418 if ( aExp == 0x7FFF ) { 6419 if ( aSig0 | aSig1 ) { 6420 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 6421 } 6422 return packFloatx80(aSign, floatx80_infinity_high, 6423 floatx80_infinity_low); 6424 } 6425 if ( aExp == 0 ) { 6426 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6427 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6428 } 6429 else { 6430 aSig0 |= LIT64( 0x0001000000000000 ); 6431 } 6432 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6433 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6434 6435 } 6436 6437 /*---------------------------------------------------------------------------- 6438 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6439 | returns the result as a quadruple-precision floating-point value. The 6440 | operation is performed according to the IEC/IEEE Standard for Binary 6441 | Floating-Point Arithmetic. 
6442 *----------------------------------------------------------------------------*/ 6443 6444 float128 float128_round_to_int(float128 a, float_status *status) 6445 { 6446 flag aSign; 6447 int32_t aExp; 6448 uint64_t lastBitMask, roundBitsMask; 6449 float128 z; 6450 6451 aExp = extractFloat128Exp( a ); 6452 if ( 0x402F <= aExp ) { 6453 if ( 0x406F <= aExp ) { 6454 if ( ( aExp == 0x7FFF ) 6455 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 6456 ) { 6457 return propagateFloat128NaN(a, a, status); 6458 } 6459 return a; 6460 } 6461 lastBitMask = 1; 6462 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 6463 roundBitsMask = lastBitMask - 1; 6464 z = a; 6465 switch (status->float_rounding_mode) { 6466 case float_round_nearest_even: 6467 if ( lastBitMask ) { 6468 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 6469 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 6470 } 6471 else { 6472 if ( (int64_t) z.low < 0 ) { 6473 ++z.high; 6474 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 6475 } 6476 } 6477 break; 6478 case float_round_ties_away: 6479 if (lastBitMask) { 6480 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 6481 } else { 6482 if ((int64_t) z.low < 0) { 6483 ++z.high; 6484 } 6485 } 6486 break; 6487 case float_round_to_zero: 6488 break; 6489 case float_round_up: 6490 if (!extractFloat128Sign(z)) { 6491 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6492 } 6493 break; 6494 case float_round_down: 6495 if (extractFloat128Sign(z)) { 6496 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6497 } 6498 break; 6499 default: 6500 abort(); 6501 } 6502 z.low &= ~ roundBitsMask; 6503 } 6504 else { 6505 if ( aExp < 0x3FFF ) { 6506 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 6507 status->float_exception_flags |= float_flag_inexact; 6508 aSign = extractFloat128Sign( a ); 6509 switch (status->float_rounding_mode) { 6510 case float_round_nearest_even: 6511 if ( ( aExp == 0x3FFE ) 6512 && ( 
extractFloat128Frac0( a ) 6513 | extractFloat128Frac1( a ) ) 6514 ) { 6515 return packFloat128( aSign, 0x3FFF, 0, 0 ); 6516 } 6517 break; 6518 case float_round_ties_away: 6519 if (aExp == 0x3FFE) { 6520 return packFloat128(aSign, 0x3FFF, 0, 0); 6521 } 6522 break; 6523 case float_round_down: 6524 return 6525 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 6526 : packFloat128( 0, 0, 0, 0 ); 6527 case float_round_up: 6528 return 6529 aSign ? packFloat128( 1, 0, 0, 0 ) 6530 : packFloat128( 0, 0x3FFF, 0, 0 ); 6531 } 6532 return packFloat128( aSign, 0, 0, 0 ); 6533 } 6534 lastBitMask = 1; 6535 lastBitMask <<= 0x402F - aExp; 6536 roundBitsMask = lastBitMask - 1; 6537 z.low = 0; 6538 z.high = a.high; 6539 switch (status->float_rounding_mode) { 6540 case float_round_nearest_even: 6541 z.high += lastBitMask>>1; 6542 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 6543 z.high &= ~ lastBitMask; 6544 } 6545 break; 6546 case float_round_ties_away: 6547 z.high += lastBitMask>>1; 6548 break; 6549 case float_round_to_zero: 6550 break; 6551 case float_round_up: 6552 if (!extractFloat128Sign(z)) { 6553 z.high |= ( a.low != 0 ); 6554 z.high += roundBitsMask; 6555 } 6556 break; 6557 case float_round_down: 6558 if (extractFloat128Sign(z)) { 6559 z.high |= (a.low != 0); 6560 z.high += roundBitsMask; 6561 } 6562 break; 6563 default: 6564 abort(); 6565 } 6566 z.high &= ~ roundBitsMask; 6567 } 6568 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 6569 status->float_exception_flags |= float_flag_inexact; 6570 } 6571 return z; 6572 6573 } 6574 6575 /*---------------------------------------------------------------------------- 6576 | Returns the result of adding the absolute values of the quadruple-precision 6577 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 6578 | before being returned. `zSign' is ignored if the result is a NaN. 6579 | The addition is performed according to the IEC/IEEE Standard for Binary 6580 | Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

/*
 * Magnitude-addition helper.  Extracts exponent and both significand words of
 * `a' and `b', right-shifts (with jamming) the significand of the operand
 * with the smaller exponent field, adds the significands and passes the sum
 * to roundAndPackFloat128() together with `zSign'.  Operands whose exponent
 * field is 0x7FFF are handled up front: a non-zero fraction goes through
 * propagateFloat128NaN(), a zero fraction yields exponent 0x7FFF with a zero
 * fraction in the result.
 */
static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* `a' has the larger exponent field; `b' is the one to be shifted. */
        if ( aExp == 0x7FFF ) {
            if (aSig0 | aSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) {
            /* Exponent field 0: no leading bit is OR-ed in and the shift
               distance is one less. */
            --expDiff;
        }
        else {
            /* Make the implicit leading bit (bit 48 of the high word)
               explicit before shifting. */
            bSig0 |= LIT64( 0x0001000000000000 );
        }
        /* Bits shifted out of the 128-bit significand end up in zSig2. */
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* Mirror image of the branch above with the roles of `a' and `b'
           exchanged. */
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig0 |= LIT64( 0x0001000000000000 );
        }
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponent fields: no alignment shift is needed. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) {
            /* Both exponent fields are 0.  The raw fraction sum is packed
               with exponent field 0, unless flush_to_zero is set, in which
               case a zero is returned and a non-zero sum raises
               float_flag_output_denormal. */
            if (status->flush_to_zero) {
                if (zSig0 | zSig1) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat128(zSign, 0, 0, 0);
            }
            return packFloat128( zSign, 0, zSig0, zSig1 );
        }
        zSig2 = 0;
        /* Bit 49 stands for the sum of the two implicit leading bits
           (2 * bit 48), neither of which was OR-ed in on this path. */
        zSig0 |= LIT64( 0x0002000000000000 );
        zExp = aExp;
        goto shiftRight1;
    }
    /* Unequal exponents: OR in the leading bit of the unshifted operand.  It
       is always placed in aSig0; the two significands are only summed below,
       so it does not matter which variable carries it. */
    aSig0 |= LIT64( 0x0001000000000000 );
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    /* Tentatively use zExp - 1 for the case where the sum stays below bit
       49; otherwise restore zExp and shift the sum right by one. */
    --zExp;
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the quadruple-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    /* Both fractions are pre-shifted left by 14 bits; the matching `- 14' is
       applied to the exponent handed to normalizeRoundAndPackFloat128() at
       the end.  After this shift the leading-bit position is bit 62
       (0x4000000000000000) of the high word. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponent fields from here on. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Both fractions are zero with exponent field 0x7FFF: raise invalid
           and return the default NaN. */
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* Both exponent fields are 0 here; use 1 for the result exponent
           computation. */
        aExp = 1;
        bExp = 1;
    }
    /* Pick the larger significand by comparing high words, then low words. */
    if ( bSig0 < aSig0 ) goto aBigger;
    if ( aSig0 < bSig0 ) goto bBigger;
    if ( bSig1 < aSig1 ) goto aBigger;
    if ( aSig1 < bSig1 ) goto bBigger;
    /* Identical magnitudes: the result is a zero whose sign bit is set only
       when the rounding mode is float_round_down. */
    return packFloat128(status->float_rounding_mode == float_round_down,
                        0, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* `b' dominates, so the result carries the opposite of zSign. */
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        ++expDiff;
    }
    else {
        aSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
    bSig0 |= LIT64( 0x4000000000000000 );
 bBigger:
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zExp = bExp;
    /* |b| > |a|: the difference takes the opposite sign. */
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
    aSig0 |= LIT64( 0x4000000000000000 );
 aBigger:
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    /* `- 14' undoes the 14-bit pre-shift applied to both significands. */
    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
                                         status);

}

/*----------------------------------------------------------------------------
| Returns the result of adding the quadruple-precision floating-point values
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
6758 *----------------------------------------------------------------------------*/ 6759 6760 float128 float128_add(float128 a, float128 b, float_status *status) 6761 { 6762 flag aSign, bSign; 6763 6764 aSign = extractFloat128Sign( a ); 6765 bSign = extractFloat128Sign( b ); 6766 if ( aSign == bSign ) { 6767 return addFloat128Sigs(a, b, aSign, status); 6768 } 6769 else { 6770 return subFloat128Sigs(a, b, aSign, status); 6771 } 6772 6773 } 6774 6775 /*---------------------------------------------------------------------------- 6776 | Returns the result of subtracting the quadruple-precision floating-point 6777 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6778 | Standard for Binary Floating-Point Arithmetic. 6779 *----------------------------------------------------------------------------*/ 6780 6781 float128 float128_sub(float128 a, float128 b, float_status *status) 6782 { 6783 flag aSign, bSign; 6784 6785 aSign = extractFloat128Sign( a ); 6786 bSign = extractFloat128Sign( b ); 6787 if ( aSign == bSign ) { 6788 return subFloat128Sigs(a, b, aSign, status); 6789 } 6790 else { 6791 return addFloat128Sigs(a, b, aSign, status); 6792 } 6793 6794 } 6795 6796 /*---------------------------------------------------------------------------- 6797 | Returns the result of multiplying the quadruple-precision floating-point 6798 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6799 | Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float128 float128_mul(float128 a, float128 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    /* The sign of the product is the XOR of the operand signs. */
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* `a' is a NaN, or `a' has a zero fraction and `b' is a NaN. */
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* `a' has exponent 0x7FFF with zero fraction and `b' is zero:
           invalid operation. */
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
            /* Also reached via the goto in the `aExp == 0x7FFF' case. */
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        /* Zero operand gives a signed zero; otherwise normalize the
           subnormal operand. */
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    zExp = aExp + bExp - 0x4000;
    /* `a' gets its leading bit at bit 48.  `b' is shifted left by 16 bits
       without a leading bit being OR-ed in; the add128() of `a''s
       significand into the upper two product words below appears to supply
       the contribution of that missing bit (it would sit just above bit 63
       of bSig0). */
    aSig0 |= LIT64( 0x0001000000000000 );
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
    /* Fold the lowest product word into the low bit of zSig2 as a sticky
       bit. */
    zSig2 |= ( zSig3 != 0 );
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
        /* The product reached bit 49: shift right by one and bump the
           exponent. */
        shift128ExtraRightJamming(
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
        ++zExp;
    }
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'.  The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_div(float128 a, float128 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    /* The sign of the quotient is the XOR of the operand signs. */
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* Both operands have exponent 0x7FFF and zero fractions. */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Divisor has exponent 0x7FFF with zero fraction: signed zero. */
        return packFloat128( zSign, 0, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* Divisor is zero. */
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
                /* Zero divided by zero, or reached via the goto above. */
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* OR in the leading bit (bit 48) of each significand and shift both left
       by 15 bits, which moves that bit to bit 63 of the high word. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        /* Dividend significand >= divisor significand: halve the dividend
           and compensate in the exponent. */
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* Upper 64 quotient bits: take an estimate, compute the remainder, and
       step the estimate down while that remainder is negative. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Lower 64 quotient bits, estimated from the remainder. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        /* Only when the low 14 bits of the estimate are <= 4 is the estimate
           corrected exactly and a sticky bit derived from the remainder. */
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Shift the 128-bit quotient right by 15; the bits shifted out are
       collected in zSig2 for rounding. */
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_rem(float128 a, float128 b, float_status *status)
{
    flag aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        /* `a' is a NaN, or `a' has a zero fraction and `b' is a NaN. */
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* `a' has exponent 0x7FFF with a zero fraction: invalid. */
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* `b' has exponent 0x7FFF with a zero fraction: `a' is returned
           unchanged. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
 invalid:
            /* `b' is zero, or reached via the goto above. */
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* `a' is returned unchanged when its exponent is more than one below
       that of `b'. */
    if ( expDiff < -1 ) return a;
    /* OR in the leading bits and left-align both significands.  `a' is
       shifted by one bit less when expDiff is -1. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    /* First quotient bit: 1 if b's significand fits into a's. */
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /* Reduce `a' in steps of 61 exponent bits while more than 64 bits of
       exponent difference remain.  Each quotient estimate is lowered by 4
       (clamped at 0) before use. */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /* Final partial step: the estimate is scaled down by the remaining
           (negative) exponent difference before being applied. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        /* No further quotient bits; q keeps the value assigned before the
           loop.  Both significands are shifted right by 12 bits. */
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Subtract b until the running value turns negative, remembering the
       last value before each subtraction. */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* sigMean is the sum of the negative value and the last non-negative
       one.  The non-negative candidate is chosen when that sum is negative,
       or when it is zero and q is odd. */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if (    ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    /* A negative remainder is negated and flips the result sign relative to
       the sign of `a'. */
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}

/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_sqrt(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        /* Exponent 0x7FFF with zero fraction: returned as is when the sign
           is clear, invalid when the sign is set. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* A negative zero is returned unchanged; any other negative input is
           invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Result exponent: half of the unbiased exponent, re-biased. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= LIT64( 0x0001000000000000 );
    /* Initial estimate from the top bits of the significand, then refined
       with one division step. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    /* The left shift is one bit shorter when the exponent field is odd. */
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Remainder a - zSig0^2; step zSig0 down while it is negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Lower 64 result bits, estimated from the remainder. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        /* Only when the low 13 bits of the estimate are <= 5 is the estimate
           corrected exactly and a sticky bit derived from the remainder. */
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Shift the 128-bit root right by 14; the bits shifted out are collected
       in zSig2 for rounding.  The result sign is always 0. */
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
| the corresponding value `b', and 0 otherwise.  The invalid exception is
| raised if either operand is a NaN.  Otherwise, the comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int float128_eq(float128 a, float128 b, float_status *status)
{

    /* NaN test: exponent field 0x7FFF together with a non-zero fraction. */
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
       ) {
        float_raise(float_flag_invalid, status);
        return 0;
    }
    /* Equal when the encodings match, or when both operands are zeros: low
       words zero and the high words zero once the sign bit is shifted
       out. */
    return
           ( a.low == b.low )
        && (    ( a.high == b.high )
             || (    ( a.low == 0 )
                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
           );

}

/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is less than
| or equal to the corresponding value `b', and 0 otherwise.  The invalid
| exception is raised if either operand is a NaN.  The comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7153 *----------------------------------------------------------------------------*/ 7154 7155 int float128_le(float128 a, float128 b, float_status *status) 7156 { 7157 flag aSign, bSign; 7158 7159 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7160 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7161 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7162 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7163 ) { 7164 float_raise(float_flag_invalid, status); 7165 return 0; 7166 } 7167 aSign = extractFloat128Sign( a ); 7168 bSign = extractFloat128Sign( b ); 7169 if ( aSign != bSign ) { 7170 return 7171 aSign 7172 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7173 == 0 ); 7174 } 7175 return 7176 aSign ? le128( b.high, b.low, a.high, a.low ) 7177 : le128( a.high, a.low, b.high, b.low ); 7178 7179 } 7180 7181 /*---------------------------------------------------------------------------- 7182 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7183 | the corresponding value `b', and 0 otherwise. The invalid exception is 7184 | raised if either operand is a NaN. The comparison is performed according 7185 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7186 *----------------------------------------------------------------------------*/ 7187 7188 int float128_lt(float128 a, float128 b, float_status *status) 7189 { 7190 flag aSign, bSign; 7191 7192 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7193 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7194 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7195 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7196 ) { 7197 float_raise(float_flag_invalid, status); 7198 return 0; 7199 } 7200 aSign = extractFloat128Sign( a ); 7201 bSign = extractFloat128Sign( b ); 7202 if ( aSign != bSign ) { 7203 return 7204 aSign 7205 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7206 != 0 ); 7207 } 7208 return 7209 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 7210 : lt128( a.high, a.low, b.high, b.low ); 7211 7212 } 7213 7214 /*---------------------------------------------------------------------------- 7215 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7216 | be compared, and 0 otherwise. The invalid exception is raised if either 7217 | operand is a NaN. The comparison is performed according to the IEC/IEEE 7218 | Standard for Binary Floating-Point Arithmetic. 7219 *----------------------------------------------------------------------------*/ 7220 7221 int float128_unordered(float128 a, float128 b, float_status *status) 7222 { 7223 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7224 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7225 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7226 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7227 ) { 7228 float_raise(float_flag_invalid, status); 7229 return 1; 7230 } 7231 return 0; 7232 } 7233 7234 /*---------------------------------------------------------------------------- 7235 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 7236 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7237 | exception. The comparison is performed according to the IEC/IEEE Standard 7238 | for Binary Floating-Point Arithmetic. 
7239 *----------------------------------------------------------------------------*/ 7240 7241 int float128_eq_quiet(float128 a, float128 b, float_status *status) 7242 { 7243 7244 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7245 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7246 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7247 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7248 ) { 7249 if (float128_is_signaling_nan(a, status) 7250 || float128_is_signaling_nan(b, status)) { 7251 float_raise(float_flag_invalid, status); 7252 } 7253 return 0; 7254 } 7255 return 7256 ( a.low == b.low ) 7257 && ( ( a.high == b.high ) 7258 || ( ( a.low == 0 ) 7259 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 7260 ); 7261 7262 } 7263 7264 /*---------------------------------------------------------------------------- 7265 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7266 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 7267 | cause an exception. Otherwise, the comparison is performed according to the 7268 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7269 *----------------------------------------------------------------------------*/ 7270 7271 int float128_le_quiet(float128 a, float128 b, float_status *status) 7272 { 7273 flag aSign, bSign; 7274 7275 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7276 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7277 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7278 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7279 ) { 7280 if (float128_is_signaling_nan(a, status) 7281 || float128_is_signaling_nan(b, status)) { 7282 float_raise(float_flag_invalid, status); 7283 } 7284 return 0; 7285 } 7286 aSign = extractFloat128Sign( a ); 7287 bSign = extractFloat128Sign( b ); 7288 if ( aSign != bSign ) { 7289 return 7290 aSign 7291 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7292 == 0 ); 7293 } 7294 return 7295 aSign ? le128( b.high, b.low, a.high, a.low ) 7296 : le128( a.high, a.low, b.high, b.low ); 7297 7298 } 7299 7300 /*---------------------------------------------------------------------------- 7301 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7302 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7303 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 7304 | Standard for Binary Floating-Point Arithmetic. 
7305 *----------------------------------------------------------------------------*/ 7306 7307 int float128_lt_quiet(float128 a, float128 b, float_status *status) 7308 { 7309 flag aSign, bSign; 7310 7311 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7312 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7313 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7314 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7315 ) { 7316 if (float128_is_signaling_nan(a, status) 7317 || float128_is_signaling_nan(b, status)) { 7318 float_raise(float_flag_invalid, status); 7319 } 7320 return 0; 7321 } 7322 aSign = extractFloat128Sign( a ); 7323 bSign = extractFloat128Sign( b ); 7324 if ( aSign != bSign ) { 7325 return 7326 aSign 7327 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7328 != 0 ); 7329 } 7330 return 7331 aSign ? lt128( b.high, b.low, a.high, a.low ) 7332 : lt128( a.high, a.low, b.high, b.low ); 7333 7334 } 7335 7336 /*---------------------------------------------------------------------------- 7337 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7338 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 7339 | comparison is performed according to the IEC/IEEE Standard for Binary 7340 | Floating-Point Arithmetic. 
7341 *----------------------------------------------------------------------------*/ 7342 7343 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 7344 { 7345 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7346 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7347 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7348 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7349 ) { 7350 if (float128_is_signaling_nan(a, status) 7351 || float128_is_signaling_nan(b, status)) { 7352 float_raise(float_flag_invalid, status); 7353 } 7354 return 1; 7355 } 7356 return 0; 7357 } 7358 7359 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 7360 int is_quiet, float_status *status) 7361 { 7362 flag aSign, bSign; 7363 7364 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 7365 float_raise(float_flag_invalid, status); 7366 return float_relation_unordered; 7367 } 7368 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 7369 ( extractFloatx80Frac( a )<<1 ) ) || 7370 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 7371 ( extractFloatx80Frac( b )<<1 ) )) { 7372 if (!is_quiet || 7373 floatx80_is_signaling_nan(a, status) || 7374 floatx80_is_signaling_nan(b, status)) { 7375 float_raise(float_flag_invalid, status); 7376 } 7377 return float_relation_unordered; 7378 } 7379 aSign = extractFloatx80Sign( a ); 7380 bSign = extractFloatx80Sign( b ); 7381 if ( aSign != bSign ) { 7382 7383 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 7384 ( ( a.low | b.low ) == 0 ) ) { 7385 /* zero case */ 7386 return float_relation_equal; 7387 } else { 7388 return 1 - (2 * aSign); 7389 } 7390 } else { 7391 if (a.low == b.low && a.high == b.high) { 7392 return float_relation_equal; 7393 } else { 7394 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7395 } 7396 } 7397 } 7398 7399 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7400 { 7401 return floatx80_compare_internal(a, b, 0, status); 7402 } 7403 
7404 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 7405 { 7406 return floatx80_compare_internal(a, b, 1, status); 7407 } 7408 7409 static inline int float128_compare_internal(float128 a, float128 b, 7410 int is_quiet, float_status *status) 7411 { 7412 flag aSign, bSign; 7413 7414 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7415 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7416 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7417 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7418 if (!is_quiet || 7419 float128_is_signaling_nan(a, status) || 7420 float128_is_signaling_nan(b, status)) { 7421 float_raise(float_flag_invalid, status); 7422 } 7423 return float_relation_unordered; 7424 } 7425 aSign = extractFloat128Sign( a ); 7426 bSign = extractFloat128Sign( b ); 7427 if ( aSign != bSign ) { 7428 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7429 /* zero case */ 7430 return float_relation_equal; 7431 } else { 7432 return 1 - (2 * aSign); 7433 } 7434 } else { 7435 if (a.low == b.low && a.high == b.high) { 7436 return float_relation_equal; 7437 } else { 7438 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7439 } 7440 } 7441 } 7442 7443 int float128_compare(float128 a, float128 b, float_status *status) 7444 { 7445 return float128_compare_internal(a, b, 0, status); 7446 } 7447 7448 int float128_compare_quiet(float128 a, float128 b, float_status *status) 7449 { 7450 return float128_compare_internal(a, b, 1, status); 7451 } 7452 7453 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7454 { 7455 flag aSign; 7456 int32_t aExp; 7457 uint64_t aSig; 7458 7459 if (floatx80_invalid_encoding(a)) { 7460 float_raise(float_flag_invalid, status); 7461 return floatx80_default_nan(status); 7462 } 7463 aSig = extractFloatx80Frac( a ); 7464 aExp = extractFloatx80Exp( a ); 7465 aSign = extractFloatx80Sign( a ); 7466 7467 if ( aExp == 0x7FFF ) { 7468 if ( aSig<<1 ) { 7469 return 
propagateFloatx80NaN(a, a, status); 7470 } 7471 return a; 7472 } 7473 7474 if (aExp == 0) { 7475 if (aSig == 0) { 7476 return a; 7477 } 7478 aExp++; 7479 } 7480 7481 if (n > 0x10000) { 7482 n = 0x10000; 7483 } else if (n < -0x10000) { 7484 n = -0x10000; 7485 } 7486 7487 aExp += n; 7488 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7489 aSign, aExp, aSig, 0, status); 7490 } 7491 7492 float128 float128_scalbn(float128 a, int n, float_status *status) 7493 { 7494 flag aSign; 7495 int32_t aExp; 7496 uint64_t aSig0, aSig1; 7497 7498 aSig1 = extractFloat128Frac1( a ); 7499 aSig0 = extractFloat128Frac0( a ); 7500 aExp = extractFloat128Exp( a ); 7501 aSign = extractFloat128Sign( a ); 7502 if ( aExp == 0x7FFF ) { 7503 if ( aSig0 | aSig1 ) { 7504 return propagateFloat128NaN(a, a, status); 7505 } 7506 return a; 7507 } 7508 if (aExp != 0) { 7509 aSig0 |= LIT64( 0x0001000000000000 ); 7510 } else if (aSig0 == 0 && aSig1 == 0) { 7511 return a; 7512 } else { 7513 aExp++; 7514 } 7515 7516 if (n > 0x10000) { 7517 n = 0x10000; 7518 } else if (n < -0x10000) { 7519 n = -0x10000; 7520 } 7521 7522 aExp += n - 1; 7523 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7524 , status); 7525 7526 } 7527