xref: /qemu/fpu/softfloat.c (revision 1b615d482094e0123d187f0ad3c676ba8eb9d0a3)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89 
90 /* We only need stdlib for abort() */
91 
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations.  (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98 
99 /*
100  * Hardfloat
101  *
102  * Fast emulation of guest FP instructions is challenging for two reasons.
103  * First, FP instruction semantics are similar but not identical, particularly
104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
105  * exception flags is not trivial: reading the host's flags register with a
106  * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
 * and trapping on every FP exception is neither fast nor pleasant to work with.
108  *
109  * We address these challenges by leveraging the host FPU for a subset of the
110  * operations. To do this we expand on the idea presented in this paper:
111  *
112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114  *
115  * The idea is thus to leverage the host FPU to (1) compute FP operations
116  * and (2) identify whether FP exceptions occurred while avoiding
117  * expensive exception flag register accesses.
118  *
119  * An important optimization shown in the paper is that given that exception
120  * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121  * This is particularly useful for the inexact flag, which is very frequently
122  * raised in floating-point workloads.
123  *
124  * We optimize the code further by deferring to soft-fp whenever FP exception
125  * detection might get hairy. Two examples: (1) when at least one operand is
126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127  * and the result is < the minimum normal.
128  */
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
130     static inline void name(soft_t *a, float_status *s)                 \
131     {                                                                   \
132         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
133             *a = soft_t ## _set_sign(soft_t ## _zero,                   \
134                                      soft_t ## _is_neg(*a));            \
135             s->float_exception_flags |= float_flag_input_denormal;      \
136         }                                                               \
137     }
138 
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
141 #undef GEN_INPUT_FLUSH__NOCHECK
142 
143 #define GEN_INPUT_FLUSH1(name, soft_t)                  \
144     static inline void name(soft_t *a, float_status *s) \
145     {                                                   \
146         if (likely(!s->flush_inputs_to_zero)) {         \
147             return;                                     \
148         }                                               \
149         soft_t ## _input_flush__nocheck(a, s);          \
150     }
151 
152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
154 #undef GEN_INPUT_FLUSH1
155 
156 #define GEN_INPUT_FLUSH2(name, soft_t)                                  \
157     static inline void name(soft_t *a, soft_t *b, float_status *s)      \
158     {                                                                   \
159         if (likely(!s->flush_inputs_to_zero)) {                         \
160             return;                                                     \
161         }                                                               \
162         soft_t ## _input_flush__nocheck(a, s);                          \
163         soft_t ## _input_flush__nocheck(b, s);                          \
164     }
165 
166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
168 #undef GEN_INPUT_FLUSH2
169 
170 #define GEN_INPUT_FLUSH3(name, soft_t)                                  \
171     static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
172     {                                                                   \
173         if (likely(!s->flush_inputs_to_zero)) {                         \
174             return;                                                     \
175         }                                                               \
176         soft_t ## _input_flush__nocheck(a, s);                          \
177         soft_t ## _input_flush__nocheck(b, s);                          \
178         soft_t ## _input_flush__nocheck(c, s);                          \
179     }
180 
181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
183 #undef GEN_INPUT_FLUSH3
184 
/*
 * Select, per (number of inputs, float size) pair, how the generated
 * hardfloat functions classify their operands: 1 picks fpclassify(),
 * 0 picks the float32/64_* primitives.
 *
 * The float32 variants use the primitives on every host; only the float64
 * variants differ, and only on x86_64.
 */
#define QEMU_HARDFLOAT_1F32_USE_FP 0
#define QEMU_HARDFLOAT_2F32_USE_FP 0
#define QEMU_HARDFLOAT_3F32_USE_FP 0

#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif
205 
/*
 * QEMU_HARDFLOAT_USE_ISINF selects isinf() (1) or
 * float{32,64}_is_infinity (0) for the infinity test when !USE_FP.
 * isinf() is worth a ~6% speedup on x86_64/aarch64, but on power64 it
 * reduces fp-bench performance by up to 50%, so it is enabled only on
 * the two hosts where it helps.
 */
#if !defined(__x86_64__) && !defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF   0
#else
# define QEMU_HARDFLOAT_USE_ISINF   1
#endif
217 
/*
 * Hardfloat relies on the inexact flag being already set, so it cannot be
 * used on targets that clear the FP flags before most FP operations. It is
 * also disabled (with a build warning) under -ffast-math.
 *
 * QEMU_NO_HARDFLOAT is 1 when hardfloat is disabled; QEMU_SOFTFLOAT_ATTR
 * additionally carries noinline only when hardfloat is enabled.
 */
#if defined(__FAST_MATH__)
# warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
#endif

#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif
234 
235 static inline bool can_use_fpu(const float_status *s)
236 {
237     if (QEMU_NO_HARDFLOAT) {
238         return false;
239     }
240     return likely(s->float_exception_flags & float_flag_inexact &&
241                   s->float_rounding_mode == float_round_nearest_even);
242 }
243 
244 /*
245  * Hardfloat generation functions. Each operation can have two flavors:
246  * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247  * most condition checks, or native ones (e.g. fpclassify).
248  *
249  * The flavor is chosen by the callers. Instead of using macros, we rely on the
250  * compiler to propagate constants and inline everything into the callers.
251  *
252  * We only generate functions for operations with two inputs, since only
253  * these are common enough to justify consolidating them into common code.
254  */
255 
/*
 * View of one single-precision value both as the softfloat type and as the
 * host's native float, so the same bits can be handed to either
 * implementation.
 */
typedef union {
    float32 s;  /* softfloat representation */
    float h;    /* host (native) representation */
} union_float32;
260 
/*
 * View of one double-precision value both as the softfloat type and as the
 * host's native double, so the same bits can be handed to either
 * implementation.
 */
typedef union {
    float64 s;  /* softfloat representation */
    double h;   /* host (native) representation */
} union_float64;
265 
/* Predicates over the two operands, used as pre/post/fast_test hooks. */
typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

/* Softfloat 2-input operations; these receive the float_status. */
typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
/* Host 2-input operations on native types; no float_status is passed. */
typedef float   (*hard_f32_op2_fn)(float a, float b);
typedef double  (*hard_f64_op2_fn)(double a, double b);
273 
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
278         /*
279          * Not using a temp variable for consecutive fpclassify calls ends up
280          * generating faster code.
281          */
282         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284     }
285     return float32_is_zero_or_normal(a.s) &&
286            float32_is_zero_or_normal(b.s);
287 }
288 
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291     if (QEMU_HARDFLOAT_2F64_USE_FP) {
292         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294     }
295     return float64_is_zero_or_normal(a.s) &&
296            float64_is_zero_or_normal(b.s);
297 }
298 
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303     if (QEMU_HARDFLOAT_3F32_USE_FP) {
304         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307     }
308     return float32_is_zero_or_normal(a.s) &&
309            float32_is_zero_or_normal(b.s) &&
310            float32_is_zero_or_normal(c.s);
311 }
312 
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316     if (QEMU_HARDFLOAT_3F64_USE_FP) {
317         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320     }
321     return float64_is_zero_or_normal(a.s) &&
322            float64_is_zero_or_normal(b.s) &&
323            float64_is_zero_or_normal(c.s);
324 }
325 
326 static inline bool f32_is_inf(union_float32 a)
327 {
328     if (QEMU_HARDFLOAT_USE_ISINF) {
329         return isinf(a.h);
330     }
331     return float32_is_infinity(a.s);
332 }
333 
334 static inline bool f64_is_inf(union_float64 a)
335 {
336     if (QEMU_HARDFLOAT_USE_ISINF) {
337         return isinf(a.h);
338     }
339     return float64_is_infinity(a.s);
340 }
341 
342 /* Note: @fast_test and @post can be NULL */
343 static inline float32
344 float32_gen2(float32 xa, float32 xb, float_status *s,
345              hard_f32_op2_fn hard, soft_f32_op2_fn soft,
346              f32_check_fn pre, f32_check_fn post,
347              f32_check_fn fast_test, soft_f32_op2_fn fast_op)
348 {
349     union_float32 ua, ub, ur;
350 
351     ua.s = xa;
352     ub.s = xb;
353 
354     if (unlikely(!can_use_fpu(s))) {
355         goto soft;
356     }
357 
358     float32_input_flush2(&ua.s, &ub.s, s);
359     if (unlikely(!pre(ua, ub))) {
360         goto soft;
361     }
362     if (fast_test && fast_test(ua, ub)) {
363         return fast_op(ua.s, ub.s, s);
364     }
365 
366     ur.h = hard(ua.h, ub.h);
367     if (unlikely(f32_is_inf(ur))) {
368         s->float_exception_flags |= float_flag_overflow;
369     } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
370         if (post == NULL || post(ua, ub)) {
371             goto soft;
372         }
373     }
374     return ur.s;
375 
376  soft:
377     return soft(ua.s, ub.s, s);
378 }
379 
380 static inline float64
381 float64_gen2(float64 xa, float64 xb, float_status *s,
382              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
383              f64_check_fn pre, f64_check_fn post,
384              f64_check_fn fast_test, soft_f64_op2_fn fast_op)
385 {
386     union_float64 ua, ub, ur;
387 
388     ua.s = xa;
389     ub.s = xb;
390 
391     if (unlikely(!can_use_fpu(s))) {
392         goto soft;
393     }
394 
395     float64_input_flush2(&ua.s, &ub.s, s);
396     if (unlikely(!pre(ua, ub))) {
397         goto soft;
398     }
399     if (fast_test && fast_test(ua, ub)) {
400         return fast_op(ua.s, ub.s, s);
401     }
402 
403     ur.h = hard(ua.h, ub.h);
404     if (unlikely(f64_is_inf(ur))) {
405         s->float_exception_flags |= float_flag_overflow;
406     } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
407         if (post == NULL || post(ua, ub)) {
408             goto soft;
409         }
410     }
411     return ur.s;
412 
413  soft:
414     return soft(ua.s, ub.s, s);
415 }
416 
417 /*----------------------------------------------------------------------------
418 | Returns the fraction bits of the half-precision floating-point value `a'.
419 *----------------------------------------------------------------------------*/
420 
421 static inline uint32_t extractFloat16Frac(float16 a)
422 {
423     return float16_val(a) & 0x3ff;
424 }
425 
426 /*----------------------------------------------------------------------------
427 | Returns the exponent bits of the half-precision floating-point value `a'.
428 *----------------------------------------------------------------------------*/
429 
430 static inline int extractFloat16Exp(float16 a)
431 {
432     return (float16_val(a) >> 10) & 0x1f;
433 }
434 
435 /*----------------------------------------------------------------------------
436 | Returns the fraction bits of the single-precision floating-point value `a'.
437 *----------------------------------------------------------------------------*/
438 
439 static inline uint32_t extractFloat32Frac(float32 a)
440 {
441     return float32_val(a) & 0x007FFFFF;
442 }
443 
444 /*----------------------------------------------------------------------------
445 | Returns the exponent bits of the single-precision floating-point value `a'.
446 *----------------------------------------------------------------------------*/
447 
448 static inline int extractFloat32Exp(float32 a)
449 {
450     return (float32_val(a) >> 23) & 0xFF;
451 }
452 
453 /*----------------------------------------------------------------------------
454 | Returns the sign bit of the single-precision floating-point value `a'.
455 *----------------------------------------------------------------------------*/
456 
457 static inline flag extractFloat32Sign(float32 a)
458 {
459     return float32_val(a) >> 31;
460 }
461 
462 /*----------------------------------------------------------------------------
463 | Returns the fraction bits of the double-precision floating-point value `a'.
464 *----------------------------------------------------------------------------*/
465 
466 static inline uint64_t extractFloat64Frac(float64 a)
467 {
468     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
469 }
470 
471 /*----------------------------------------------------------------------------
472 | Returns the exponent bits of the double-precision floating-point value `a'.
473 *----------------------------------------------------------------------------*/
474 
475 static inline int extractFloat64Exp(float64 a)
476 {
477     return (float64_val(a) >> 52) & 0x7FF;
478 }
479 
480 /*----------------------------------------------------------------------------
481 | Returns the sign bit of the double-precision floating-point value `a'.
482 *----------------------------------------------------------------------------*/
483 
484 static inline flag extractFloat64Sign(float64 a)
485 {
486     return float64_val(a) >> 63;
487 }
488 
/*
 * Classification of a floating point number. float_class_qnan and every
 * enumerator after it denote a NaN, so cls >= float_class_qnan matches any
 * NaN. The numeric order of the enumerators is therefore significant.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified = 0,
    float_class_zero         = 1,
    float_class_normal       = 2,
    float_class_inf          = 3,
    float_class_qnan         = 4,  /* all NaNs from here */
    float_class_snan         = 5,
} FloatClass;
502 
503 /* Simple helpers for checking if, or what kind of, NaN we have */
504 static inline __attribute__((unused)) bool is_nan(FloatClass c)
505 {
506     return unlikely(c >= float_class_qnan);
507 }
508 
509 static inline __attribute__((unused)) bool is_snan(FloatClass c)
510 {
511     return c == float_class_snan;
512 }
513 
514 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
515 {
516     return c == float_class_qnan;
517 }
518 
/*
 * Structure holding all of the decomposed parts of a float. The
 * exponent is unbiased and the fraction is normalized. All
 * calculations are done with a 64 bit fraction and then rounded as
 * appropriate for the final format.
 *
 * Thanks to the packed FloatClass a decent compiler should be able to
 * fit the whole structure into registers and avoid using the stack
 * for parameter passing.
 *
 * Note that the raw pack/unpack helpers also use this structure to carry
 * the unmodified exponent and fraction fields, with cls left as
 * float_class_unclassified.
 */

typedef struct {
    uint64_t frac;      /* fraction bits */
    int32_t  exp;       /* exponent */
    FloatClass cls;     /* zero / normal / inf / NaN classification */
    bool sign;          /* sign bit as extracted from the raw value */
} FloatParts;
536 
/*
 * Layout of the 64-bit decomposed fraction: the binary point sits two bits
 * below the top (bit 62), the implicit bit occupies that position and the
 * bit above it (bit 63) catches a carry out of the implicit bit.
 */
#define DECOMPOSED_BINARY_POINT    62
#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
#define DECOMPOSED_OVERFLOW_BIT    (1ull << (DECOMPOSED_BINARY_POINT + 1))
540 
/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;        /* all bits below frac_lsb */
    uint64_t roundeven_mask;    /* frac_lsb plus all bits below it */
    bool arm_althp;
} FloatFmt;
566 
/*
 * Expand the FloatFmt designated initializers for a format with E exponent
 * bits and F fraction bits.
 *
 * Both arguments are parenthesized at every use so that an expression
 * argument (e.g. FLOAT_PARAMS(5, 5 + 5)) yields the same fields as its
 * evaluated value would; they are still expanded several times, so they
 * must be free of side effects.
 */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size       = (E),                                           \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                         \
    .exp_max        = (1 << (E)) - 1,                                \
    .frac_size      = (F),                                           \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                 \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),       \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1), \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1, \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1
578 
/* Half precision: 5 exponent bits, 10 fraction bits. */
static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

/* Same field sizes as float16_params, with the arm_althp modifier set. */
static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

/* Single precision: 8 exponent bits, 23 fraction bits. */
static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

/* Double precision: 11 exponent bits, 52 fraction bits. */
static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};
595 
596 /* Unpack a float to parts, but do not canonicalize.  */
597 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
598 {
599     const int sign_pos = fmt.frac_size + fmt.exp_size;
600 
601     return (FloatParts) {
602         .cls = float_class_unclassified,
603         .sign = extract64(raw, sign_pos, 1),
604         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
605         .frac = extract64(raw, 0, fmt.frac_size),
606     };
607 }
608 
609 static inline FloatParts float16_unpack_raw(float16 f)
610 {
611     return unpack_raw(float16_params, f);
612 }
613 
614 static inline FloatParts float32_unpack_raw(float32 f)
615 {
616     return unpack_raw(float32_params, f);
617 }
618 
619 static inline FloatParts float64_unpack_raw(float64 f)
620 {
621     return unpack_raw(float64_params, f);
622 }
623 
624 /* Pack a float from parts, but do not canonicalize.  */
625 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
626 {
627     const int sign_pos = fmt.frac_size + fmt.exp_size;
628     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
629     return deposit64(ret, sign_pos, 1, p.sign);
630 }
631 
632 static inline float16 float16_pack_raw(FloatParts p)
633 {
634     return make_float16(pack_raw(float16_params, p));
635 }
636 
637 static inline float32 float32_pack_raw(FloatParts p)
638 {
639     return make_float32(pack_raw(float32_params, p));
640 }
641 
642 static inline float64 float64_pack_raw(FloatParts p)
643 {
644     return make_float64(pack_raw(float64_params, p));
645 }
646 
647 /*----------------------------------------------------------------------------
648 | Functions and definitions to determine:  (1) whether tininess for underflow
649 | is detected before or after rounding by default, (2) what (if anything)
650 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
651 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
652 | are propagated from function inputs to output.  These details are target-
653 | specific.
654 *----------------------------------------------------------------------------*/
655 #include "softfloat-specialize.h"
656 
657 /* Canonicalize EXP and FRAC, setting CLS.  */
658 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
659                                   float_status *status)
660 {
661     if (part.exp == parm->exp_max && !parm->arm_althp) {
662         if (part.frac == 0) {
663             part.cls = float_class_inf;
664         } else {
665             part.frac <<= parm->frac_shift;
666             part.cls = (parts_is_snan_frac(part.frac, status)
667                         ? float_class_snan : float_class_qnan);
668         }
669     } else if (part.exp == 0) {
670         if (likely(part.frac == 0)) {
671             part.cls = float_class_zero;
672         } else if (status->flush_inputs_to_zero) {
673             float_raise(float_flag_input_denormal, status);
674             part.cls = float_class_zero;
675             part.frac = 0;
676         } else {
677             int shift = clz64(part.frac) - 1;
678             part.cls = float_class_normal;
679             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
680             part.frac <<= shift;
681         }
682     } else {
683         part.cls = float_class_normal;
684         part.exp -= parm->exp_bias;
685         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
686     }
687     return part;
688 }
689 
/* Round and uncanonicalize a floating-point number by parts. There
 * are FRAC_SHIFT bits that may require rounding at the bottom of the
 * fraction; these bits will be removed. The exponent will be biased
 * by EXP_BIAS and must be bounded by [0, EXP_MAX-1].
 *
 * @p:    canonical parts (cls set, unbiased exponent, normalized fraction)
 * @s:    supplies the rounding mode, flush_to_zero and tininess detection,
 *        and receives the accumulated exception flags via float_raise()
 * @parm: parameters of the destination format
 *
 * Returns parts whose exp and frac hold the biased exponent field and the
 * fraction shifted down by frac_shift, ready for raw packing.
 */

static FloatParts round_canonical(FloatParts p, float_status *s,
                                  const FloatFmt *parm)
{
    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
    const uint64_t round_mask = parm->round_mask;
    const uint64_t roundeven_mask = parm->roundeven_mask;
    const int exp_max = parm->exp_max;
    const int frac_shift = parm->frac_shift;
    uint64_t frac, inc;
    /* flags collects exceptions locally; they are raised once at the end. */
    int exp, flags = 0;
    bool overflow_norm;

    frac = p.frac;
    exp = p.exp;

    switch (p.cls) {
    case float_class_normal:
        /*
         * Per rounding mode, pick:
         *   inc           - the amount added to frac to perform the rounding
         *   overflow_norm - on overflow, saturate to the largest finite
         *                   value (true) or produce an infinity (false)
         */
        switch (s->float_rounding_mode) {
        case float_round_nearest_even:
            overflow_norm = false;
            /*
             * Add half an LSB, except on an exact tie with an even LSB
             * (LSB clear, discarded bits == frac_lsbm1), which stays put.
             */
            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
            break;
        case float_round_ties_away:
            overflow_norm = false;
            inc = frac_lsbm1;
            break;
        case float_round_to_zero:
            overflow_norm = true;
            inc = 0;
            break;
        case float_round_up:
            inc = p.sign ? 0 : round_mask;
            overflow_norm = p.sign;
            break;
        case float_round_down:
            inc = p.sign ? round_mask : 0;
            overflow_norm = !p.sign;
            break;
        default:
            g_assert_not_reached();
        }

        exp += parm->exp_bias;
        if (likely(exp > 0)) {
            /* Biased exponent is positive: the result is a normal number. */
            if (frac & round_mask) {
                flags |= float_flag_inexact;
                frac += inc;
                /* Rounding carried past the implicit bit: renormalize. */
                if (frac & DECOMPOSED_OVERFLOW_BIT) {
                    frac >>= 1;
                    exp++;
                }
            }
            frac >>= frac_shift;

            if (parm->arm_althp) {
                /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
                if (unlikely(exp > exp_max)) {
                    /* Overflow.  Return the maximum normal.  */
                    flags = float_flag_invalid;
                    exp = exp_max;
                    frac = -1;
                }
            } else if (unlikely(exp >= exp_max)) {
                /* exp_max is reserved for Inf/NaN, so this is an overflow. */
                flags |= float_flag_overflow | float_flag_inexact;
                if (overflow_norm) {
                    exp = exp_max - 1;
                    frac = -1;
                } else {
                    p.cls = float_class_inf;
                    goto do_inf;
                }
            }
        } else if (s->flush_to_zero) {
            /* Result would be denormal and output flushing is enabled. */
            flags |= float_flag_output_denormal;
            p.cls = float_class_zero;
            goto do_zero;
        } else {
            /*
             * Denormal result. It counts as tiny when tininess is detected
             * before rounding, when the biased exponent is negative, or
             * when adding inc does not carry into the overflow bit.
             */
            bool is_tiny = (s->float_detect_tininess
                            == float_tininess_before_rounding)
                        || (exp < 0)
                        || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);

            /*
             * Shift the fraction right by 1 - exp places.
             * NOTE(review): the helper is defined outside this file;
             * presumably shifted-out bits are kept as a sticky bit -
             * confirm against softfloat-macros.h.
             */
            shift64RightJamming(frac, 1 - exp, &frac);
            if (frac & round_mask) {
                /* Need to recompute round-to-even.  */
                if (s->float_rounding_mode == float_round_nearest_even) {
                    inc = ((frac & roundeven_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                }
                flags |= float_flag_inexact;
                frac += inc;
            }

            /*
             * If rounding reached the implicit bit the exponent field is 1,
             * otherwise it is 0.
             */
            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
            frac >>= frac_shift;

            /* Underflow is raised only for tiny results that are inexact. */
            if (is_tiny && (flags & float_flag_inexact)) {
                flags |= float_flag_underflow;
            }
            if (exp == 0 && frac == 0) {
                p.cls = float_class_zero;
            }
        }
        break;

    case float_class_zero:
    do_zero:
        exp = 0;
        frac = 0;
        break;

    case float_class_inf:
    do_inf:
        /* The ARM Alt HP format cannot hold an infinity. */
        assert(!parm->arm_althp);
        exp = exp_max;
        frac = 0;
        break;

    case float_class_qnan:
    case float_class_snan:
        /* The ARM Alt HP format cannot hold a NaN either. */
        assert(!parm->arm_althp);
        exp = exp_max;
        /* Move the NaN fraction back down to the format's field position. */
        frac >>= parm->frac_shift;
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    p.exp = exp;
    p.frac = frac;
    return p;
}
830 
831 /* Explicit FloatFmt version */
832 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
833                                             const FloatFmt *params)
834 {
835     return sf_canonicalize(float16_unpack_raw(f), params, s);
836 }
837 
838 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
839 {
840     return float16a_unpack_canonical(f, s, &float16_params);
841 }
842 
843 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
844                                              const FloatFmt *params)
845 {
846     return float16_pack_raw(round_canonical(p, s, params));
847 }
848 
849 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
850 {
851     return float16a_round_pack_canonical(p, s, &float16_params);
852 }
853 
854 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
855 {
856     return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
857 }
858 
859 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
860 {
861     return float32_pack_raw(round_canonical(p, s, &float32_params));
862 }
863 
864 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
865 {
866     return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
867 }
868 
869 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
870 {
871     return float64_pack_raw(round_canonical(p, s, &float64_params));
872 }
873 
874 static FloatParts return_nan(FloatParts a, float_status *s)
875 {
876     switch (a.cls) {
877     case float_class_snan:
878         s->float_exception_flags |= float_flag_invalid;
879         a = parts_silence_nan(a, s);
880         /* fall through */
881     case float_class_qnan:
882         if (s->default_nan_mode) {
883             return parts_default_nan(s);
884         }
885         break;
886 
887     default:
888         g_assert_not_reached();
889     }
890     return a;
891 }
892 
893 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
894 {
895     if (is_snan(a.cls) || is_snan(b.cls)) {
896         s->float_exception_flags |= float_flag_invalid;
897     }
898 
899     if (s->default_nan_mode) {
900         return parts_default_nan(s);
901     } else {
902         if (pickNaN(a.cls, b.cls,
903                     a.frac > b.frac ||
904                     (a.frac == b.frac && a.sign < b.sign))) {
905             a = b;
906         }
907         if (is_snan(a.cls)) {
908             return parts_silence_nan(a, s);
909         }
910     }
911     return a;
912 }
913 
914 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
915                                   bool inf_zero, float_status *s)
916 {
917     int which;
918 
919     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
920         s->float_exception_flags |= float_flag_invalid;
921     }
922 
923     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
924 
925     if (s->default_nan_mode) {
926         /* Note that this check is after pickNaNMulAdd so that function
927          * has an opportunity to set the Invalid flag.
928          */
929         which = 3;
930     }
931 
932     switch (which) {
933     case 0:
934         break;
935     case 1:
936         a = b;
937         break;
938     case 2:
939         a = c;
940         break;
941     case 3:
942         return parts_default_nan(s);
943     default:
944         g_assert_not_reached();
945     }
946 
947     if (is_snan(a.cls)) {
948         return parts_silence_nan(a, s);
949     }
950     return a;
951 }
952 
953 /*
954  * Returns the result of adding or subtracting the values of the
955  * floating-point values `a' and `b'. The operation is performed
956  * according to the IEC/IEEE Standard for Binary Floating-Point
957  * Arithmetic.
958  */
959 
960 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
961                                 float_status *s)
962 {
963     bool a_sign = a.sign;
964     bool b_sign = b.sign ^ subtract;
965 
966     if (a_sign != b_sign) {
967         /* Subtraction */
968 
969         if (a.cls == float_class_normal && b.cls == float_class_normal) {
970             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
971                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
972                 a.frac = a.frac - b.frac;
973             } else {
974                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
975                 a.frac = b.frac - a.frac;
976                 a.exp = b.exp;
977                 a_sign ^= 1;
978             }
979 
980             if (a.frac == 0) {
981                 a.cls = float_class_zero;
982                 a.sign = s->float_rounding_mode == float_round_down;
983             } else {
984                 int shift = clz64(a.frac) - 1;
985                 a.frac = a.frac << shift;
986                 a.exp = a.exp - shift;
987                 a.sign = a_sign;
988             }
989             return a;
990         }
991         if (is_nan(a.cls) || is_nan(b.cls)) {
992             return pick_nan(a, b, s);
993         }
994         if (a.cls == float_class_inf) {
995             if (b.cls == float_class_inf) {
996                 float_raise(float_flag_invalid, s);
997                 return parts_default_nan(s);
998             }
999             return a;
1000         }
1001         if (a.cls == float_class_zero && b.cls == float_class_zero) {
1002             a.sign = s->float_rounding_mode == float_round_down;
1003             return a;
1004         }
1005         if (a.cls == float_class_zero || b.cls == float_class_inf) {
1006             b.sign = a_sign ^ 1;
1007             return b;
1008         }
1009         if (b.cls == float_class_zero) {
1010             return a;
1011         }
1012     } else {
1013         /* Addition */
1014         if (a.cls == float_class_normal && b.cls == float_class_normal) {
1015             if (a.exp > b.exp) {
1016                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1017             } else if (a.exp < b.exp) {
1018                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1019                 a.exp = b.exp;
1020             }
1021             a.frac += b.frac;
1022             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1023                 shift64RightJamming(a.frac, 1, &a.frac);
1024                 a.exp += 1;
1025             }
1026             return a;
1027         }
1028         if (is_nan(a.cls) || is_nan(b.cls)) {
1029             return pick_nan(a, b, s);
1030         }
1031         if (a.cls == float_class_inf || b.cls == float_class_zero) {
1032             return a;
1033         }
1034         if (b.cls == float_class_inf || a.cls == float_class_zero) {
1035             b.sign = b_sign;
1036             return b;
1037         }
1038     }
1039     g_assert_not_reached();
1040 }
1041 
1042 /*
1043  * Returns the result of adding or subtracting the floating-point
1044  * values `a' and `b'. The operation is performed according to the
1045  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1046  */
1047 
1048 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1049 {
1050     FloatParts pa = float16_unpack_canonical(a, status);
1051     FloatParts pb = float16_unpack_canonical(b, status);
1052     FloatParts pr = addsub_floats(pa, pb, false, status);
1053 
1054     return float16_round_pack_canonical(pr, status);
1055 }
1056 
1057 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1058 {
1059     FloatParts pa = float16_unpack_canonical(a, status);
1060     FloatParts pb = float16_unpack_canonical(b, status);
1061     FloatParts pr = addsub_floats(pa, pb, true, status);
1062 
1063     return float16_round_pack_canonical(pr, status);
1064 }
1065 
1066 static float32 QEMU_SOFTFLOAT_ATTR
1067 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1068 {
1069     FloatParts pa = float32_unpack_canonical(a, status);
1070     FloatParts pb = float32_unpack_canonical(b, status);
1071     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1072 
1073     return float32_round_pack_canonical(pr, status);
1074 }
1075 
1076 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1077 {
1078     return soft_f32_addsub(a, b, false, status);
1079 }
1080 
1081 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1082 {
1083     return soft_f32_addsub(a, b, true, status);
1084 }
1085 
1086 static float64 QEMU_SOFTFLOAT_ATTR
1087 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1088 {
1089     FloatParts pa = float64_unpack_canonical(a, status);
1090     FloatParts pb = float64_unpack_canonical(b, status);
1091     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1092 
1093     return float64_round_pack_canonical(pr, status);
1094 }
1095 
1096 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1097 {
1098     return soft_f64_addsub(a, b, false, status);
1099 }
1100 
1101 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1102 {
1103     return soft_f64_addsub(a, b, true, status);
1104 }
1105 
/* Native C float addition; passed as the `hard' callback by float32_add(). */
static float hard_f32_add(float a, float b)
{
    const float sum = a + b;

    return sum;
}
1110 
/* Native C float subtraction; passed as the `hard' callback by float32_sub(). */
static float hard_f32_sub(float a, float b)
{
    const float diff = a - b;

    return diff;
}
1115 
/* Native C double addition; passed as the `hard' callback by float64_add(). */
static double hard_f64_add(double a, double b)
{
    const double sum = a + b;

    return sum;
}
1120 
/* Native C double subtraction; passed as the `hard' callback by float64_sub(). */
static double hard_f64_sub(double a, double b)
{
    const double diff = a - b;

    return diff;
}
1125 
1126 static bool f32_addsub_post(union_float32 a, union_float32 b)
1127 {
1128     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1129         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1130     }
1131     return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1132 }
1133 
1134 static bool f64_addsub_post(union_float64 a, union_float64 b)
1135 {
1136     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1137         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1138     } else {
1139         return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1140     }
1141 }
1142 
1143 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1144                               hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1145 {
1146     return float32_gen2(a, b, s, hard, soft,
1147                         f32_is_zon2, f32_addsub_post, NULL, NULL);
1148 }
1149 
1150 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1151                               hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1152 {
1153     return float64_gen2(a, b, s, hard, soft,
1154                         f64_is_zon2, f64_addsub_post, NULL, NULL);
1155 }
1156 
1157 float32 QEMU_FLATTEN
1158 float32_add(float32 a, float32 b, float_status *s)
1159 {
1160     return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1161 }
1162 
1163 float32 QEMU_FLATTEN
1164 float32_sub(float32 a, float32 b, float_status *s)
1165 {
1166     return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1167 }
1168 
1169 float64 QEMU_FLATTEN
1170 float64_add(float64 a, float64 b, float_status *s)
1171 {
1172     return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1173 }
1174 
1175 float64 QEMU_FLATTEN
1176 float64_sub(float64 a, float64 b, float_status *s)
1177 {
1178     return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1179 }
1180 
1181 /*
1182  * Returns the result of multiplying the floating-point values `a' and
1183  * `b'. The operation is performed according to the IEC/IEEE Standard
1184  * for Binary Floating-Point Arithmetic.
1185  */
1186 
1187 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
1188 {
1189     bool sign = a.sign ^ b.sign;
1190 
1191     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1192         uint64_t hi, lo;
1193         int exp = a.exp + b.exp;
1194 
1195         mul64To128(a.frac, b.frac, &hi, &lo);
1196         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1197         if (lo & DECOMPOSED_OVERFLOW_BIT) {
1198             shift64RightJamming(lo, 1, &lo);
1199             exp += 1;
1200         }
1201 
1202         /* Re-use a */
1203         a.exp = exp;
1204         a.sign = sign;
1205         a.frac = lo;
1206         return a;
1207     }
1208     /* handle all the NaN cases */
1209     if (is_nan(a.cls) || is_nan(b.cls)) {
1210         return pick_nan(a, b, s);
1211     }
1212     /* Inf * Zero == NaN */
1213     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1214         (a.cls == float_class_zero && b.cls == float_class_inf)) {
1215         s->float_exception_flags |= float_flag_invalid;
1216         return parts_default_nan(s);
1217     }
1218     /* Multiply by 0 or Inf */
1219     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1220         a.sign = sign;
1221         return a;
1222     }
1223     if (b.cls == float_class_inf || b.cls == float_class_zero) {
1224         b.sign = sign;
1225         return b;
1226     }
1227     g_assert_not_reached();
1228 }
1229 
1230 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1231 {
1232     FloatParts pa = float16_unpack_canonical(a, status);
1233     FloatParts pb = float16_unpack_canonical(b, status);
1234     FloatParts pr = mul_floats(pa, pb, status);
1235 
1236     return float16_round_pack_canonical(pr, status);
1237 }
1238 
1239 float32 QEMU_FLATTEN float32_mul(float32 a, float32 b, float_status *status)
1240 {
1241     FloatParts pa = float32_unpack_canonical(a, status);
1242     FloatParts pb = float32_unpack_canonical(b, status);
1243     FloatParts pr = mul_floats(pa, pb, status);
1244 
1245     return float32_round_pack_canonical(pr, status);
1246 }
1247 
1248 float64 QEMU_FLATTEN float64_mul(float64 a, float64 b, float_status *status)
1249 {
1250     FloatParts pa = float64_unpack_canonical(a, status);
1251     FloatParts pb = float64_unpack_canonical(b, status);
1252     FloatParts pr = mul_floats(pa, pb, status);
1253 
1254     return float64_round_pack_canonical(pr, status);
1255 }
1256 
/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b' then adding 'c', with no intermediate rounding step after the
 * multiplication. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 * The flags argument allows the caller to select negation of the
 * addend, the intermediate product, or the final result. (The
 * difference between this and having the caller do a separate
 * negation is that negating externally will flip the sign bit on
 * NaNs.)
 *
 * The flag bits tested below are float_muladd_negate_c,
 * float_muladd_negate_product, float_muladd_negate_result and
 * float_muladd_halve_result (which decrements the result exponent by
 * one).  NaN operands are handed to pick_nan_muladd(); Inf * 0 and
 * Inf - Inf raise float_flag_invalid and return the default NaN.
 */

static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
                                int flags, float_status *s)
{
    /* True when one of a, b is an infinity and the other is a zero. */
    bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
                    ((1 << float_class_inf) | (1 << float_class_zero));
    bool p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
        return pick_nan_muladd(a, b, c, inf_zero, s);
    }

    /* No NaN operand: Inf * 0 is an invalid operation. */
    if (inf_zero) {
        s->float_exception_flags |= float_flag_invalid;
        return parts_default_nan(s);
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Classify the product a * b before doing any arithmetic. */
    if (a.cls == float_class_inf || b.cls == float_class_inf) {
        p_class = float_class_inf;
    } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    /* Infinite addend: invalid only against an opposite-signed Inf product. */
    if (c.cls == float_class_inf) {
        if (p_class == float_class_inf && p_sign != c.sign) {
            s->float_exception_flags |= float_flag_invalid;
            return parts_default_nan(s);
        } else {
            a.cls = float_class_inf;
            a.sign = c.sign ^ sign_flip;
            return a;
        }
    }

    /* Infinite product with a finite addend. */
    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    /* Zero product: the result is c, with the sign of 0 + 0 fixed up. */
    if (p_class == float_class_zero) {
        if (c.cls == float_class_zero) {
            if (p_sign != c.sign) {
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
     * result.
     */
    mul64To128(a.frac, b.frac, &hi, &lo);
    /* binary point now at bit 124 */

    /* check for overflow */
    if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
        shift128RightJamming(hi, lo, 1, &hi, &lo);
        p_exp += 1;
    }

    /* + add/sub */
    if (c.cls == float_class_zero) {
        /* move binary point back to 62 */
        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
    } else {
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                /* c has the larger (or equal) exponent: align the product
                 * to c and add in the 64-bit domain.
                 */
                shift128RightJamming(hi, lo,
                                     DECOMPOSED_BINARY_POINT - exp_diff,
                                     &hi, &lo);
                lo += c.frac;
                p_exp = c.exp;
            } else {
                uint64_t c_hi, c_lo;
                /* shift c to the same binary point as the product (124) */
                c_hi = c.frac >> 2;
                c_lo = 0;
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                add128(hi, lo, c_hi, c_lo, &hi, &lo);
                /* move binary point back to 62 */
                shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
            }

            /* Renormalize if the sum carried into the overflow bit. */
            if (lo & DECOMPOSED_OVERFLOW_BIT) {
                shift64RightJamming(lo, 1, &lo);
                p_exp += 1;
            }

        } else {
            /* Subtraction */
            uint64_t c_hi, c_lo;
            /* make C binary point match product at bit 124 */
            c_hi = c.frac >> 2;
            c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    /* c is the larger operand: compute c - product and
                     * flip the result sign.
                     */
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            if (hi == 0 && lo == 0) {
                /* Exact cancellation: zero, negative only when rounding
                 * down (before the negate_result flip).
                 */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                /* Count leading zeros across the 128-bit difference. */
                int shift;
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Normalizing to a binary point of 124 is the
                   correct adjust for the exponent.  However since we're
                   shifting, we might as well put the binary point back
                   at 62 where we really want it.  Therefore shift as
                   if we're leaving 1 bit at the top of the word, but
                   adjust the exponent as if we're leaving 3 bits.  */
                shift -= 1;
                if (shift >= 64) {
                    lo = lo << (shift - 64);
                } else {
                    hi = (hi << shift) | (lo >> (64 - shift));
                    /* Fold any bits shifted out of lo into a sticky lsb. */
                    lo = hi | ((lo << shift) != 0);
                }
                p_exp -= shift - 2;
            }
        }
    }

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = lo;

    return a;
}
1459 
1460 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1461                                                 int flags, float_status *status)
1462 {
1463     FloatParts pa = float16_unpack_canonical(a, status);
1464     FloatParts pb = float16_unpack_canonical(b, status);
1465     FloatParts pc = float16_unpack_canonical(c, status);
1466     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1467 
1468     return float16_round_pack_canonical(pr, status);
1469 }
1470 
1471 float32 QEMU_FLATTEN float32_muladd(float32 a, float32 b, float32 c,
1472                                                 int flags, float_status *status)
1473 {
1474     FloatParts pa = float32_unpack_canonical(a, status);
1475     FloatParts pb = float32_unpack_canonical(b, status);
1476     FloatParts pc = float32_unpack_canonical(c, status);
1477     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1478 
1479     return float32_round_pack_canonical(pr, status);
1480 }
1481 
1482 float64 QEMU_FLATTEN float64_muladd(float64 a, float64 b, float64 c,
1483                                                 int flags, float_status *status)
1484 {
1485     FloatParts pa = float64_unpack_canonical(a, status);
1486     FloatParts pb = float64_unpack_canonical(b, status);
1487     FloatParts pc = float64_unpack_canonical(c, status);
1488     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1489 
1490     return float64_round_pack_canonical(pr, status);
1491 }
1492 
1493 /*
1494  * Returns the result of dividing the floating-point value `a' by the
1495  * corresponding value `b'. The operation is performed according to
1496  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1497  */
1498 
1499 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1500 {
1501     bool sign = a.sign ^ b.sign;
1502 
1503     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1504         uint64_t n0, n1, q, r;
1505         int exp = a.exp - b.exp;
1506 
1507         /*
1508          * We want a 2*N / N-bit division to produce exactly an N-bit
1509          * result, so that we do not lose any precision and so that we
1510          * do not have to renormalize afterward.  If A.frac < B.frac,
1511          * then division would produce an (N-1)-bit result; shift A left
1512          * by one to produce the an N-bit result, and decrement the
1513          * exponent to match.
1514          *
1515          * The udiv_qrnnd algorithm that we're using requires normalization,
1516          * i.e. the msb of the denominator must be set.  Since we know that
1517          * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1518          * by one (more), and the remainder must be shifted right by one.
1519          */
1520         if (a.frac < b.frac) {
1521             exp -= 1;
1522             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
1523         } else {
1524             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1525         }
1526         q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1527 
1528         /*
1529          * Set lsb if there is a remainder, to set inexact.
1530          * As mentioned above, to find the actual value of the remainder we
1531          * would need to shift right, but (1) we are only concerned about
1532          * non-zero-ness, and (2) the remainder will always be even because
1533          * both inputs to the division primitive are even.
1534          */
1535         a.frac = q | (r != 0);
1536         a.sign = sign;
1537         a.exp = exp;
1538         return a;
1539     }
1540     /* handle all the NaN cases */
1541     if (is_nan(a.cls) || is_nan(b.cls)) {
1542         return pick_nan(a, b, s);
1543     }
1544     /* 0/0 or Inf/Inf */
1545     if (a.cls == b.cls
1546         &&
1547         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1548         s->float_exception_flags |= float_flag_invalid;
1549         return parts_default_nan(s);
1550     }
1551     /* Inf / x or 0 / x */
1552     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1553         a.sign = sign;
1554         return a;
1555     }
1556     /* Div 0 => Inf */
1557     if (b.cls == float_class_zero) {
1558         s->float_exception_flags |= float_flag_divbyzero;
1559         a.cls = float_class_inf;
1560         a.sign = sign;
1561         return a;
1562     }
1563     /* Div by Inf */
1564     if (b.cls == float_class_inf) {
1565         a.cls = float_class_zero;
1566         a.sign = sign;
1567         return a;
1568     }
1569     g_assert_not_reached();
1570 }
1571 
1572 float16 float16_div(float16 a, float16 b, float_status *status)
1573 {
1574     FloatParts pa = float16_unpack_canonical(a, status);
1575     FloatParts pb = float16_unpack_canonical(b, status);
1576     FloatParts pr = div_floats(pa, pb, status);
1577 
1578     return float16_round_pack_canonical(pr, status);
1579 }
1580 
1581 float32 float32_div(float32 a, float32 b, float_status *status)
1582 {
1583     FloatParts pa = float32_unpack_canonical(a, status);
1584     FloatParts pb = float32_unpack_canonical(b, status);
1585     FloatParts pr = div_floats(pa, pb, status);
1586 
1587     return float32_round_pack_canonical(pr, status);
1588 }
1589 
1590 float64 float64_div(float64 a, float64 b, float_status *status)
1591 {
1592     FloatParts pa = float64_unpack_canonical(a, status);
1593     FloatParts pb = float64_unpack_canonical(b, status);
1594     FloatParts pr = div_floats(pa, pb, status);
1595 
1596     return float64_round_pack_canonical(pr, status);
1597 }
1598 
1599 /*
1600  * Float to Float conversions
1601  *
1602  * Returns the result of converting one float format to another. The
1603  * conversion is performed according to the IEC/IEEE Standard for
1604  * Binary Floating-Point Arithmetic.
1605  *
1606  * The float_to_float helper only needs to take care of raising
1607  * invalid exceptions and handling the conversion on NaNs.
1608  */
1609 
1610 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1611                                  float_status *s)
1612 {
1613     if (dstf->arm_althp) {
1614         switch (a.cls) {
1615         case float_class_qnan:
1616         case float_class_snan:
1617             /* There is no NaN in the destination format.  Raise Invalid
1618              * and return a zero with the sign of the input NaN.
1619              */
1620             s->float_exception_flags |= float_flag_invalid;
1621             a.cls = float_class_zero;
1622             a.frac = 0;
1623             a.exp = 0;
1624             break;
1625 
1626         case float_class_inf:
1627             /* There is no Inf in the destination format.  Raise Invalid
1628              * and return the maximum normal with the correct sign.
1629              */
1630             s->float_exception_flags |= float_flag_invalid;
1631             a.cls = float_class_normal;
1632             a.exp = dstf->exp_max;
1633             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1634             break;
1635 
1636         default:
1637             break;
1638         }
1639     } else if (is_nan(a.cls)) {
1640         if (is_snan(a.cls)) {
1641             s->float_exception_flags |= float_flag_invalid;
1642             a = parts_silence_nan(a, s);
1643         }
1644         if (s->default_nan_mode) {
1645             return parts_default_nan(s);
1646         }
1647     }
1648     return a;
1649 }
1650 
1651 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1652 {
1653     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1654     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1655     FloatParts pr = float_to_float(p, &float32_params, s);
1656     return float32_round_pack_canonical(pr, s);
1657 }
1658 
1659 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1660 {
1661     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1662     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1663     FloatParts pr = float_to_float(p, &float64_params, s);
1664     return float64_round_pack_canonical(pr, s);
1665 }
1666 
1667 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1668 {
1669     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1670     FloatParts p = float32_unpack_canonical(a, s);
1671     FloatParts pr = float_to_float(p, fmt16, s);
1672     return float16a_round_pack_canonical(pr, s, fmt16);
1673 }
1674 
1675 float64 float32_to_float64(float32 a, float_status *s)
1676 {
1677     FloatParts p = float32_unpack_canonical(a, s);
1678     FloatParts pr = float_to_float(p, &float64_params, s);
1679     return float64_round_pack_canonical(pr, s);
1680 }
1681 
1682 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1683 {
1684     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1685     FloatParts p = float64_unpack_canonical(a, s);
1686     FloatParts pr = float_to_float(p, fmt16, s);
1687     return float16a_round_pack_canonical(pr, s, fmt16);
1688 }
1689 
1690 float32 float64_to_float32(float64 a, float_status *s)
1691 {
1692     FloatParts p = float64_unpack_canonical(a, s);
1693     FloatParts pr = float_to_float(p, &float32_params, s);
1694     return float32_round_pack_canonical(pr, s);
1695 }
1696 
1697 /*
1698  * Rounds the floating-point value `a' to an integer, and returns the
1699  * result as a floating-point value. The operation is performed
1700  * according to the IEC/IEEE Standard for Binary Floating-Point
1701  * Arithmetic.
1702  */
1703 
1704 static FloatParts round_to_int(FloatParts a, int rmode,
1705                                int scale, float_status *s)
1706 {
1707     switch (a.cls) {
1708     case float_class_qnan:
1709     case float_class_snan:
1710         return return_nan(a, s);
1711 
1712     case float_class_zero:
1713     case float_class_inf:
1714         /* already "integral" */
1715         break;
1716 
1717     case float_class_normal:
1718         scale = MIN(MAX(scale, -0x10000), 0x10000);
1719         a.exp += scale;
1720 
1721         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1722             /* already integral */
1723             break;
1724         }
1725         if (a.exp < 0) {
1726             bool one;
1727             /* all fractional */
1728             s->float_exception_flags |= float_flag_inexact;
1729             switch (rmode) {
1730             case float_round_nearest_even:
1731                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1732                 break;
1733             case float_round_ties_away:
1734                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1735                 break;
1736             case float_round_to_zero:
1737                 one = false;
1738                 break;
1739             case float_round_up:
1740                 one = !a.sign;
1741                 break;
1742             case float_round_down:
1743                 one = a.sign;
1744                 break;
1745             default:
1746                 g_assert_not_reached();
1747             }
1748 
1749             if (one) {
1750                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1751                 a.exp = 0;
1752             } else {
1753                 a.cls = float_class_zero;
1754             }
1755         } else {
1756             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1757             uint64_t frac_lsbm1 = frac_lsb >> 1;
1758             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1759             uint64_t rnd_mask = rnd_even_mask >> 1;
1760             uint64_t inc;
1761 
1762             switch (rmode) {
1763             case float_round_nearest_even:
1764                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1765                 break;
1766             case float_round_ties_away:
1767                 inc = frac_lsbm1;
1768                 break;
1769             case float_round_to_zero:
1770                 inc = 0;
1771                 break;
1772             case float_round_up:
1773                 inc = a.sign ? 0 : rnd_mask;
1774                 break;
1775             case float_round_down:
1776                 inc = a.sign ? rnd_mask : 0;
1777                 break;
1778             default:
1779                 g_assert_not_reached();
1780             }
1781 
1782             if (a.frac & rnd_mask) {
1783                 s->float_exception_flags |= float_flag_inexact;
1784                 a.frac += inc;
1785                 a.frac &= ~rnd_mask;
1786                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1787                     a.frac >>= 1;
1788                     a.exp++;
1789                 }
1790             }
1791         }
1792         break;
1793     default:
1794         g_assert_not_reached();
1795     }
1796     return a;
1797 }
1798 
1799 float16 float16_round_to_int(float16 a, float_status *s)
1800 {
1801     FloatParts pa = float16_unpack_canonical(a, s);
1802     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1803     return float16_round_pack_canonical(pr, s);
1804 }
1805 
1806 float32 float32_round_to_int(float32 a, float_status *s)
1807 {
1808     FloatParts pa = float32_unpack_canonical(a, s);
1809     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1810     return float32_round_pack_canonical(pr, s);
1811 }
1812 
1813 float64 float64_round_to_int(float64 a, float_status *s)
1814 {
1815     FloatParts pa = float64_unpack_canonical(a, s);
1816     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1817     return float64_round_pack_canonical(pr, s);
1818 }
1819 
1820 /*
1821  * Returns the result of converting the floating-point value `a' to
1822  * the two's complement integer format. The conversion is performed
1823  * according to the IEC/IEEE Standard for Binary Floating-Point
1824  * Arithmetic---which means in particular that the conversion is
1825  * rounded according to the current rounding mode. If `a' is a NaN,
1826  * the largest positive integer is returned. Otherwise, if the
1827  * conversion overflows, the largest integer with the same sign as `a'
1828  * is returned.
1829 */
1830 
1831 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
1832                                      int64_t min, int64_t max,
1833                                      float_status *s)
1834 {
1835     uint64_t r;
1836     int orig_flags = get_float_exception_flags(s);
1837     FloatParts p = round_to_int(in, rmode, scale, s);
1838 
1839     switch (p.cls) {
1840     case float_class_snan:
1841     case float_class_qnan:
1842         s->float_exception_flags = orig_flags | float_flag_invalid;
1843         return max;
1844     case float_class_inf:
1845         s->float_exception_flags = orig_flags | float_flag_invalid;
1846         return p.sign ? min : max;
1847     case float_class_zero:
1848         return 0;
1849     case float_class_normal:
1850         if (p.exp < DECOMPOSED_BINARY_POINT) {
1851             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1852         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1853             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1854         } else {
1855             r = UINT64_MAX;
1856         }
1857         if (p.sign) {
1858             if (r <= -(uint64_t) min) {
1859                 return -r;
1860             } else {
1861                 s->float_exception_flags = orig_flags | float_flag_invalid;
1862                 return min;
1863             }
1864         } else {
1865             if (r <= max) {
1866                 return r;
1867             } else {
1868                 s->float_exception_flags = orig_flags | float_flag_invalid;
1869                 return max;
1870             }
1871         }
1872     default:
1873         g_assert_not_reached();
1874     }
1875 }
1876 
1877 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
1878                                 float_status *s)
1879 {
1880     return round_to_int_and_pack(float16_unpack_canonical(a, s),
1881                                  rmode, scale, INT16_MIN, INT16_MAX, s);
1882 }
1883 
1884 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
1885                                 float_status *s)
1886 {
1887     return round_to_int_and_pack(float16_unpack_canonical(a, s),
1888                                  rmode, scale, INT32_MIN, INT32_MAX, s);
1889 }
1890 
1891 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
1892                                 float_status *s)
1893 {
1894     return round_to_int_and_pack(float16_unpack_canonical(a, s),
1895                                  rmode, scale, INT64_MIN, INT64_MAX, s);
1896 }
1897 
1898 int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale,
1899                                 float_status *s)
1900 {
1901     return round_to_int_and_pack(float32_unpack_canonical(a, s),
1902                                  rmode, scale, INT16_MIN, INT16_MAX, s);
1903 }
1904 
1905 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale,
1906                                 float_status *s)
1907 {
1908     return round_to_int_and_pack(float32_unpack_canonical(a, s),
1909                                  rmode, scale, INT32_MIN, INT32_MAX, s);
1910 }
1911 
1912 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale,
1913                                 float_status *s)
1914 {
1915     return round_to_int_and_pack(float32_unpack_canonical(a, s),
1916                                  rmode, scale, INT64_MIN, INT64_MAX, s);
1917 }
1918 
1919 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale,
1920                                 float_status *s)
1921 {
1922     return round_to_int_and_pack(float64_unpack_canonical(a, s),
1923                                  rmode, scale, INT16_MIN, INT16_MAX, s);
1924 }
1925 
1926 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale,
1927                                 float_status *s)
1928 {
1929     return round_to_int_and_pack(float64_unpack_canonical(a, s),
1930                                  rmode, scale, INT32_MIN, INT32_MAX, s);
1931 }
1932 
1933 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale,
1934                                 float_status *s)
1935 {
1936     return round_to_int_and_pack(float64_unpack_canonical(a, s),
1937                                  rmode, scale, INT64_MIN, INT64_MAX, s);
1938 }
1939 
1940 int16_t float16_to_int16(float16 a, float_status *s)
1941 {
1942     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
1943 }
1944 
1945 int32_t float16_to_int32(float16 a, float_status *s)
1946 {
1947     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
1948 }
1949 
1950 int64_t float16_to_int64(float16 a, float_status *s)
1951 {
1952     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
1953 }
1954 
1955 int16_t float32_to_int16(float32 a, float_status *s)
1956 {
1957     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
1958 }
1959 
1960 int32_t float32_to_int32(float32 a, float_status *s)
1961 {
1962     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
1963 }
1964 
1965 int64_t float32_to_int64(float32 a, float_status *s)
1966 {
1967     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
1968 }
1969 
1970 int16_t float64_to_int16(float64 a, float_status *s)
1971 {
1972     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
1973 }
1974 
1975 int32_t float64_to_int32(float64 a, float_status *s)
1976 {
1977     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
1978 }
1979 
1980 int64_t float64_to_int64(float64 a, float_status *s)
1981 {
1982     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
1983 }
1984 
1985 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
1986 {
1987     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
1988 }
1989 
1990 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
1991 {
1992     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
1993 }
1994 
1995 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
1996 {
1997     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
1998 }
1999 
2000 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2001 {
2002     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2003 }
2004 
2005 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2006 {
2007     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2008 }
2009 
2010 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2011 {
2012     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2013 }
2014 
2015 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2016 {
2017     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2018 }
2019 
2020 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2021 {
2022     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2023 }
2024 
2025 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2026 {
2027     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2028 }
2029 
/*
 *  Returns the result of converting the floating-point value `a' to
 *  the unsigned integer format. The conversion is performed according
 *  to the IEC/IEEE Standard for Binary Floating-Point
 *  Arithmetic---which means in particular that the conversion is
 *  rounded according to the current rounding mode. If `a' is a NaN,
 *  the largest unsigned integer is returned. Otherwise, if the
 *  conversion overflows, the largest unsigned integer is returned. If
 *  `a' is negative, the result is rounded and zero is returned;
 *  values that do not round to zero will raise the invalid exception
 *  flag.
 */
2042 
2043 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale,
2044                                        uint64_t max, float_status *s)
2045 {
2046     int orig_flags = get_float_exception_flags(s);
2047     FloatParts p = round_to_int(in, rmode, scale, s);
2048     uint64_t r;
2049 
2050     switch (p.cls) {
2051     case float_class_snan:
2052     case float_class_qnan:
2053         s->float_exception_flags = orig_flags | float_flag_invalid;
2054         return max;
2055     case float_class_inf:
2056         s->float_exception_flags = orig_flags | float_flag_invalid;
2057         return p.sign ? 0 : max;
2058     case float_class_zero:
2059         return 0;
2060     case float_class_normal:
2061         if (p.sign) {
2062             s->float_exception_flags = orig_flags | float_flag_invalid;
2063             return 0;
2064         }
2065 
2066         if (p.exp < DECOMPOSED_BINARY_POINT) {
2067             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2068         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2069             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2070         } else {
2071             s->float_exception_flags = orig_flags | float_flag_invalid;
2072             return max;
2073         }
2074 
2075         /* For uint64 this will never trip, but if p.exp is too large
2076          * to shift a decomposed fraction we shall have exited via the
2077          * 3rd leg above.
2078          */
2079         if (r > max) {
2080             s->float_exception_flags = orig_flags | float_flag_invalid;
2081             return max;
2082         }
2083         return r;
2084     default:
2085         g_assert_not_reached();
2086     }
2087 }
2088 
2089 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale,
2090                                   float_status *s)
2091 {
2092     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2093                                   rmode, scale, UINT16_MAX, s);
2094 }
2095 
2096 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale,
2097                                   float_status *s)
2098 {
2099     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2100                                   rmode, scale, UINT32_MAX, s);
2101 }
2102 
2103 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale,
2104                                   float_status *s)
2105 {
2106     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2107                                   rmode, scale, UINT64_MAX, s);
2108 }
2109 
2110 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale,
2111                                   float_status *s)
2112 {
2113     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2114                                   rmode, scale, UINT16_MAX, s);
2115 }
2116 
2117 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale,
2118                                   float_status *s)
2119 {
2120     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2121                                   rmode, scale, UINT32_MAX, s);
2122 }
2123 
2124 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale,
2125                                   float_status *s)
2126 {
2127     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2128                                   rmode, scale, UINT64_MAX, s);
2129 }
2130 
2131 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale,
2132                                   float_status *s)
2133 {
2134     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2135                                   rmode, scale, UINT16_MAX, s);
2136 }
2137 
2138 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale,
2139                                   float_status *s)
2140 {
2141     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2142                                   rmode, scale, UINT32_MAX, s);
2143 }
2144 
2145 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale,
2146                                   float_status *s)
2147 {
2148     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2149                                   rmode, scale, UINT64_MAX, s);
2150 }
2151 
2152 uint16_t float16_to_uint16(float16 a, float_status *s)
2153 {
2154     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2155 }
2156 
2157 uint32_t float16_to_uint32(float16 a, float_status *s)
2158 {
2159     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2160 }
2161 
2162 uint64_t float16_to_uint64(float16 a, float_status *s)
2163 {
2164     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2165 }
2166 
2167 uint16_t float32_to_uint16(float32 a, float_status *s)
2168 {
2169     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2170 }
2171 
2172 uint32_t float32_to_uint32(float32 a, float_status *s)
2173 {
2174     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2175 }
2176 
2177 uint64_t float32_to_uint64(float32 a, float_status *s)
2178 {
2179     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2180 }
2181 
2182 uint16_t float64_to_uint16(float64 a, float_status *s)
2183 {
2184     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2185 }
2186 
2187 uint32_t float64_to_uint32(float64 a, float_status *s)
2188 {
2189     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2190 }
2191 
2192 uint64_t float64_to_uint64(float64 a, float_status *s)
2193 {
2194     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2195 }
2196 
2197 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2198 {
2199     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2200 }
2201 
2202 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2203 {
2204     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2205 }
2206 
2207 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2208 {
2209     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2210 }
2211 
2212 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2213 {
2214     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2215 }
2216 
2217 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2218 {
2219     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2220 }
2221 
2222 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2223 {
2224     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2225 }
2226 
2227 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2228 {
2229     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2230 }
2231 
2232 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2233 {
2234     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2235 }
2236 
2237 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2238 {
2239     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2240 }
2241 
2242 /*
2243  * Integer to float conversions
2244  *
2245  * Returns the result of converting the two's complement integer `a'
2246  * to the floating-point format. The conversion is performed according
2247  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2248  */
2249 
2250 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
2251 {
2252     FloatParts r = { .sign = false };
2253 
2254     if (a == 0) {
2255         r.cls = float_class_zero;
2256     } else {
2257         uint64_t f = a;
2258         int shift;
2259 
2260         r.cls = float_class_normal;
2261         if (a < 0) {
2262             f = -f;
2263             r.sign = true;
2264         }
2265         shift = clz64(f) - 1;
2266         scale = MIN(MAX(scale, -0x10000), 0x10000);
2267 
2268         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2269         r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
2270     }
2271 
2272     return r;
2273 }
2274 
2275 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2276 {
2277     FloatParts pa = int_to_float(a, scale, status);
2278     return float16_round_pack_canonical(pa, status);
2279 }
2280 
2281 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2282 {
2283     return int64_to_float16_scalbn(a, scale, status);
2284 }
2285 
2286 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2287 {
2288     return int64_to_float16_scalbn(a, scale, status);
2289 }
2290 
2291 float16 int64_to_float16(int64_t a, float_status *status)
2292 {
2293     return int64_to_float16_scalbn(a, 0, status);
2294 }
2295 
2296 float16 int32_to_float16(int32_t a, float_status *status)
2297 {
2298     return int64_to_float16_scalbn(a, 0, status);
2299 }
2300 
2301 float16 int16_to_float16(int16_t a, float_status *status)
2302 {
2303     return int64_to_float16_scalbn(a, 0, status);
2304 }
2305 
2306 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2307 {
2308     FloatParts pa = int_to_float(a, scale, status);
2309     return float32_round_pack_canonical(pa, status);
2310 }
2311 
2312 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2313 {
2314     return int64_to_float32_scalbn(a, scale, status);
2315 }
2316 
2317 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2318 {
2319     return int64_to_float32_scalbn(a, scale, status);
2320 }
2321 
2322 float32 int64_to_float32(int64_t a, float_status *status)
2323 {
2324     return int64_to_float32_scalbn(a, 0, status);
2325 }
2326 
2327 float32 int32_to_float32(int32_t a, float_status *status)
2328 {
2329     return int64_to_float32_scalbn(a, 0, status);
2330 }
2331 
2332 float32 int16_to_float32(int16_t a, float_status *status)
2333 {
2334     return int64_to_float32_scalbn(a, 0, status);
2335 }
2336 
2337 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2338 {
2339     FloatParts pa = int_to_float(a, scale, status);
2340     return float64_round_pack_canonical(pa, status);
2341 }
2342 
2343 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2344 {
2345     return int64_to_float64_scalbn(a, scale, status);
2346 }
2347 
2348 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2349 {
2350     return int64_to_float64_scalbn(a, scale, status);
2351 }
2352 
2353 float64 int64_to_float64(int64_t a, float_status *status)
2354 {
2355     return int64_to_float64_scalbn(a, 0, status);
2356 }
2357 
2358 float64 int32_to_float64(int32_t a, float_status *status)
2359 {
2360     return int64_to_float64_scalbn(a, 0, status);
2361 }
2362 
2363 float64 int16_to_float64(int16_t a, float_status *status)
2364 {
2365     return int64_to_float64_scalbn(a, 0, status);
2366 }
2367 
2368 
2369 /*
2370  * Unsigned Integer to float conversions
2371  *
2372  * Returns the result of converting the unsigned integer `a' to the
2373  * floating-point format. The conversion is performed according to the
2374  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2375  */
2376 
2377 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
2378 {
2379     FloatParts r = { .sign = false };
2380 
2381     if (a == 0) {
2382         r.cls = float_class_zero;
2383     } else {
2384         scale = MIN(MAX(scale, -0x10000), 0x10000);
2385         r.cls = float_class_normal;
2386         if ((int64_t)a < 0) {
2387             r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
2388             shift64RightJamming(a, 1, &a);
2389             r.frac = a;
2390         } else {
2391             int shift = clz64(a) - 1;
2392             r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2393             r.frac = a << shift;
2394         }
2395     }
2396 
2397     return r;
2398 }
2399 
2400 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2401 {
2402     FloatParts pa = uint_to_float(a, scale, status);
2403     return float16_round_pack_canonical(pa, status);
2404 }
2405 
2406 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2407 {
2408     return uint64_to_float16_scalbn(a, scale, status);
2409 }
2410 
2411 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2412 {
2413     return uint64_to_float16_scalbn(a, scale, status);
2414 }
2415 
2416 float16 uint64_to_float16(uint64_t a, float_status *status)
2417 {
2418     return uint64_to_float16_scalbn(a, 0, status);
2419 }
2420 
2421 float16 uint32_to_float16(uint32_t a, float_status *status)
2422 {
2423     return uint64_to_float16_scalbn(a, 0, status);
2424 }
2425 
2426 float16 uint16_to_float16(uint16_t a, float_status *status)
2427 {
2428     return uint64_to_float16_scalbn(a, 0, status);
2429 }
2430 
2431 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2432 {
2433     FloatParts pa = uint_to_float(a, scale, status);
2434     return float32_round_pack_canonical(pa, status);
2435 }
2436 
2437 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2438 {
2439     return uint64_to_float32_scalbn(a, scale, status);
2440 }
2441 
2442 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2443 {
2444     return uint64_to_float32_scalbn(a, scale, status);
2445 }
2446 
2447 float32 uint64_to_float32(uint64_t a, float_status *status)
2448 {
2449     return uint64_to_float32_scalbn(a, 0, status);
2450 }
2451 
2452 float32 uint32_to_float32(uint32_t a, float_status *status)
2453 {
2454     return uint64_to_float32_scalbn(a, 0, status);
2455 }
2456 
2457 float32 uint16_to_float32(uint16_t a, float_status *status)
2458 {
2459     return uint64_to_float32_scalbn(a, 0, status);
2460 }
2461 
2462 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2463 {
2464     FloatParts pa = uint_to_float(a, scale, status);
2465     return float64_round_pack_canonical(pa, status);
2466 }
2467 
2468 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2469 {
2470     return uint64_to_float64_scalbn(a, scale, status);
2471 }
2472 
2473 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2474 {
2475     return uint64_to_float64_scalbn(a, scale, status);
2476 }
2477 
2478 float64 uint64_to_float64(uint64_t a, float_status *status)
2479 {
2480     return uint64_to_float64_scalbn(a, 0, status);
2481 }
2482 
2483 float64 uint32_to_float64(uint32_t a, float_status *status)
2484 {
2485     return uint64_to_float64_scalbn(a, 0, status);
2486 }
2487 
2488 float64 uint16_to_float64(uint16_t a, float_status *status)
2489 {
2490     return uint64_to_float64_scalbn(a, 0, status);
2491 }
2492 
/* Float Min/Max */
/* min() and max() functions. These can't be implemented as
 * 'compare and pick one input' because that would mishandle
 * NaNs and +0 vs -0.
 *
 * minnum() and maxnum() functions. These are similar to the min()
 * and max() functions but if one of the arguments is a QNaN and
 * the other is numerical then the numerical argument is returned.
 * SNaNs will get quietened before being returned.
 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
 * and maxNum() operations. min() and max() are the typical min/max
 * semantics provided by many CPUs which predate that specification.
 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from the IEEE 754-2008.
 */
2509 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2510                                 bool ieee, bool ismag, float_status *s)
2511 {
2512     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2513         if (ieee) {
2514             /* Takes two floating-point values `a' and `b', one of
2515              * which is a NaN, and returns the appropriate NaN
2516              * result. If either `a' or `b' is a signaling NaN,
2517              * the invalid exception is raised.
2518              */
2519             if (is_snan(a.cls) || is_snan(b.cls)) {
2520                 return pick_nan(a, b, s);
2521             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2522                 return b;
2523             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2524                 return a;
2525             }
2526         }
2527         return pick_nan(a, b, s);
2528     } else {
2529         int a_exp, b_exp;
2530 
2531         switch (a.cls) {
2532         case float_class_normal:
2533             a_exp = a.exp;
2534             break;
2535         case float_class_inf:
2536             a_exp = INT_MAX;
2537             break;
2538         case float_class_zero:
2539             a_exp = INT_MIN;
2540             break;
2541         default:
2542             g_assert_not_reached();
2543             break;
2544         }
2545         switch (b.cls) {
2546         case float_class_normal:
2547             b_exp = b.exp;
2548             break;
2549         case float_class_inf:
2550             b_exp = INT_MAX;
2551             break;
2552         case float_class_zero:
2553             b_exp = INT_MIN;
2554             break;
2555         default:
2556             g_assert_not_reached();
2557             break;
2558         }
2559 
2560         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2561             bool a_less = a_exp < b_exp;
2562             if (a_exp == b_exp) {
2563                 a_less = a.frac < b.frac;
2564             }
2565             return a_less ^ ismin ? b : a;
2566         }
2567 
2568         if (a.sign == b.sign) {
2569             bool a_less = a_exp < b_exp;
2570             if (a_exp == b_exp) {
2571                 a_less = a.frac < b.frac;
2572             }
2573             return a.sign ^ a_less ^ ismin ? b : a;
2574         } else {
2575             return a.sign ^ ismin ? b : a;
2576         }
2577     }
2578 }
2579 
2580 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
2581 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
2582                                      float_status *s)                   \
2583 {                                                                       \
2584     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2585     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2586     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
2587                                                                         \
2588     return float ## sz ## _round_pack_canonical(pr, s);                 \
2589 }
2590 
2591 MINMAX(16, min, true, false, false)
2592 MINMAX(16, minnum, true, true, false)
2593 MINMAX(16, minnummag, true, true, true)
2594 MINMAX(16, max, false, false, false)
2595 MINMAX(16, maxnum, false, true, false)
2596 MINMAX(16, maxnummag, false, true, true)
2597 
2598 MINMAX(32, min, true, false, false)
2599 MINMAX(32, minnum, true, true, false)
2600 MINMAX(32, minnummag, true, true, true)
2601 MINMAX(32, max, false, false, false)
2602 MINMAX(32, maxnum, false, true, false)
2603 MINMAX(32, maxnummag, false, true, true)
2604 
2605 MINMAX(64, min, true, false, false)
2606 MINMAX(64, minnum, true, true, false)
2607 MINMAX(64, minnummag, true, true, true)
2608 MINMAX(64, max, false, false, false)
2609 MINMAX(64, maxnum, false, true, false)
2610 MINMAX(64, maxnummag, false, true, true)
2611 
2612 #undef MINMAX
2613 
2614 /* Floating point compare */
2615 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2616                           float_status *s)
2617 {
2618     if (is_nan(a.cls) || is_nan(b.cls)) {
2619         if (!is_quiet ||
2620             a.cls == float_class_snan ||
2621             b.cls == float_class_snan) {
2622             s->float_exception_flags |= float_flag_invalid;
2623         }
2624         return float_relation_unordered;
2625     }
2626 
2627     if (a.cls == float_class_zero) {
2628         if (b.cls == float_class_zero) {
2629             return float_relation_equal;
2630         }
2631         return b.sign ? float_relation_greater : float_relation_less;
2632     } else if (b.cls == float_class_zero) {
2633         return a.sign ? float_relation_less : float_relation_greater;
2634     }
2635 
2636     /* The only really important thing about infinity is its sign. If
2637      * both are infinities the sign marks the smallest of the two.
2638      */
2639     if (a.cls == float_class_inf) {
2640         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2641             return float_relation_equal;
2642         }
2643         return a.sign ? float_relation_less : float_relation_greater;
2644     } else if (b.cls == float_class_inf) {
2645         return b.sign ? float_relation_greater : float_relation_less;
2646     }
2647 
2648     if (a.sign != b.sign) {
2649         return a.sign ? float_relation_less : float_relation_greater;
2650     }
2651 
2652     if (a.exp == b.exp) {
2653         if (a.frac == b.frac) {
2654             return float_relation_equal;
2655         }
2656         if (a.sign) {
2657             return a.frac > b.frac ?
2658                 float_relation_less : float_relation_greater;
2659         } else {
2660             return a.frac > b.frac ?
2661                 float_relation_greater : float_relation_less;
2662         }
2663     } else {
2664         if (a.sign) {
2665             return a.exp > b.exp ? float_relation_less : float_relation_greater;
2666         } else {
2667             return a.exp > b.exp ? float_relation_greater : float_relation_less;
2668         }
2669     }
2670 }
2671 
2672 #define COMPARE(sz)                                                     \
2673 int float ## sz ## _compare(float ## sz a, float ## sz b,               \
2674                             float_status *s)                            \
2675 {                                                                       \
2676     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2677     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2678     return compare_floats(pa, pb, false, s);                            \
2679 }                                                                       \
2680 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b,         \
2681                                   float_status *s)                      \
2682 {                                                                       \
2683     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2684     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2685     return compare_floats(pa, pb, true, s);                             \
2686 }
2687 
2688 COMPARE(16)
2689 COMPARE(32)
2690 COMPARE(64)
2691 
2692 #undef COMPARE
2693 
2694 /* Multiply A by 2 raised to the power N.  */
2695 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
2696 {
2697     if (unlikely(is_nan(a.cls))) {
2698         return return_nan(a, s);
2699     }
2700     if (a.cls == float_class_normal) {
2701         /* The largest float type (even though not supported by FloatParts)
2702          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
2703          * still allows rounding to infinity, without allowing overflow
2704          * within the int32_t that backs FloatParts.exp.
2705          */
2706         n = MIN(MAX(n, -0x10000), 0x10000);
2707         a.exp += n;
2708     }
2709     return a;
2710 }
2711 
2712 float16 float16_scalbn(float16 a, int n, float_status *status)
2713 {
2714     FloatParts pa = float16_unpack_canonical(a, status);
2715     FloatParts pr = scalbn_decomposed(pa, n, status);
2716     return float16_round_pack_canonical(pr, status);
2717 }
2718 
2719 float32 float32_scalbn(float32 a, int n, float_status *status)
2720 {
2721     FloatParts pa = float32_unpack_canonical(a, status);
2722     FloatParts pr = scalbn_decomposed(pa, n, status);
2723     return float32_round_pack_canonical(pr, status);
2724 }
2725 
2726 float64 float64_scalbn(float64 a, int n, float_status *status)
2727 {
2728     FloatParts pa = float64_unpack_canonical(a, status);
2729     FloatParts pr = scalbn_decomposed(pa, n, status);
2730     return float64_round_pack_canonical(pr, status);
2731 }
2732 
2733 /*
2734  * Square Root
2735  *
2736  * The old softfloat code did an approximation step before zeroing in
2737  * on the final result. However, for simplicity we just compute the
2738  * square root by iterating down from the implicit bit to enough extra
2739  * bits to ensure we get a correctly rounded result.
2740  *
2741  * This does mean however the calculation is slower than before,
2742  * especially for 64 bit floats.
2743  */
2744 
2745 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
2746 {
2747     uint64_t a_frac, r_frac, s_frac;
2748     int bit, last_bit;
2749 
2750     if (is_nan(a.cls)) {
2751         return return_nan(a, s);
2752     }
2753     if (a.cls == float_class_zero) {
2754         return a;  /* sqrt(+-0) = +-0 */
2755     }
2756     if (a.sign) {
2757         s->float_exception_flags |= float_flag_invalid;
2758         return parts_default_nan(s);
2759     }
2760     if (a.cls == float_class_inf) {
2761         return a;  /* sqrt(+inf) = +inf */
2762     }
2763 
2764     assert(a.cls == float_class_normal);
2765 
2766     /* We need two overflow bits at the top. Adding room for that is a
2767      * right shift. If the exponent is odd, we can discard the low bit
2768      * by multiplying the fraction by 2; that's a left shift. Combine
2769      * those and we shift right if the exponent is even.
2770      */
2771     a_frac = a.frac;
2772     if (!(a.exp & 1)) {
2773         a_frac >>= 1;
2774     }
2775     a.exp >>= 1;
2776 
2777     /* Bit-by-bit computation of sqrt.  */
2778     r_frac = 0;
2779     s_frac = 0;
2780 
2781     /* Iterate from implicit bit down to the 3 extra bits to compute a
2782      * properly rounded result. Remember we've inserted one more bit
2783      * at the top, so these positions are one less.
2784      */
2785     bit = DECOMPOSED_BINARY_POINT - 1;
2786     last_bit = MAX(p->frac_shift - 4, 0);
2787     do {
2788         uint64_t q = 1ULL << bit;
2789         uint64_t t_frac = s_frac + q;
2790         if (t_frac <= a_frac) {
2791             s_frac = t_frac + q;
2792             a_frac -= t_frac;
2793             r_frac += q;
2794         }
2795         a_frac <<= 1;
2796     } while (--bit >= last_bit);
2797 
2798     /* Undo the right shift done above. If there is any remaining
2799      * fraction, the result is inexact. Set the sticky bit.
2800      */
2801     a.frac = (r_frac << 1) + (a_frac != 0);
2802 
2803     return a;
2804 }
2805 
2806 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
2807 {
2808     FloatParts pa = float16_unpack_canonical(a, status);
2809     FloatParts pr = sqrt_float(pa, status, &float16_params);
2810     return float16_round_pack_canonical(pr, status);
2811 }
2812 
2813 float32 QEMU_FLATTEN float32_sqrt(float32 a, float_status *status)
2814 {
2815     FloatParts pa = float32_unpack_canonical(a, status);
2816     FloatParts pr = sqrt_float(pa, status, &float32_params);
2817     return float32_round_pack_canonical(pr, status);
2818 }
2819 
2820 float64 QEMU_FLATTEN float64_sqrt(float64 a, float_status *status)
2821 {
2822     FloatParts pa = float64_unpack_canonical(a, status);
2823     FloatParts pr = sqrt_float(pa, status, &float64_params);
2824     return float64_round_pack_canonical(pr, status);
2825 }
2826 
2827 /*----------------------------------------------------------------------------
2828 | The pattern for a default generated NaN.
2829 *----------------------------------------------------------------------------*/
2830 
2831 float16 float16_default_nan(float_status *status)
2832 {
2833     FloatParts p = parts_default_nan(status);
2834     p.frac >>= float16_params.frac_shift;
2835     return float16_pack_raw(p);
2836 }
2837 
2838 float32 float32_default_nan(float_status *status)
2839 {
2840     FloatParts p = parts_default_nan(status);
2841     p.frac >>= float32_params.frac_shift;
2842     return float32_pack_raw(p);
2843 }
2844 
2845 float64 float64_default_nan(float_status *status)
2846 {
2847     FloatParts p = parts_default_nan(status);
2848     p.frac >>= float64_params.frac_shift;
2849     return float64_pack_raw(p);
2850 }
2851 
2852 float128 float128_default_nan(float_status *status)
2853 {
2854     FloatParts p = parts_default_nan(status);
2855     float128 r;
2856 
2857     /* Extrapolate from the choices made by parts_default_nan to fill
2858      * in the quad-floating format.  If the low bit is set, assume we
2859      * want to set all non-snan bits.
2860      */
2861     r.low = -(p.frac & 1);
2862     r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
2863     r.high |= LIT64(0x7FFF000000000000);
2864     r.high |= (uint64_t)p.sign << 63;
2865 
2866     return r;
2867 }
2868 
2869 /*----------------------------------------------------------------------------
2870 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
2871 *----------------------------------------------------------------------------*/
2872 
2873 float16 float16_silence_nan(float16 a, float_status *status)
2874 {
2875     FloatParts p = float16_unpack_raw(a);
2876     p.frac <<= float16_params.frac_shift;
2877     p = parts_silence_nan(p, status);
2878     p.frac >>= float16_params.frac_shift;
2879     return float16_pack_raw(p);
2880 }
2881 
2882 float32 float32_silence_nan(float32 a, float_status *status)
2883 {
2884     FloatParts p = float32_unpack_raw(a);
2885     p.frac <<= float32_params.frac_shift;
2886     p = parts_silence_nan(p, status);
2887     p.frac >>= float32_params.frac_shift;
2888     return float32_pack_raw(p);
2889 }
2890 
2891 float64 float64_silence_nan(float64 a, float_status *status)
2892 {
2893     FloatParts p = float64_unpack_raw(a);
2894     p.frac <<= float64_params.frac_shift;
2895     p = parts_silence_nan(p, status);
2896     p.frac >>= float64_params.frac_shift;
2897     return float64_pack_raw(p);
2898 }
2899 
2900 /*----------------------------------------------------------------------------
2901 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2902 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2903 | input.  If `zSign' is 1, the input is negated before being converted to an
2904 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
2905 | is simply rounded to an integer, with the inexact exception raised if the
2906 | input cannot be represented exactly as an integer.  However, if the fixed-
2907 | point input is too large, the invalid exception is raised and the largest
2908 | positive or negative integer is returned.
2909 *----------------------------------------------------------------------------*/
2910 
2911 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2912 {
2913     int8_t roundingMode;
2914     flag roundNearestEven;
2915     int8_t roundIncrement, roundBits;
2916     int32_t z;
2917 
2918     roundingMode = status->float_rounding_mode;
2919     roundNearestEven = ( roundingMode == float_round_nearest_even );
2920     switch (roundingMode) {
2921     case float_round_nearest_even:
2922     case float_round_ties_away:
2923         roundIncrement = 0x40;
2924         break;
2925     case float_round_to_zero:
2926         roundIncrement = 0;
2927         break;
2928     case float_round_up:
2929         roundIncrement = zSign ? 0 : 0x7f;
2930         break;
2931     case float_round_down:
2932         roundIncrement = zSign ? 0x7f : 0;
2933         break;
2934     default:
2935         abort();
2936     }
2937     roundBits = absZ & 0x7F;
2938     absZ = ( absZ + roundIncrement )>>7;
2939     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2940     z = absZ;
2941     if ( zSign ) z = - z;
2942     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2943         float_raise(float_flag_invalid, status);
2944         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2945     }
2946     if (roundBits) {
2947         status->float_exception_flags |= float_flag_inexact;
2948     }
2949     return z;
2950 
2951 }
2952 
2953 /*----------------------------------------------------------------------------
2954 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2955 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2956 | and returns the properly rounded 64-bit integer corresponding to the input.
2957 | If `zSign' is 1, the input is negated before being converted to an integer.
2958 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2959 | the inexact exception raised if the input cannot be represented exactly as
2960 | an integer.  However, if the fixed-point input is too large, the invalid
2961 | exception is raised and the largest positive or negative integer is
2962 | returned.
2963 *----------------------------------------------------------------------------*/
2964 
2965 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2966                                float_status *status)
2967 {
2968     int8_t roundingMode;
2969     flag roundNearestEven, increment;
2970     int64_t z;
2971 
2972     roundingMode = status->float_rounding_mode;
2973     roundNearestEven = ( roundingMode == float_round_nearest_even );
2974     switch (roundingMode) {
2975     case float_round_nearest_even:
2976     case float_round_ties_away:
2977         increment = ((int64_t) absZ1 < 0);
2978         break;
2979     case float_round_to_zero:
2980         increment = 0;
2981         break;
2982     case float_round_up:
2983         increment = !zSign && absZ1;
2984         break;
2985     case float_round_down:
2986         increment = zSign && absZ1;
2987         break;
2988     default:
2989         abort();
2990     }
2991     if ( increment ) {
2992         ++absZ0;
2993         if ( absZ0 == 0 ) goto overflow;
2994         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2995     }
2996     z = absZ0;
2997     if ( zSign ) z = - z;
2998     if ( z && ( ( z < 0 ) ^ zSign ) ) {
2999  overflow:
3000         float_raise(float_flag_invalid, status);
3001         return
3002               zSign ? (int64_t) LIT64( 0x8000000000000000 )
3003             : LIT64( 0x7FFFFFFFFFFFFFFF );
3004     }
3005     if (absZ1) {
3006         status->float_exception_flags |= float_flag_inexact;
3007     }
3008     return z;
3009 
3010 }
3011 
3012 /*----------------------------------------------------------------------------
3013 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3014 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3015 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3016 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
3017 | with the inexact exception raised if the input cannot be represented exactly
3018 | as an integer.  However, if the fixed-point input is too large, the invalid
3019 | exception is raised and the largest unsigned integer is returned.
3020 *----------------------------------------------------------------------------*/
3021 
3022 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
3023                                 uint64_t absZ1, float_status *status)
3024 {
3025     int8_t roundingMode;
3026     flag roundNearestEven, increment;
3027 
3028     roundingMode = status->float_rounding_mode;
3029     roundNearestEven = (roundingMode == float_round_nearest_even);
3030     switch (roundingMode) {
3031     case float_round_nearest_even:
3032     case float_round_ties_away:
3033         increment = ((int64_t)absZ1 < 0);
3034         break;
3035     case float_round_to_zero:
3036         increment = 0;
3037         break;
3038     case float_round_up:
3039         increment = !zSign && absZ1;
3040         break;
3041     case float_round_down:
3042         increment = zSign && absZ1;
3043         break;
3044     default:
3045         abort();
3046     }
3047     if (increment) {
3048         ++absZ0;
3049         if (absZ0 == 0) {
3050             float_raise(float_flag_invalid, status);
3051             return LIT64(0xFFFFFFFFFFFFFFFF);
3052         }
3053         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
3054     }
3055 
3056     if (zSign && absZ0) {
3057         float_raise(float_flag_invalid, status);
3058         return 0;
3059     }
3060 
3061     if (absZ1) {
3062         status->float_exception_flags |= float_flag_inexact;
3063     }
3064     return absZ0;
3065 }
3066 
3067 /*----------------------------------------------------------------------------
3068 | If `a' is denormal and we are in flush-to-zero mode then set the
3069 | input-denormal exception and return zero. Otherwise just return the value.
3070 *----------------------------------------------------------------------------*/
3071 float32 float32_squash_input_denormal(float32 a, float_status *status)
3072 {
3073     if (status->flush_inputs_to_zero) {
3074         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
3075             float_raise(float_flag_input_denormal, status);
3076             return make_float32(float32_val(a) & 0x80000000);
3077         }
3078     }
3079     return a;
3080 }
3081 
3082 /*----------------------------------------------------------------------------
3083 | Normalizes the subnormal single-precision floating-point value represented
3084 | by the denormalized significand `aSig'.  The normalized exponent and
3085 | significand are stored at the locations pointed to by `zExpPtr' and
3086 | `zSigPtr', respectively.
3087 *----------------------------------------------------------------------------*/
3088 
static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /* Left shift that brings the leading one of aSig up to bit 23
     * (8 leading zero bits remain).
     */
    const int8_t lshift = clz32(aSig) - 8;

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
3099 
3100 /*----------------------------------------------------------------------------
3101 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3102 | and significand `zSig', and returns the proper single-precision floating-
3103 | point value corresponding to the abstract input.  Ordinarily, the abstract
3104 | value is simply rounded and packed into the single-precision format, with
3105 | the inexact exception raised if the abstract input cannot be represented
3106 | exactly.  However, if the abstract value is too large, the overflow and
3107 | inexact exceptions are raised and an infinity or maximal finite value is
3108 | returned.  If the abstract value is too small, the input value is rounded to
3109 | a subnormal number, and the underflow and inexact exceptions are raised if
3110 | the abstract input cannot be represented exactly as a subnormal single-
3111 | precision floating-point number.
3112 |     The input significand `zSig' has its binary point between bits 30
3113 | and 29, which is 7 bits to the left of the usual location.  This shifted
3114 | significand must be normalized or smaller.  If `zSig' is not normalized,
3115 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3116 | and it must not require rounding.  In the usual case that `zSig' is
3117 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3118 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3119 | Binary Floating-Point Arithmetic.
3120 *----------------------------------------------------------------------------*/
3121 
3122 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3123                                    float_status *status)
3124 {
3125     int8_t roundingMode;
3126     flag roundNearestEven;
3127     int8_t roundIncrement, roundBits;
3128     flag isTiny;
3129 
3130     roundingMode = status->float_rounding_mode;
3131     roundNearestEven = ( roundingMode == float_round_nearest_even );
3132     switch (roundingMode) {
3133     case float_round_nearest_even:
3134     case float_round_ties_away:
3135         roundIncrement = 0x40;
3136         break;
3137     case float_round_to_zero:
3138         roundIncrement = 0;
3139         break;
3140     case float_round_up:
3141         roundIncrement = zSign ? 0 : 0x7f;
3142         break;
3143     case float_round_down:
3144         roundIncrement = zSign ? 0x7f : 0;
3145         break;
3146     default:
3147         abort();
3148         break;
3149     }
3150     roundBits = zSig & 0x7F;
3151     if ( 0xFD <= (uint16_t) zExp ) {
3152         if (    ( 0xFD < zExp )
3153              || (    ( zExp == 0xFD )
3154                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
3155            ) {
3156             float_raise(float_flag_overflow | float_flag_inexact, status);
3157             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
3158         }
3159         if ( zExp < 0 ) {
3160             if (status->flush_to_zero) {
3161                 float_raise(float_flag_output_denormal, status);
3162                 return packFloat32(zSign, 0, 0);
3163             }
3164             isTiny =
3165                 (status->float_detect_tininess
3166                  == float_tininess_before_rounding)
3167                 || ( zExp < -1 )
3168                 || ( zSig + roundIncrement < 0x80000000 );
3169             shift32RightJamming( zSig, - zExp, &zSig );
3170             zExp = 0;
3171             roundBits = zSig & 0x7F;
3172             if (isTiny && roundBits) {
3173                 float_raise(float_flag_underflow, status);
3174             }
3175         }
3176     }
3177     if (roundBits) {
3178         status->float_exception_flags |= float_flag_inexact;
3179     }
3180     zSig = ( zSig + roundIncrement )>>7;
3181     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3182     if ( zSig == 0 ) zExp = 0;
3183     return packFloat32( zSign, zExp, zSig );
3184 
3185 }
3186 
3187 /*----------------------------------------------------------------------------
3188 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3189 | and significand `zSig', and returns the proper single-precision floating-
3190 | point value corresponding to the abstract input.  This routine is just like
3191 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3192 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3193 | floating-point exponent.
3194 *----------------------------------------------------------------------------*/
3195 
3196 static float32
3197  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3198                               float_status *status)
3199 {
3200     int8_t shiftCount;
3201 
3202     shiftCount = clz32(zSig) - 1;
3203     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
3204                                status);
3205 
3206 }
3207 
3208 /*----------------------------------------------------------------------------
3209 | If `a' is denormal and we are in flush-to-zero mode then set the
3210 | input-denormal exception and return zero. Otherwise just return the value.
3211 *----------------------------------------------------------------------------*/
3212 float64 float64_squash_input_denormal(float64 a, float_status *status)
3213 {
3214     if (status->flush_inputs_to_zero) {
3215         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
3216             float_raise(float_flag_input_denormal, status);
3217             return make_float64(float64_val(a) & (1ULL << 63));
3218         }
3219     }
3220     return a;
3221 }
3222 
3223 /*----------------------------------------------------------------------------
3224 | Normalizes the subnormal double-precision floating-point value represented
3225 | by the denormalized significand `aSig'.  The normalized exponent and
3226 | significand are stored at the locations pointed to by `zExpPtr' and
3227 | `zSigPtr', respectively.
3228 *----------------------------------------------------------------------------*/
3229 
static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    /* Left shift that brings the leading one of aSig up to bit 52
     * (11 leading zero bits remain).
     */
    const int8_t lshift = clz64(aSig) - 11;

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
3240 
3241 /*----------------------------------------------------------------------------
3242 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3243 | double-precision floating-point value, returning the result.  After being
3244 | shifted into the proper positions, the three fields are simply added
3245 | together to form the result.  This means that any integer portion of `zSig'
3246 | will be added into the exponent.  Since a properly normalized significand
3247 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3248 | than the desired result exponent whenever `zSig' is a complete, normalized
3249 | significand.
3250 *----------------------------------------------------------------------------*/
3251 
3252 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
3253 {
3254 
3255     return make_float64(
3256         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
3257 
3258 }
3259 
3260 /*----------------------------------------------------------------------------
3261 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3262 | and significand `zSig', and returns the proper double-precision floating-
3263 | point value corresponding to the abstract input.  Ordinarily, the abstract
3264 | value is simply rounded and packed into the double-precision format, with
3265 | the inexact exception raised if the abstract input cannot be represented
3266 | exactly.  However, if the abstract value is too large, the overflow and
3267 | inexact exceptions are raised and an infinity or maximal finite value is
3268 | returned.  If the abstract value is too small, the input value is rounded to
3269 | a subnormal number, and the underflow and inexact exceptions are raised if
3270 | the abstract input cannot be represented exactly as a subnormal double-
3271 | precision floating-point number.
3272 |     The input significand `zSig' has its binary point between bits 62
3273 | and 61, which is 10 bits to the left of the usual location.  This shifted
3274 | significand must be normalized or smaller.  If `zSig' is not normalized,
3275 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3276 | and it must not require rounding.  In the usual case that `zSig' is
3277 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3278 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3279 | Binary Floating-Point Arithmetic.
3280 *----------------------------------------------------------------------------*/
3281 
3282 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3283                                    float_status *status)
3284 {
3285     int8_t roundingMode;
3286     flag roundNearestEven;
3287     int roundIncrement, roundBits;
3288     flag isTiny;
3289 
3290     roundingMode = status->float_rounding_mode;
3291     roundNearestEven = ( roundingMode == float_round_nearest_even );
3292     switch (roundingMode) {
3293     case float_round_nearest_even:
3294     case float_round_ties_away:
3295         roundIncrement = 0x200;
3296         break;
3297     case float_round_to_zero:
3298         roundIncrement = 0;
3299         break;
3300     case float_round_up:
3301         roundIncrement = zSign ? 0 : 0x3ff;
3302         break;
3303     case float_round_down:
3304         roundIncrement = zSign ? 0x3ff : 0;
3305         break;
3306     case float_round_to_odd:
3307         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3308         break;
3309     default:
3310         abort();
3311     }
3312     roundBits = zSig & 0x3FF;
3313     if ( 0x7FD <= (uint16_t) zExp ) {
3314         if (    ( 0x7FD < zExp )
3315              || (    ( zExp == 0x7FD )
3316                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
3317            ) {
3318             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3319                                    roundIncrement != 0;
3320             float_raise(float_flag_overflow | float_flag_inexact, status);
3321             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
3322         }
3323         if ( zExp < 0 ) {
3324             if (status->flush_to_zero) {
3325                 float_raise(float_flag_output_denormal, status);
3326                 return packFloat64(zSign, 0, 0);
3327             }
3328             isTiny =
3329                    (status->float_detect_tininess
3330                     == float_tininess_before_rounding)
3331                 || ( zExp < -1 )
3332                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
3333             shift64RightJamming( zSig, - zExp, &zSig );
3334             zExp = 0;
3335             roundBits = zSig & 0x3FF;
3336             if (isTiny && roundBits) {
3337                 float_raise(float_flag_underflow, status);
3338             }
3339             if (roundingMode == float_round_to_odd) {
3340                 /*
3341                  * For round-to-odd case, the roundIncrement depends on
3342                  * zSig which just changed.
3343                  */
3344                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3345             }
3346         }
3347     }
3348     if (roundBits) {
3349         status->float_exception_flags |= float_flag_inexact;
3350     }
3351     zSig = ( zSig + roundIncrement )>>10;
3352     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
3353     if ( zSig == 0 ) zExp = 0;
3354     return packFloat64( zSign, zExp, zSig );
3355 
3356 }
3357 
3358 /*----------------------------------------------------------------------------
3359 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3360 | and significand `zSig', and returns the proper double-precision floating-
3361 | point value corresponding to the abstract input.  This routine is just like
3362 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3363 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3364 | floating-point exponent.
3365 *----------------------------------------------------------------------------*/
3366 
3367 static float64
3368  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3369                               float_status *status)
3370 {
3371     int8_t shiftCount;
3372 
3373     shiftCount = clz64(zSig) - 1;
3374     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
3375                                status);
3376 
3377 }
3378 
/*----------------------------------------------------------------------------
| Normalizes the denormalized significand `aSig' of a subnormal extended
| double-precision floating-point value.  The significand is shifted left by
| the count returned from clz64(); the shifted significand is written through
| `zSigPtr' and the matching exponent (1 minus the shift count) is written
| through `zExpPtr'.
*----------------------------------------------------------------------------*/

void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    const int8_t leadingZeros = clz64(aSig);

    *zExpPtr = 1 - leadingZeros;
    *zSigPtr = aSig << leadingZeros;
}
3395 
3396 /*----------------------------------------------------------------------------
3397 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3398 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3399 | and returns the proper extended double-precision floating-point value
3400 | corresponding to the abstract input.  Ordinarily, the abstract value is
3401 | rounded and packed into the extended double-precision format, with the
3402 | inexact exception raised if the abstract input cannot be represented
3403 | exactly.  However, if the abstract value is too large, the overflow and
3404 | inexact exceptions are raised and an infinity or maximal finite value is
3405 | returned.  If the abstract value is too small, the input value is rounded to
3406 | a subnormal number, and the underflow and inexact exceptions are raised if
3407 | the abstract input cannot be represented exactly as a subnormal extended
3408 | double-precision floating-point number.
3409 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
3410 | number of bits as single or double precision, respectively.  Otherwise, the
3411 | result is rounded to the full precision of the extended double-precision
3412 | format.
3413 |     The input significand must be normalized or smaller.  If the input
3414 | significand is not normalized, `zExp' must be 0; in that case, the result
3415 | returned is a subnormal number, and it must not require rounding.  The
3416 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3417 | Floating-Point Arithmetic.
3418 *----------------------------------------------------------------------------*/
3419 
3420 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
3421                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
3422                               float_status *status)
3423 {
3424     int8_t roundingMode;
3425     flag roundNearestEven, increment, isTiny;
3426     int64_t roundIncrement, roundMask, roundBits;
3427 
3428     roundingMode = status->float_rounding_mode;
3429     roundNearestEven = ( roundingMode == float_round_nearest_even );
3430     if ( roundingPrecision == 80 ) goto precision80;
3431     if ( roundingPrecision == 64 ) {
3432         roundIncrement = LIT64( 0x0000000000000400 );
3433         roundMask = LIT64( 0x00000000000007FF );
3434     }
3435     else if ( roundingPrecision == 32 ) {
3436         roundIncrement = LIT64( 0x0000008000000000 );
3437         roundMask = LIT64( 0x000000FFFFFFFFFF );
3438     }
3439     else {
3440         goto precision80;
3441     }
3442     zSig0 |= ( zSig1 != 0 );
3443     switch (roundingMode) {
3444     case float_round_nearest_even:
3445     case float_round_ties_away:
3446         break;
3447     case float_round_to_zero:
3448         roundIncrement = 0;
3449         break;
3450     case float_round_up:
3451         roundIncrement = zSign ? 0 : roundMask;
3452         break;
3453     case float_round_down:
3454         roundIncrement = zSign ? roundMask : 0;
3455         break;
3456     default:
3457         abort();
3458     }
3459     roundBits = zSig0 & roundMask;
3460     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3461         if (    ( 0x7FFE < zExp )
3462              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
3463            ) {
3464             goto overflow;
3465         }
3466         if ( zExp <= 0 ) {
3467             if (status->flush_to_zero) {
3468                 float_raise(float_flag_output_denormal, status);
3469                 return packFloatx80(zSign, 0, 0);
3470             }
3471             isTiny =
3472                    (status->float_detect_tininess
3473                     == float_tininess_before_rounding)
3474                 || ( zExp < 0 )
3475                 || ( zSig0 <= zSig0 + roundIncrement );
3476             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
3477             zExp = 0;
3478             roundBits = zSig0 & roundMask;
3479             if (isTiny && roundBits) {
3480                 float_raise(float_flag_underflow, status);
3481             }
3482             if (roundBits) {
3483                 status->float_exception_flags |= float_flag_inexact;
3484             }
3485             zSig0 += roundIncrement;
3486             if ( (int64_t) zSig0 < 0 ) zExp = 1;
3487             roundIncrement = roundMask + 1;
3488             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3489                 roundMask |= roundIncrement;
3490             }
3491             zSig0 &= ~ roundMask;
3492             return packFloatx80( zSign, zExp, zSig0 );
3493         }
3494     }
3495     if (roundBits) {
3496         status->float_exception_flags |= float_flag_inexact;
3497     }
3498     zSig0 += roundIncrement;
3499     if ( zSig0 < roundIncrement ) {
3500         ++zExp;
3501         zSig0 = LIT64( 0x8000000000000000 );
3502     }
3503     roundIncrement = roundMask + 1;
3504     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3505         roundMask |= roundIncrement;
3506     }
3507     zSig0 &= ~ roundMask;
3508     if ( zSig0 == 0 ) zExp = 0;
3509     return packFloatx80( zSign, zExp, zSig0 );
3510  precision80:
3511     switch (roundingMode) {
3512     case float_round_nearest_even:
3513     case float_round_ties_away:
3514         increment = ((int64_t)zSig1 < 0);
3515         break;
3516     case float_round_to_zero:
3517         increment = 0;
3518         break;
3519     case float_round_up:
3520         increment = !zSign && zSig1;
3521         break;
3522     case float_round_down:
3523         increment = zSign && zSig1;
3524         break;
3525     default:
3526         abort();
3527     }
3528     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3529         if (    ( 0x7FFE < zExp )
3530              || (    ( zExp == 0x7FFE )
3531                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
3532                   && increment
3533                 )
3534            ) {
3535             roundMask = 0;
3536  overflow:
3537             float_raise(float_flag_overflow | float_flag_inexact, status);
3538             if (    ( roundingMode == float_round_to_zero )
3539                  || ( zSign && ( roundingMode == float_round_up ) )
3540                  || ( ! zSign && ( roundingMode == float_round_down ) )
3541                ) {
3542                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
3543             }
3544             return packFloatx80(zSign,
3545                                 floatx80_infinity_high,
3546                                 floatx80_infinity_low);
3547         }
3548         if ( zExp <= 0 ) {
3549             isTiny =
3550                    (status->float_detect_tininess
3551                     == float_tininess_before_rounding)
3552                 || ( zExp < 0 )
3553                 || ! increment
3554                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
3555             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
3556             zExp = 0;
3557             if (isTiny && zSig1) {
3558                 float_raise(float_flag_underflow, status);
3559             }
3560             if (zSig1) {
3561                 status->float_exception_flags |= float_flag_inexact;
3562             }
3563             switch (roundingMode) {
3564             case float_round_nearest_even:
3565             case float_round_ties_away:
3566                 increment = ((int64_t)zSig1 < 0);
3567                 break;
3568             case float_round_to_zero:
3569                 increment = 0;
3570                 break;
3571             case float_round_up:
3572                 increment = !zSign && zSig1;
3573                 break;
3574             case float_round_down:
3575                 increment = zSign && zSig1;
3576                 break;
3577             default:
3578                 abort();
3579             }
3580             if ( increment ) {
3581                 ++zSig0;
3582                 zSig0 &=
3583                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3584                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
3585             }
3586             return packFloatx80( zSign, zExp, zSig0 );
3587         }
3588     }
3589     if (zSig1) {
3590         status->float_exception_flags |= float_flag_inexact;
3591     }
3592     if ( increment ) {
3593         ++zSig0;
3594         if ( zSig0 == 0 ) {
3595             ++zExp;
3596             zSig0 = LIT64( 0x8000000000000000 );
3597         }
3598         else {
3599             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3600         }
3601     }
3602     else {
3603         if ( zSig0 == 0 ) zExp = 0;
3604     }
3605     return packFloatx80( zSign, zExp, zSig0 );
3606 
3607 }
3608 
3609 /*----------------------------------------------------------------------------
3610 | Takes an abstract floating-point value having sign `zSign', exponent
3611 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
3612 | and returns the proper extended double-precision floating-point value
3613 | corresponding to the abstract input.  This routine is just like
3614 | `roundAndPackFloatx80' except that the input significand does not have to be
3615 | normalized.
3616 *----------------------------------------------------------------------------*/
3617 
3618 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
3619                                        flag zSign, int32_t zExp,
3620                                        uint64_t zSig0, uint64_t zSig1,
3621                                        float_status *status)
3622 {
3623     int8_t shiftCount;
3624 
3625     if ( zSig0 == 0 ) {
3626         zSig0 = zSig1;
3627         zSig1 = 0;
3628         zExp -= 64;
3629     }
3630     shiftCount = clz64(zSig0);
3631     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3632     zExp -= shiftCount;
3633     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
3634                                 zSig0, zSig1, status);
3635 
3636 }
3637 
3638 /*----------------------------------------------------------------------------
3639 | Returns the least-significant 64 fraction bits of the quadruple-precision
3640 | floating-point value `a'.
3641 *----------------------------------------------------------------------------*/
3642 
3643 static inline uint64_t extractFloat128Frac1( float128 a )
3644 {
3645 
3646     return a.low;
3647 
3648 }
3649 
3650 /*----------------------------------------------------------------------------
3651 | Returns the most-significant 48 fraction bits of the quadruple-precision
3652 | floating-point value `a'.
3653 *----------------------------------------------------------------------------*/
3654 
3655 static inline uint64_t extractFloat128Frac0( float128 a )
3656 {
3657 
3658     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
3659 
3660 }
3661 
3662 /*----------------------------------------------------------------------------
3663 | Returns the exponent bits of the quadruple-precision floating-point value
3664 | `a'.
3665 *----------------------------------------------------------------------------*/
3666 
3667 static inline int32_t extractFloat128Exp( float128 a )
3668 {
3669 
3670     return ( a.high>>48 ) & 0x7FFF;
3671 
3672 }
3673 
3674 /*----------------------------------------------------------------------------
3675 | Returns the sign bit of the quadruple-precision floating-point value `a'.
3676 *----------------------------------------------------------------------------*/
3677 
3678 static inline flag extractFloat128Sign( float128 a )
3679 {
3680 
3681     return a.high>>63;
3682 
3683 }
3684 
/*----------------------------------------------------------------------------
| Normalizes the subnormal quadruple-precision floating-point value
| represented by the denormalized significand formed by the concatenation of
| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
| significand are stored at the location pointed to by `zSig0Ptr', and the
| least significant 64 bits of the normalized significand are stored at the
| location pointed to by `zSig1Ptr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t shift;

    if (aSig0 != 0) {
        /* The upper word is non-zero: a plain 128-bit left shift. */
        shift = clz64(aSig0) - 15;
        *zExpPtr = 1 - shift;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        return;
    }

    /* Only the lower word is populated; it supplies both output words. */
    shift = clz64(aSig1) - 15;
    *zExpPtr = -shift - 63;
    if (shift >= 0) {
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    } else {
        /* Negative shift: the lower word straddles both output words. */
        *zSig0Ptr = aSig1 >> (-shift);
        *zSig1Ptr = aSig1 << (shift & 63);
    }
}
3725 
3726 /*----------------------------------------------------------------------------
3727 | Packs the sign `zSign', the exponent `zExp', and the significand formed
3728 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
3729 | floating-point value, returning the result.  After being shifted into the
3730 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
3731 | added together to form the most significant 32 bits of the result.  This
3732 | means that any integer portion of `zSig0' will be added into the exponent.
3733 | Since a properly normalized significand will have an integer portion equal
3734 | to 1, the `zExp' input should be 1 less than the desired result exponent
3735 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
3736 | significand.
3737 *----------------------------------------------------------------------------*/
3738 
3739 static inline float128
3740  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
3741 {
3742     float128 z;
3743 
3744     z.low = zSig1;
3745     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
3746     return z;
3747 
3748 }
3749 
3750 /*----------------------------------------------------------------------------
3751 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3752 | and extended significand formed by the concatenation of `zSig0', `zSig1',
3753 | and `zSig2', and returns the proper quadruple-precision floating-point value
3754 | corresponding to the abstract input.  Ordinarily, the abstract value is
3755 | simply rounded and packed into the quadruple-precision format, with the
3756 | inexact exception raised if the abstract input cannot be represented
3757 | exactly.  However, if the abstract value is too large, the overflow and
3758 | inexact exceptions are raised and an infinity or maximal finite value is
3759 | returned.  If the abstract value is too small, the input value is rounded to
3760 | a subnormal number, and the underflow and inexact exceptions are raised if
3761 | the abstract input cannot be represented exactly as a subnormal quadruple-
3762 | precision floating-point number.
3763 |     The input significand must be normalized or smaller.  If the input
3764 | significand is not normalized, `zExp' must be 0; in that case, the result
3765 | returned is a subnormal number, and it must not require rounding.  In the
3766 | usual case that the input significand is normalized, `zExp' must be 1 less
3767 | than the ``true'' floating-point exponent.  The handling of underflow and
3768 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3769 *----------------------------------------------------------------------------*/
3770 
3771 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
3772                                      uint64_t zSig0, uint64_t zSig1,
3773                                      uint64_t zSig2, float_status *status)
3774 {
3775     int8_t roundingMode;
3776     flag roundNearestEven, increment, isTiny;
3777 
3778     roundingMode = status->float_rounding_mode;
3779     roundNearestEven = ( roundingMode == float_round_nearest_even );
3780     switch (roundingMode) {
3781     case float_round_nearest_even:
3782     case float_round_ties_away:
3783         increment = ((int64_t)zSig2 < 0);
3784         break;
3785     case float_round_to_zero:
3786         increment = 0;
3787         break;
3788     case float_round_up:
3789         increment = !zSign && zSig2;
3790         break;
3791     case float_round_down:
3792         increment = zSign && zSig2;
3793         break;
3794     case float_round_to_odd:
3795         increment = !(zSig1 & 0x1) && zSig2;
3796         break;
3797     default:
3798         abort();
3799     }
3800     if ( 0x7FFD <= (uint32_t) zExp ) {
3801         if (    ( 0x7FFD < zExp )
3802              || (    ( zExp == 0x7FFD )
3803                   && eq128(
3804                          LIT64( 0x0001FFFFFFFFFFFF ),
3805                          LIT64( 0xFFFFFFFFFFFFFFFF ),
3806                          zSig0,
3807                          zSig1
3808                      )
3809                   && increment
3810                 )
3811            ) {
3812             float_raise(float_flag_overflow | float_flag_inexact, status);
3813             if (    ( roundingMode == float_round_to_zero )
3814                  || ( zSign && ( roundingMode == float_round_up ) )
3815                  || ( ! zSign && ( roundingMode == float_round_down ) )
3816                  || (roundingMode == float_round_to_odd)
3817                ) {
3818                 return
3819                     packFloat128(
3820                         zSign,
3821                         0x7FFE,
3822                         LIT64( 0x0000FFFFFFFFFFFF ),
3823                         LIT64( 0xFFFFFFFFFFFFFFFF )
3824                     );
3825             }
3826             return packFloat128( zSign, 0x7FFF, 0, 0 );
3827         }
3828         if ( zExp < 0 ) {
3829             if (status->flush_to_zero) {
3830                 float_raise(float_flag_output_denormal, status);
3831                 return packFloat128(zSign, 0, 0, 0);
3832             }
3833             isTiny =
3834                    (status->float_detect_tininess
3835                     == float_tininess_before_rounding)
3836                 || ( zExp < -1 )
3837                 || ! increment
3838                 || lt128(
3839                        zSig0,
3840                        zSig1,
3841                        LIT64( 0x0001FFFFFFFFFFFF ),
3842                        LIT64( 0xFFFFFFFFFFFFFFFF )
3843                    );
3844             shift128ExtraRightJamming(
3845                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
3846             zExp = 0;
3847             if (isTiny && zSig2) {
3848                 float_raise(float_flag_underflow, status);
3849             }
3850             switch (roundingMode) {
3851             case float_round_nearest_even:
3852             case float_round_ties_away:
3853                 increment = ((int64_t)zSig2 < 0);
3854                 break;
3855             case float_round_to_zero:
3856                 increment = 0;
3857                 break;
3858             case float_round_up:
3859                 increment = !zSign && zSig2;
3860                 break;
3861             case float_round_down:
3862                 increment = zSign && zSig2;
3863                 break;
3864             case float_round_to_odd:
3865                 increment = !(zSig1 & 0x1) && zSig2;
3866                 break;
3867             default:
3868                 abort();
3869             }
3870         }
3871     }
3872     if (zSig2) {
3873         status->float_exception_flags |= float_flag_inexact;
3874     }
3875     if ( increment ) {
3876         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
3877         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
3878     }
3879     else {
3880         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
3881     }
3882     return packFloat128( zSign, zExp, zSig0, zSig1 );
3883 
3884 }
3885 
3886 /*----------------------------------------------------------------------------
3887 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3888 | and significand formed by the concatenation of `zSig0' and `zSig1', and
3889 | returns the proper quadruple-precision floating-point value corresponding
3890 | to the abstract input.  This routine is just like `roundAndPackFloat128'
3891 | except that the input significand has fewer bits and does not have to be
3892 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
3893 | point exponent.
3894 *----------------------------------------------------------------------------*/
3895 
3896 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3897                                               uint64_t zSig0, uint64_t zSig1,
3898                                               float_status *status)
3899 {
3900     int8_t shiftCount;
3901     uint64_t zSig2;
3902 
3903     if ( zSig0 == 0 ) {
3904         zSig0 = zSig1;
3905         zSig1 = 0;
3906         zExp -= 64;
3907     }
3908     shiftCount = clz64(zSig0) - 15;
3909     if ( 0 <= shiftCount ) {
3910         zSig2 = 0;
3911         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3912     }
3913     else {
3914         shift128ExtraRightJamming(
3915             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3916     }
3917     zExp -= shiftCount;
3918     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3919 
3920 }
3921 
3922 
3923 /*----------------------------------------------------------------------------
3924 | Returns the result of converting the 32-bit two's complement integer `a'
3925 | to the extended double-precision floating-point format.  The conversion
3926 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3927 | Arithmetic.
3928 *----------------------------------------------------------------------------*/
3929 
3930 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3931 {
3932     flag zSign;
3933     uint32_t absA;
3934     int8_t shiftCount;
3935     uint64_t zSig;
3936 
3937     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3938     zSign = ( a < 0 );
3939     absA = zSign ? - a : a;
3940     shiftCount = clz32(absA) + 32;
3941     zSig = absA;
3942     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3943 
3944 }
3945 
3946 /*----------------------------------------------------------------------------
3947 | Returns the result of converting the 32-bit two's complement integer `a' to
3948 | the quadruple-precision floating-point format.  The conversion is performed
3949 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3950 *----------------------------------------------------------------------------*/
3951 
3952 float128 int32_to_float128(int32_t a, float_status *status)
3953 {
3954     flag zSign;
3955     uint32_t absA;
3956     int8_t shiftCount;
3957     uint64_t zSig0;
3958 
3959     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3960     zSign = ( a < 0 );
3961     absA = zSign ? - a : a;
3962     shiftCount = clz32(absA) + 17;
3963     zSig0 = absA;
3964     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3965 
3966 }
3967 
3968 /*----------------------------------------------------------------------------
3969 | Returns the result of converting the 64-bit two's complement integer `a'
3970 | to the extended double-precision floating-point format.  The conversion
3971 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3972 | Arithmetic.
3973 *----------------------------------------------------------------------------*/
3974 
3975 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3976 {
3977     flag zSign;
3978     uint64_t absA;
3979     int8_t shiftCount;
3980 
3981     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3982     zSign = ( a < 0 );
3983     absA = zSign ? - a : a;
3984     shiftCount = clz64(absA);
3985     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3986 
3987 }
3988 
3989 /*----------------------------------------------------------------------------
3990 | Returns the result of converting the 64-bit two's complement integer `a' to
3991 | the quadruple-precision floating-point format.  The conversion is performed
3992 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3993 *----------------------------------------------------------------------------*/
3994 
3995 float128 int64_to_float128(int64_t a, float_status *status)
3996 {
3997     flag zSign;
3998     uint64_t absA;
3999     int8_t shiftCount;
4000     int32_t zExp;
4001     uint64_t zSig0, zSig1;
4002 
4003     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4004     zSign = ( a < 0 );
4005     absA = zSign ? - a : a;
4006     shiftCount = clz64(absA) + 49;
4007     zExp = 0x406E - shiftCount;
4008     if ( 64 <= shiftCount ) {
4009         zSig1 = 0;
4010         zSig0 = absA;
4011         shiftCount -= 64;
4012     }
4013     else {
4014         zSig1 = absA;
4015         zSig0 = 0;
4016     }
4017     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4018     return packFloat128( zSign, zExp, zSig0, zSig1 );
4019 
4020 }
4021 
4022 /*----------------------------------------------------------------------------
4023 | Returns the result of converting the 64-bit unsigned integer `a'
4024 | to the quadruple-precision floating-point format.  The conversion is performed
4025 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4026 *----------------------------------------------------------------------------*/
4027 
4028 float128 uint64_to_float128(uint64_t a, float_status *status)
4029 {
4030     if (a == 0) {
4031         return float128_zero;
4032     }
4033     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4034 }
4035 
4036 /*----------------------------------------------------------------------------
4037 | Returns the result of converting the single-precision floating-point value
4038 | `a' to the extended double-precision floating-point format.  The conversion
4039 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4040 | Arithmetic.
4041 *----------------------------------------------------------------------------*/
4042 
4043 floatx80 float32_to_floatx80(float32 a, float_status *status)
4044 {
4045     flag aSign;
4046     int aExp;
4047     uint32_t aSig;
4048 
4049     a = float32_squash_input_denormal(a, status);
4050     aSig = extractFloat32Frac( a );
4051     aExp = extractFloat32Exp( a );
4052     aSign = extractFloat32Sign( a );
4053     if ( aExp == 0xFF ) {
4054         if (aSig) {
4055             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
4056         }
4057         return packFloatx80(aSign,
4058                             floatx80_infinity_high,
4059                             floatx80_infinity_low);
4060     }
4061     if ( aExp == 0 ) {
4062         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4063         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4064     }
4065     aSig |= 0x00800000;
4066     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4067 
4068 }
4069 
4070 /*----------------------------------------------------------------------------
4071 | Returns the result of converting the single-precision floating-point value
4072 | `a' to the double-precision floating-point format.  The conversion is
4073 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4074 | Arithmetic.
4075 *----------------------------------------------------------------------------*/
4076 
4077 float128 float32_to_float128(float32 a, float_status *status)
4078 {
4079     flag aSign;
4080     int aExp;
4081     uint32_t aSig;
4082 
4083     a = float32_squash_input_denormal(a, status);
4084     aSig = extractFloat32Frac( a );
4085     aExp = extractFloat32Exp( a );
4086     aSign = extractFloat32Sign( a );
4087     if ( aExp == 0xFF ) {
4088         if (aSig) {
4089             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4090         }
4091         return packFloat128( aSign, 0x7FFF, 0, 0 );
4092     }
4093     if ( aExp == 0 ) {
4094         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4095         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4096         --aExp;
4097     }
4098     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4099 
4100 }
4101 
/*----------------------------------------------------------------------------
| Returns the remainder of the single-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases handled explicitly below:
|   - either operand NaN: the NaN is propagated via propagateFloat32NaN();
|   - `a' infinite, or `b' zero: invalid is raised and the default NaN is
|     returned;
|   - `b' infinite, or `a' zero: `a' is returned unchanged.
| Both operands first pass through float32_squash_input_denormal().
*----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    flag aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    /* `a' is an infinity or a NaN. */
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        /* Infinite dividend with a non-NaN divisor. */
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    /* `b' is an infinity or a NaN; `a' is finite here. */
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        /* Zero divisor. */
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        /* Zero dividend is its own remainder. */
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the leading significand bit explicit in both operands. */
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* aExp <= bExp - 2, so |a| < |b|/2 and `a' is already the result. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            /* A single 64-by-32-bit division covers the exponent gap. */
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /*
         * Large exponent gap: reduce in steps (expDiff drops by 62 per pass)
         * using estimateDiv128To64().  Each estimate is lowered by 2 and
         * clamped at 0 -- presumably so it never exceeds the true quotient;
         * the loop after this block performs the final corrections.
         */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /*
     * Keep subtracting bSig until the running remainder turns negative.
     * On exit aSig is the first negative candidate and alternateASig the
     * last non-negative one; q counts the subtractions.
     */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    /*
     * Pick the candidate of smaller magnitude.  On an exact tie
     * (sigMean == 0) take the non-negative one when q is odd, so that the
     * quotient actually used is even.
     */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* A negative remainder is stored as its magnitude with the sign flipped. */
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}
4203 
4204 
4205 
4206 /*----------------------------------------------------------------------------
4207 | Returns the binary exponential of the single-precision floating-point value
4208 | `a'. The operation is performed according to the IEC/IEEE Standard for
4209 | Binary Floating-Point Arithmetic.
4210 |
4211 | Uses the following identities:
4212 |
4213 | 1. -------------------------------------------------------------------------
4214 |      x    x*ln(2)
4215 |     2  = e
4216 |
4217 | 2. -------------------------------------------------------------------------
4218 |                      2     3     4     5           n
4219 |      x        x     x     x     x     x           x
4220 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
4221 |               1!    2!    3!    4!    5!          n!
4222 *----------------------------------------------------------------------------*/
4223 
/*
 * Taylor-series coefficients used by float32_exp2().  The entry labelled n
 * in the trailing comments (array index n - 1) is 1/n!, written as a 64-bit
 * hexadecimal literal: 0x3ff0000000000000 is the IEEE double-precision
 * encoding of 1.0, 0x3fe0000000000000 of 0.5, and so on.
 * NOTE(review): const_float64() is presumed to wrap the raw bit pattern
 * without conversion -- confirm against its definition.
 */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /*  1 */
    const_float64( 0x3fe0000000000000ll ), /*  2 */
    const_float64( 0x3fc5555555555555ll ), /*  3 */
    const_float64( 0x3fa5555555555555ll ), /*  4 */
    const_float64( 0x3f81111111111111ll ), /*  5 */
    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
    const_float64( 0x3efa01a01a01a01all ), /*  8 */
    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
};
4242 
4243 float32 float32_exp2(float32 a, float_status *status)
4244 {
4245     flag aSign;
4246     int aExp;
4247     uint32_t aSig;
4248     float64 r, x, xn;
4249     int i;
4250     a = float32_squash_input_denormal(a, status);
4251 
4252     aSig = extractFloat32Frac( a );
4253     aExp = extractFloat32Exp( a );
4254     aSign = extractFloat32Sign( a );
4255 
4256     if ( aExp == 0xFF) {
4257         if (aSig) {
4258             return propagateFloat32NaN(a, float32_zero, status);
4259         }
4260         return (aSign) ? float32_zero : a;
4261     }
4262     if (aExp == 0) {
4263         if (aSig == 0) return float32_one;
4264     }
4265 
4266     float_raise(float_flag_inexact, status);
4267 
4268     /* ******************************* */
4269     /* using float64 for approximation */
4270     /* ******************************* */
4271     x = float32_to_float64(a, status);
4272     x = float64_mul(x, float64_ln2, status);
4273 
4274     xn = x;
4275     r = float64_one;
4276     for (i = 0 ; i < 15 ; i++) {
4277         float64 f;
4278 
4279         f = float64_mul(xn, float32_exp2_coefficients[i], status);
4280         r = float64_add(r, f, status);
4281 
4282         xn = float64_mul(xn, x, status);
4283     }
4284 
4285     return float64_to_float32(r, status);
4286 }
4287 
4288 /*----------------------------------------------------------------------------
4289 | Returns the binary log of the single-precision floating-point value `a'.
4290 | The operation is performed according to the IEC/IEEE Standard for Binary
4291 | Floating-Point Arithmetic.
4292 *----------------------------------------------------------------------------*/
4293 float32 float32_log2(float32 a, float_status *status)
4294 {
4295     flag aSign, zSign;
4296     int aExp;
4297     uint32_t aSig, zSig, i;
4298 
4299     a = float32_squash_input_denormal(a, status);
4300     aSig = extractFloat32Frac( a );
4301     aExp = extractFloat32Exp( a );
4302     aSign = extractFloat32Sign( a );
4303 
4304     if ( aExp == 0 ) {
4305         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
4306         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4307     }
4308     if ( aSign ) {
4309         float_raise(float_flag_invalid, status);
4310         return float32_default_nan(status);
4311     }
4312     if ( aExp == 0xFF ) {
4313         if (aSig) {
4314             return propagateFloat32NaN(a, float32_zero, status);
4315         }
4316         return a;
4317     }
4318 
4319     aExp -= 0x7F;
4320     aSig |= 0x00800000;
4321     zSign = aExp < 0;
4322     zSig = aExp << 23;
4323 
4324     for (i = 1 << 22; i > 0; i >>= 1) {
4325         aSig = ( (uint64_t)aSig * aSig ) >> 23;
4326         if ( aSig & 0x01000000 ) {
4327             aSig >>= 1;
4328             zSig |= i;
4329         }
4330     }
4331 
4332     if ( zSign )
4333         zSig = -zSig;
4334 
4335     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
4336 }
4337 
4338 /*----------------------------------------------------------------------------
4339 | Returns 1 if the single-precision floating-point value `a' is equal to
4340 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4341 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4342 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4343 *----------------------------------------------------------------------------*/
4344 
4345 int float32_eq(float32 a, float32 b, float_status *status)
4346 {
4347     uint32_t av, bv;
4348     a = float32_squash_input_denormal(a, status);
4349     b = float32_squash_input_denormal(b, status);
4350 
4351     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4352          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4353        ) {
4354         float_raise(float_flag_invalid, status);
4355         return 0;
4356     }
4357     av = float32_val(a);
4358     bv = float32_val(b);
4359     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4360 }
4361 
4362 /*----------------------------------------------------------------------------
4363 | Returns 1 if the single-precision floating-point value `a' is less than
4364 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
4365 | exception is raised if either operand is a NaN.  The comparison is performed
4366 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4367 *----------------------------------------------------------------------------*/
4368 
4369 int float32_le(float32 a, float32 b, float_status *status)
4370 {
4371     flag aSign, bSign;
4372     uint32_t av, bv;
4373     a = float32_squash_input_denormal(a, status);
4374     b = float32_squash_input_denormal(b, status);
4375 
4376     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4377          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4378        ) {
4379         float_raise(float_flag_invalid, status);
4380         return 0;
4381     }
4382     aSign = extractFloat32Sign( a );
4383     bSign = extractFloat32Sign( b );
4384     av = float32_val(a);
4385     bv = float32_val(b);
4386     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4387     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4388 
4389 }
4390 
4391 /*----------------------------------------------------------------------------
4392 | Returns 1 if the single-precision floating-point value `a' is less than
4393 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4394 | raised if either operand is a NaN.  The comparison is performed according
4395 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4396 *----------------------------------------------------------------------------*/
4397 
4398 int float32_lt(float32 a, float32 b, float_status *status)
4399 {
4400     flag aSign, bSign;
4401     uint32_t av, bv;
4402     a = float32_squash_input_denormal(a, status);
4403     b = float32_squash_input_denormal(b, status);
4404 
4405     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4406          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4407        ) {
4408         float_raise(float_flag_invalid, status);
4409         return 0;
4410     }
4411     aSign = extractFloat32Sign( a );
4412     bSign = extractFloat32Sign( b );
4413     av = float32_val(a);
4414     bv = float32_val(b);
4415     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4416     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4417 
4418 }
4419 
4420 /*----------------------------------------------------------------------------
4421 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4422 | be compared, and 0 otherwise.  The invalid exception is raised if either
4423 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4424 | Standard for Binary Floating-Point Arithmetic.
4425 *----------------------------------------------------------------------------*/
4426 
4427 int float32_unordered(float32 a, float32 b, float_status *status)
4428 {
4429     a = float32_squash_input_denormal(a, status);
4430     b = float32_squash_input_denormal(b, status);
4431 
4432     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4433          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4434        ) {
4435         float_raise(float_flag_invalid, status);
4436         return 1;
4437     }
4438     return 0;
4439 }
4440 
4441 /*----------------------------------------------------------------------------
4442 | Returns 1 if the single-precision floating-point value `a' is equal to
4443 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4444 | exception.  The comparison is performed according to the IEC/IEEE Standard
4445 | for Binary Floating-Point Arithmetic.
4446 *----------------------------------------------------------------------------*/
4447 
4448 int float32_eq_quiet(float32 a, float32 b, float_status *status)
4449 {
4450     a = float32_squash_input_denormal(a, status);
4451     b = float32_squash_input_denormal(b, status);
4452 
4453     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4454          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4455        ) {
4456         if (float32_is_signaling_nan(a, status)
4457          || float32_is_signaling_nan(b, status)) {
4458             float_raise(float_flag_invalid, status);
4459         }
4460         return 0;
4461     }
4462     return ( float32_val(a) == float32_val(b) ) ||
4463             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
4464 }
4465 
4466 /*----------------------------------------------------------------------------
4467 | Returns 1 if the single-precision floating-point value `a' is less than or
4468 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4469 | cause an exception.  Otherwise, the comparison is performed according to the
4470 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4471 *----------------------------------------------------------------------------*/
4472 
4473 int float32_le_quiet(float32 a, float32 b, float_status *status)
4474 {
4475     flag aSign, bSign;
4476     uint32_t av, bv;
4477     a = float32_squash_input_denormal(a, status);
4478     b = float32_squash_input_denormal(b, status);
4479 
4480     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4481          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4482        ) {
4483         if (float32_is_signaling_nan(a, status)
4484          || float32_is_signaling_nan(b, status)) {
4485             float_raise(float_flag_invalid, status);
4486         }
4487         return 0;
4488     }
4489     aSign = extractFloat32Sign( a );
4490     bSign = extractFloat32Sign( b );
4491     av = float32_val(a);
4492     bv = float32_val(b);
4493     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4494     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4495 
4496 }
4497 
4498 /*----------------------------------------------------------------------------
4499 | Returns 1 if the single-precision floating-point value `a' is less than
4500 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4501 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4502 | Standard for Binary Floating-Point Arithmetic.
4503 *----------------------------------------------------------------------------*/
4504 
4505 int float32_lt_quiet(float32 a, float32 b, float_status *status)
4506 {
4507     flag aSign, bSign;
4508     uint32_t av, bv;
4509     a = float32_squash_input_denormal(a, status);
4510     b = float32_squash_input_denormal(b, status);
4511 
4512     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4513          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4514        ) {
4515         if (float32_is_signaling_nan(a, status)
4516          || float32_is_signaling_nan(b, status)) {
4517             float_raise(float_flag_invalid, status);
4518         }
4519         return 0;
4520     }
4521     aSign = extractFloat32Sign( a );
4522     bSign = extractFloat32Sign( b );
4523     av = float32_val(a);
4524     bv = float32_val(b);
4525     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4526     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4527 
4528 }
4529 
4530 /*----------------------------------------------------------------------------
4531 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4532 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4533 | comparison is performed according to the IEC/IEEE Standard for Binary
4534 | Floating-Point Arithmetic.
4535 *----------------------------------------------------------------------------*/
4536 
4537 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
4538 {
4539     a = float32_squash_input_denormal(a, status);
4540     b = float32_squash_input_denormal(b, status);
4541 
4542     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4543          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4544        ) {
4545         if (float32_is_signaling_nan(a, status)
4546          || float32_is_signaling_nan(b, status)) {
4547             float_raise(float_flag_invalid, status);
4548         }
4549         return 1;
4550     }
4551     return 0;
4552 }
4553 
4554 /*----------------------------------------------------------------------------
4555 | If `a' is denormal and we are in flush-to-zero mode then set the
4556 | input-denormal exception and return zero. Otherwise just return the value.
4557 *----------------------------------------------------------------------------*/
4558 float16 float16_squash_input_denormal(float16 a, float_status *status)
4559 {
4560     if (status->flush_inputs_to_zero) {
4561         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
4562             float_raise(float_flag_input_denormal, status);
4563             return make_float16(float16_val(a) & 0x8000);
4564         }
4565     }
4566     return a;
4567 }
4568 
4569 /*----------------------------------------------------------------------------
4570 | Returns the result of converting the double-precision floating-point value
4571 | `a' to the extended double-precision floating-point format.  The conversion
4572 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4573 | Arithmetic.
4574 *----------------------------------------------------------------------------*/
4575 
4576 floatx80 float64_to_floatx80(float64 a, float_status *status)
4577 {
4578     flag aSign;
4579     int aExp;
4580     uint64_t aSig;
4581 
4582     a = float64_squash_input_denormal(a, status);
4583     aSig = extractFloat64Frac( a );
4584     aExp = extractFloat64Exp( a );
4585     aSign = extractFloat64Sign( a );
4586     if ( aExp == 0x7FF ) {
4587         if (aSig) {
4588             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4589         }
4590         return packFloatx80(aSign,
4591                             floatx80_infinity_high,
4592                             floatx80_infinity_low);
4593     }
4594     if ( aExp == 0 ) {
4595         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4596         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4597     }
4598     return
4599         packFloatx80(
4600             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4601 
4602 }
4603 
4604 /*----------------------------------------------------------------------------
4605 | Returns the result of converting the double-precision floating-point value
4606 | `a' to the quadruple-precision floating-point format.  The conversion is
4607 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4608 | Arithmetic.
4609 *----------------------------------------------------------------------------*/
4610 
4611 float128 float64_to_float128(float64 a, float_status *status)
4612 {
4613     flag aSign;
4614     int aExp;
4615     uint64_t aSig, zSig0, zSig1;
4616 
4617     a = float64_squash_input_denormal(a, status);
4618     aSig = extractFloat64Frac( a );
4619     aExp = extractFloat64Exp( a );
4620     aSign = extractFloat64Sign( a );
4621     if ( aExp == 0x7FF ) {
4622         if (aSig) {
4623             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4624         }
4625         return packFloat128( aSign, 0x7FFF, 0, 0 );
4626     }
4627     if ( aExp == 0 ) {
4628         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4629         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4630         --aExp;
4631     }
4632     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4633     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4634 
4635 }
4636 
4637 
/*----------------------------------------------------------------------------
| Returns the remainder of the double-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases handled explicitly below:
|   - either operand NaN: the NaN is propagated via propagateFloat64NaN();
|   - `a' infinite, or `b' zero: invalid is raised and the default NaN is
|     returned;
|   - `b' infinite, or `a' zero: `a' is returned unchanged.
| Both operands first pass through float64_squash_input_denormal().
*----------------------------------------------------------------------------*/

float64 float64_rem(float64 a, float64 b, float_status *status)
{
    flag aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    /* `a' is an infinity or a NaN. */
    if ( aExp == 0x7FF ) {
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        /* Infinite dividend with a non-NaN divisor. */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    /* `b' is an infinity or a NaN; `a' is finite here. */
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        /* Zero divisor. */
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        /* Zero dividend is its own remainder. */
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the leading bit explicit and left-justify both significands. */
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
    if ( expDiff < 0 ) {
        /* aExp <= bExp - 2, so |a| < |b|/2 and `a' is already the result. */
        if ( expDiff < -1 ) return a;
        aSig >>= 1;
    }
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    /*
     * Reduce in steps (expDiff drops by 62 per pass) using
     * estimateDiv128To64().  Each estimate is lowered by 2 and clamped at 0
     * -- presumably so it never exceeds the true quotient; the loop further
     * down performs the final corrections.
     */
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /* Last partial step covering the remaining exponent gap. */
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /*
     * Keep subtracting bSig until the running remainder turns negative.
     * On exit aSig is the first negative candidate and alternateASig the
     * last non-negative one; q counts the subtractions.
     */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    /*
     * Pick the candidate of smaller magnitude.  On an exact tie
     * (sigMean == 0) take the non-negative one when q is odd, so that the
     * quotient actually used is even.
     */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* A negative remainder is stored as its magnitude with the sign flipped. */
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);

}
4725 
4726 /*----------------------------------------------------------------------------
4727 | Returns the binary log of the double-precision floating-point value `a'.
4728 | The operation is performed according to the IEC/IEEE Standard for Binary
4729 | Floating-Point Arithmetic.
4730 *----------------------------------------------------------------------------*/
4731 float64 float64_log2(float64 a, float_status *status)
4732 {
4733     flag aSign, zSign;
4734     int aExp;
4735     uint64_t aSig, aSig0, aSig1, zSig, i;
4736     a = float64_squash_input_denormal(a, status);
4737 
4738     aSig = extractFloat64Frac( a );
4739     aExp = extractFloat64Exp( a );
4740     aSign = extractFloat64Sign( a );
4741 
4742     if ( aExp == 0 ) {
4743         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4744         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4745     }
4746     if ( aSign ) {
4747         float_raise(float_flag_invalid, status);
4748         return float64_default_nan(status);
4749     }
4750     if ( aExp == 0x7FF ) {
4751         if (aSig) {
4752             return propagateFloat64NaN(a, float64_zero, status);
4753         }
4754         return a;
4755     }
4756 
4757     aExp -= 0x3FF;
4758     aSig |= LIT64( 0x0010000000000000 );
4759     zSign = aExp < 0;
4760     zSig = (uint64_t)aExp << 52;
4761     for (i = 1LL << 51; i > 0; i >>= 1) {
4762         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4763         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4764         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4765             aSig >>= 1;
4766             zSig |= i;
4767         }
4768     }
4769 
4770     if ( zSign )
4771         zSig = -zSig;
4772     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4773 }
4774 
4775 /*----------------------------------------------------------------------------
4776 | Returns 1 if the double-precision floating-point value `a' is equal to the
4777 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4778 | if either operand is a NaN.  Otherwise, the comparison is performed
4779 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4780 *----------------------------------------------------------------------------*/
4781 
4782 int float64_eq(float64 a, float64 b, float_status *status)
4783 {
4784     uint64_t av, bv;
4785     a = float64_squash_input_denormal(a, status);
4786     b = float64_squash_input_denormal(b, status);
4787 
4788     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4789          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4790        ) {
4791         float_raise(float_flag_invalid, status);
4792         return 0;
4793     }
4794     av = float64_val(a);
4795     bv = float64_val(b);
4796     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4797 
4798 }
4799 
4800 /*----------------------------------------------------------------------------
4801 | Returns 1 if the double-precision floating-point value `a' is less than or
4802 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4803 | exception is raised if either operand is a NaN.  The comparison is performed
4804 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4805 *----------------------------------------------------------------------------*/
4806 
4807 int float64_le(float64 a, float64 b, float_status *status)
4808 {
4809     flag aSign, bSign;
4810     uint64_t av, bv;
4811     a = float64_squash_input_denormal(a, status);
4812     b = float64_squash_input_denormal(b, status);
4813 
4814     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4815          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4816        ) {
4817         float_raise(float_flag_invalid, status);
4818         return 0;
4819     }
4820     aSign = extractFloat64Sign( a );
4821     bSign = extractFloat64Sign( b );
4822     av = float64_val(a);
4823     bv = float64_val(b);
4824     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4825     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4826 
4827 }
4828 
4829 /*----------------------------------------------------------------------------
4830 | Returns 1 if the double-precision floating-point value `a' is less than
4831 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4832 | raised if either operand is a NaN.  The comparison is performed according
4833 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4834 *----------------------------------------------------------------------------*/
4835 
4836 int float64_lt(float64 a, float64 b, float_status *status)
4837 {
4838     flag aSign, bSign;
4839     uint64_t av, bv;
4840 
4841     a = float64_squash_input_denormal(a, status);
4842     b = float64_squash_input_denormal(b, status);
4843     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4844          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4845        ) {
4846         float_raise(float_flag_invalid, status);
4847         return 0;
4848     }
4849     aSign = extractFloat64Sign( a );
4850     bSign = extractFloat64Sign( b );
4851     av = float64_val(a);
4852     bv = float64_val(b);
4853     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4854     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4855 
4856 }
4857 
4858 /*----------------------------------------------------------------------------
4859 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4860 | be compared, and 0 otherwise.  The invalid exception is raised if either
4861 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4862 | Standard for Binary Floating-Point Arithmetic.
4863 *----------------------------------------------------------------------------*/
4864 
4865 int float64_unordered(float64 a, float64 b, float_status *status)
4866 {
4867     a = float64_squash_input_denormal(a, status);
4868     b = float64_squash_input_denormal(b, status);
4869 
4870     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4871          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4872        ) {
4873         float_raise(float_flag_invalid, status);
4874         return 1;
4875     }
4876     return 0;
4877 }
4878 
4879 /*----------------------------------------------------------------------------
4880 | Returns 1 if the double-precision floating-point value `a' is equal to the
4881 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4882 | exception.The comparison is performed according to the IEC/IEEE Standard
4883 | for Binary Floating-Point Arithmetic.
4884 *----------------------------------------------------------------------------*/
4885 
4886 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4887 {
4888     uint64_t av, bv;
4889     a = float64_squash_input_denormal(a, status);
4890     b = float64_squash_input_denormal(b, status);
4891 
4892     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4893          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4894        ) {
4895         if (float64_is_signaling_nan(a, status)
4896          || float64_is_signaling_nan(b, status)) {
4897             float_raise(float_flag_invalid, status);
4898         }
4899         return 0;
4900     }
4901     av = float64_val(a);
4902     bv = float64_val(b);
4903     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4904 
4905 }
4906 
4907 /*----------------------------------------------------------------------------
4908 | Returns 1 if the double-precision floating-point value `a' is less than or
4909 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4910 | cause an exception.  Otherwise, the comparison is performed according to the
4911 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4912 *----------------------------------------------------------------------------*/
4913 
4914 int float64_le_quiet(float64 a, float64 b, float_status *status)
4915 {
4916     flag aSign, bSign;
4917     uint64_t av, bv;
4918     a = float64_squash_input_denormal(a, status);
4919     b = float64_squash_input_denormal(b, status);
4920 
4921     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4922          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4923        ) {
4924         if (float64_is_signaling_nan(a, status)
4925          || float64_is_signaling_nan(b, status)) {
4926             float_raise(float_flag_invalid, status);
4927         }
4928         return 0;
4929     }
4930     aSign = extractFloat64Sign( a );
4931     bSign = extractFloat64Sign( b );
4932     av = float64_val(a);
4933     bv = float64_val(b);
4934     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4935     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4936 
4937 }
4938 
4939 /*----------------------------------------------------------------------------
4940 | Returns 1 if the double-precision floating-point value `a' is less than
4941 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4942 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4943 | Standard for Binary Floating-Point Arithmetic.
4944 *----------------------------------------------------------------------------*/
4945 
4946 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4947 {
4948     flag aSign, bSign;
4949     uint64_t av, bv;
4950     a = float64_squash_input_denormal(a, status);
4951     b = float64_squash_input_denormal(b, status);
4952 
4953     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4954          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4955        ) {
4956         if (float64_is_signaling_nan(a, status)
4957          || float64_is_signaling_nan(b, status)) {
4958             float_raise(float_flag_invalid, status);
4959         }
4960         return 0;
4961     }
4962     aSign = extractFloat64Sign( a );
4963     bSign = extractFloat64Sign( b );
4964     av = float64_val(a);
4965     bv = float64_val(b);
4966     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4967     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4968 
4969 }
4970 
4971 /*----------------------------------------------------------------------------
4972 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4973 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4974 | comparison is performed according to the IEC/IEEE Standard for Binary
4975 | Floating-Point Arithmetic.
4976 *----------------------------------------------------------------------------*/
4977 
4978 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4979 {
4980     a = float64_squash_input_denormal(a, status);
4981     b = float64_squash_input_denormal(b, status);
4982 
4983     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4984          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4985        ) {
4986         if (float64_is_signaling_nan(a, status)
4987          || float64_is_signaling_nan(b, status)) {
4988             float_raise(float_flag_invalid, status);
4989         }
4990         return 1;
4991     }
4992     return 0;
4993 }
4994 
4995 /*----------------------------------------------------------------------------
4996 | Returns the result of converting the extended double-precision floating-
4997 | point value `a' to the 32-bit two's complement integer format.  The
4998 | conversion is performed according to the IEC/IEEE Standard for Binary
4999 | Floating-Point Arithmetic---which means in particular that the conversion
5000 | is rounded according to the current rounding mode.  If `a' is a NaN, the
5001 | largest positive integer is returned.  Otherwise, if the conversion
5002 | overflows, the largest integer with the same sign as `a' is returned.
5003 *----------------------------------------------------------------------------*/
5004 
5005 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5006 {
5007     flag aSign;
5008     int32_t aExp, shiftCount;
5009     uint64_t aSig;
5010 
5011     if (floatx80_invalid_encoding(a)) {
5012         float_raise(float_flag_invalid, status);
5013         return 1 << 31;
5014     }
5015     aSig = extractFloatx80Frac( a );
5016     aExp = extractFloatx80Exp( a );
5017     aSign = extractFloatx80Sign( a );
5018     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5019     shiftCount = 0x4037 - aExp;
5020     if ( shiftCount <= 0 ) shiftCount = 1;
5021     shift64RightJamming( aSig, shiftCount, &aSig );
5022     return roundAndPackInt32(aSign, aSig, status);
5023 
5024 }
5025 
5026 /*----------------------------------------------------------------------------
5027 | Returns the result of converting the extended double-precision floating-
5028 | point value `a' to the 32-bit two's complement integer format.  The
5029 | conversion is performed according to the IEC/IEEE Standard for Binary
5030 | Floating-Point Arithmetic, except that the conversion is always rounded
5031 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5032 | Otherwise, if the conversion overflows, the largest integer with the same
5033 | sign as `a' is returned.
5034 *----------------------------------------------------------------------------*/
5035 
5036 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5037 {
5038     flag aSign;
5039     int32_t aExp, shiftCount;
5040     uint64_t aSig, savedASig;
5041     int32_t z;
5042 
5043     if (floatx80_invalid_encoding(a)) {
5044         float_raise(float_flag_invalid, status);
5045         return 1 << 31;
5046     }
5047     aSig = extractFloatx80Frac( a );
5048     aExp = extractFloatx80Exp( a );
5049     aSign = extractFloatx80Sign( a );
5050     if ( 0x401E < aExp ) {
5051         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5052         goto invalid;
5053     }
5054     else if ( aExp < 0x3FFF ) {
5055         if (aExp || aSig) {
5056             status->float_exception_flags |= float_flag_inexact;
5057         }
5058         return 0;
5059     }
5060     shiftCount = 0x403E - aExp;
5061     savedASig = aSig;
5062     aSig >>= shiftCount;
5063     z = aSig;
5064     if ( aSign ) z = - z;
5065     if ( ( z < 0 ) ^ aSign ) {
5066  invalid:
5067         float_raise(float_flag_invalid, status);
5068         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5069     }
5070     if ( ( aSig<<shiftCount ) != savedASig ) {
5071         status->float_exception_flags |= float_flag_inexact;
5072     }
5073     return z;
5074 
5075 }
5076 
5077 /*----------------------------------------------------------------------------
5078 | Returns the result of converting the extended double-precision floating-
5079 | point value `a' to the 64-bit two's complement integer format.  The
5080 | conversion is performed according to the IEC/IEEE Standard for Binary
5081 | Floating-Point Arithmetic---which means in particular that the conversion
5082 | is rounded according to the current rounding mode.  If `a' is a NaN,
5083 | the largest positive integer is returned.  Otherwise, if the conversion
5084 | overflows, the largest integer with the same sign as `a' is returned.
5085 *----------------------------------------------------------------------------*/
5086 
5087 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5088 {
5089     flag aSign;
5090     int32_t aExp, shiftCount;
5091     uint64_t aSig, aSigExtra;
5092 
5093     if (floatx80_invalid_encoding(a)) {
5094         float_raise(float_flag_invalid, status);
5095         return 1ULL << 63;
5096     }
5097     aSig = extractFloatx80Frac( a );
5098     aExp = extractFloatx80Exp( a );
5099     aSign = extractFloatx80Sign( a );
5100     shiftCount = 0x403E - aExp;
5101     if ( shiftCount <= 0 ) {
5102         if ( shiftCount ) {
5103             float_raise(float_flag_invalid, status);
5104             if (!aSign || floatx80_is_any_nan(a)) {
5105                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5106             }
5107             return (int64_t) LIT64( 0x8000000000000000 );
5108         }
5109         aSigExtra = 0;
5110     }
5111     else {
5112         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5113     }
5114     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5115 
5116 }
5117 
5118 /*----------------------------------------------------------------------------
5119 | Returns the result of converting the extended double-precision floating-
5120 | point value `a' to the 64-bit two's complement integer format.  The
5121 | conversion is performed according to the IEC/IEEE Standard for Binary
5122 | Floating-Point Arithmetic, except that the conversion is always rounded
5123 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5124 | Otherwise, if the conversion overflows, the largest integer with the same
5125 | sign as `a' is returned.
5126 *----------------------------------------------------------------------------*/
5127 
5128 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5129 {
5130     flag aSign;
5131     int32_t aExp, shiftCount;
5132     uint64_t aSig;
5133     int64_t z;
5134 
5135     if (floatx80_invalid_encoding(a)) {
5136         float_raise(float_flag_invalid, status);
5137         return 1ULL << 63;
5138     }
5139     aSig = extractFloatx80Frac( a );
5140     aExp = extractFloatx80Exp( a );
5141     aSign = extractFloatx80Sign( a );
5142     shiftCount = aExp - 0x403E;
5143     if ( 0 <= shiftCount ) {
5144         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
5145         if ( ( a.high != 0xC03E ) || aSig ) {
5146             float_raise(float_flag_invalid, status);
5147             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5148                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5149             }
5150         }
5151         return (int64_t) LIT64( 0x8000000000000000 );
5152     }
5153     else if ( aExp < 0x3FFF ) {
5154         if (aExp | aSig) {
5155             status->float_exception_flags |= float_flag_inexact;
5156         }
5157         return 0;
5158     }
5159     z = aSig>>( - shiftCount );
5160     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5161         status->float_exception_flags |= float_flag_inexact;
5162     }
5163     if ( aSign ) z = - z;
5164     return z;
5165 
5166 }
5167 
5168 /*----------------------------------------------------------------------------
5169 | Returns the result of converting the extended double-precision floating-
5170 | point value `a' to the single-precision floating-point format.  The
5171 | conversion is performed according to the IEC/IEEE Standard for Binary
5172 | Floating-Point Arithmetic.
5173 *----------------------------------------------------------------------------*/
5174 
5175 float32 floatx80_to_float32(floatx80 a, float_status *status)
5176 {
5177     flag aSign;
5178     int32_t aExp;
5179     uint64_t aSig;
5180 
5181     if (floatx80_invalid_encoding(a)) {
5182         float_raise(float_flag_invalid, status);
5183         return float32_default_nan(status);
5184     }
5185     aSig = extractFloatx80Frac( a );
5186     aExp = extractFloatx80Exp( a );
5187     aSign = extractFloatx80Sign( a );
5188     if ( aExp == 0x7FFF ) {
5189         if ( (uint64_t) ( aSig<<1 ) ) {
5190             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5191         }
5192         return packFloat32( aSign, 0xFF, 0 );
5193     }
5194     shift64RightJamming( aSig, 33, &aSig );
5195     if ( aExp || aSig ) aExp -= 0x3F81;
5196     return roundAndPackFloat32(aSign, aExp, aSig, status);
5197 
5198 }
5199 
5200 /*----------------------------------------------------------------------------
5201 | Returns the result of converting the extended double-precision floating-
5202 | point value `a' to the double-precision floating-point format.  The
5203 | conversion is performed according to the IEC/IEEE Standard for Binary
5204 | Floating-Point Arithmetic.
5205 *----------------------------------------------------------------------------*/
5206 
5207 float64 floatx80_to_float64(floatx80 a, float_status *status)
5208 {
5209     flag aSign;
5210     int32_t aExp;
5211     uint64_t aSig, zSig;
5212 
5213     if (floatx80_invalid_encoding(a)) {
5214         float_raise(float_flag_invalid, status);
5215         return float64_default_nan(status);
5216     }
5217     aSig = extractFloatx80Frac( a );
5218     aExp = extractFloatx80Exp( a );
5219     aSign = extractFloatx80Sign( a );
5220     if ( aExp == 0x7FFF ) {
5221         if ( (uint64_t) ( aSig<<1 ) ) {
5222             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5223         }
5224         return packFloat64( aSign, 0x7FF, 0 );
5225     }
5226     shift64RightJamming( aSig, 1, &zSig );
5227     if ( aExp || aSig ) aExp -= 0x3C01;
5228     return roundAndPackFloat64(aSign, aExp, zSig, status);
5229 
5230 }
5231 
5232 /*----------------------------------------------------------------------------
5233 | Returns the result of converting the extended double-precision floating-
5234 | point value `a' to the quadruple-precision floating-point format.  The
5235 | conversion is performed according to the IEC/IEEE Standard for Binary
5236 | Floating-Point Arithmetic.
5237 *----------------------------------------------------------------------------*/
5238 
5239 float128 floatx80_to_float128(floatx80 a, float_status *status)
5240 {
5241     flag aSign;
5242     int aExp;
5243     uint64_t aSig, zSig0, zSig1;
5244 
5245     if (floatx80_invalid_encoding(a)) {
5246         float_raise(float_flag_invalid, status);
5247         return float128_default_nan(status);
5248     }
5249     aSig = extractFloatx80Frac( a );
5250     aExp = extractFloatx80Exp( a );
5251     aSign = extractFloatx80Sign( a );
5252     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5253         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5254     }
5255     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5256     return packFloat128( aSign, aExp, zSig0, zSig1 );
5257 
5258 }
5259 
5260 /*----------------------------------------------------------------------------
5261 | Rounds the extended double-precision floating-point value `a'
5262 | to the precision provided by floatx80_rounding_precision and returns the
5263 | result as an extended double-precision floating-point value.
5264 | The operation is performed according to the IEC/IEEE Standard for Binary
5265 | Floating-Point Arithmetic.
5266 *----------------------------------------------------------------------------*/
5267 
5268 floatx80 floatx80_round(floatx80 a, float_status *status)
5269 {
5270     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5271                                 extractFloatx80Sign(a),
5272                                 extractFloatx80Exp(a),
5273                                 extractFloatx80Frac(a), 0, status);
5274 }
5275 
5276 /*----------------------------------------------------------------------------
5277 | Rounds the extended double-precision floating-point value `a' to an integer,
5278 | and returns the result as an extended quadruple-precision floating-point
5279 | value.  The operation is performed according to the IEC/IEEE Standard for
5280 | Binary Floating-Point Arithmetic.
5281 *----------------------------------------------------------------------------*/
5282 
5283 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5284 {
5285     flag aSign;
5286     int32_t aExp;
5287     uint64_t lastBitMask, roundBitsMask;
5288     floatx80 z;
5289 
5290     if (floatx80_invalid_encoding(a)) {
5291         float_raise(float_flag_invalid, status);
5292         return floatx80_default_nan(status);
5293     }
5294     aExp = extractFloatx80Exp( a );
5295     if ( 0x403E <= aExp ) {
5296         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5297             return propagateFloatx80NaN(a, a, status);
5298         }
5299         return a;
5300     }
5301     if ( aExp < 0x3FFF ) {
5302         if (    ( aExp == 0 )
5303              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5304             return a;
5305         }
5306         status->float_exception_flags |= float_flag_inexact;
5307         aSign = extractFloatx80Sign( a );
5308         switch (status->float_rounding_mode) {
5309          case float_round_nearest_even:
5310             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5311                ) {
5312                 return
5313                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5314             }
5315             break;
5316         case float_round_ties_away:
5317             if (aExp == 0x3FFE) {
5318                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5319             }
5320             break;
5321          case float_round_down:
5322             return
5323                   aSign ?
5324                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5325                 : packFloatx80( 0, 0, 0 );
5326          case float_round_up:
5327             return
5328                   aSign ? packFloatx80( 1, 0, 0 )
5329                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5330         }
5331         return packFloatx80( aSign, 0, 0 );
5332     }
5333     lastBitMask = 1;
5334     lastBitMask <<= 0x403E - aExp;
5335     roundBitsMask = lastBitMask - 1;
5336     z = a;
5337     switch (status->float_rounding_mode) {
5338     case float_round_nearest_even:
5339         z.low += lastBitMask>>1;
5340         if ((z.low & roundBitsMask) == 0) {
5341             z.low &= ~lastBitMask;
5342         }
5343         break;
5344     case float_round_ties_away:
5345         z.low += lastBitMask >> 1;
5346         break;
5347     case float_round_to_zero:
5348         break;
5349     case float_round_up:
5350         if (!extractFloatx80Sign(z)) {
5351             z.low += roundBitsMask;
5352         }
5353         break;
5354     case float_round_down:
5355         if (extractFloatx80Sign(z)) {
5356             z.low += roundBitsMask;
5357         }
5358         break;
5359     default:
5360         abort();
5361     }
5362     z.low &= ~ roundBitsMask;
5363     if ( z.low == 0 ) {
5364         ++z.high;
5365         z.low = LIT64( 0x8000000000000000 );
5366     }
5367     if (z.low != a.low) {
5368         status->float_exception_flags |= float_flag_inexact;
5369     }
5370     return z;
5371 
5372 }
5373 
5374 /*----------------------------------------------------------------------------
5375 | Returns the result of adding the absolute values of the extended double-
5376 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5377 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5378 | The addition is performed according to the IEC/IEEE Standard for Binary
5379 | Floating-Point Arithmetic.
5380 *----------------------------------------------------------------------------*/
5381 
5382 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5383                                 float_status *status)
5384 {
5385     int32_t aExp, bExp, zExp;
5386     uint64_t aSig, bSig, zSig0, zSig1;
5387     int32_t expDiff;
5388 
5389     aSig = extractFloatx80Frac( a );
5390     aExp = extractFloatx80Exp( a );
5391     bSig = extractFloatx80Frac( b );
5392     bExp = extractFloatx80Exp( b );
5393     expDiff = aExp - bExp;
5394     if ( 0 < expDiff ) {
5395         if ( aExp == 0x7FFF ) {
5396             if ((uint64_t)(aSig << 1)) {
5397                 return propagateFloatx80NaN(a, b, status);
5398             }
5399             return a;
5400         }
5401         if ( bExp == 0 ) --expDiff;
5402         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5403         zExp = aExp;
5404     }
5405     else if ( expDiff < 0 ) {
5406         if ( bExp == 0x7FFF ) {
5407             if ((uint64_t)(bSig << 1)) {
5408                 return propagateFloatx80NaN(a, b, status);
5409             }
5410             return packFloatx80(zSign,
5411                                 floatx80_infinity_high,
5412                                 floatx80_infinity_low);
5413         }
5414         if ( aExp == 0 ) ++expDiff;
5415         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5416         zExp = bExp;
5417     }
5418     else {
5419         if ( aExp == 0x7FFF ) {
5420             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5421                 return propagateFloatx80NaN(a, b, status);
5422             }
5423             return a;
5424         }
5425         zSig1 = 0;
5426         zSig0 = aSig + bSig;
5427         if ( aExp == 0 ) {
5428             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5429             goto roundAndPack;
5430         }
5431         zExp = aExp;
5432         goto shiftRight1;
5433     }
5434     zSig0 = aSig + bSig;
5435     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5436  shiftRight1:
5437     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5438     zSig0 |= LIT64( 0x8000000000000000 );
5439     ++zExp;
5440  roundAndPack:
5441     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5442                                 zSign, zExp, zSig0, zSig1, status);
5443 }
5444 
5445 /*----------------------------------------------------------------------------
5446 | Returns the result of subtracting the absolute values of the extended
5447 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5448 | difference is negated before being returned.  `zSign' is ignored if the
5449 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5450 | Standard for Binary Floating-Point Arithmetic.
5451 *----------------------------------------------------------------------------*/
5452 
5453 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5454                                 float_status *status)
5455 {
5456     int32_t aExp, bExp, zExp;
5457     uint64_t aSig, bSig, zSig0, zSig1;
5458     int32_t expDiff;
5459 
5460     aSig = extractFloatx80Frac( a );
5461     aExp = extractFloatx80Exp( a );
5462     bSig = extractFloatx80Frac( b );
5463     bExp = extractFloatx80Exp( b );
5464     expDiff = aExp - bExp;
5465     if ( 0 < expDiff ) goto aExpBigger;
5466     if ( expDiff < 0 ) goto bExpBigger;
5467     if ( aExp == 0x7FFF ) {
5468         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5469             return propagateFloatx80NaN(a, b, status);
5470         }
5471         float_raise(float_flag_invalid, status);
5472         return floatx80_default_nan(status);
5473     }
5474     if ( aExp == 0 ) {
5475         aExp = 1;
5476         bExp = 1;
5477     }
5478     zSig1 = 0;
5479     if ( bSig < aSig ) goto aBigger;
5480     if ( aSig < bSig ) goto bBigger;
5481     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5482  bExpBigger:
5483     if ( bExp == 0x7FFF ) {
5484         if ((uint64_t)(bSig << 1)) {
5485             return propagateFloatx80NaN(a, b, status);
5486         }
5487         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5488                             floatx80_infinity_low);
5489     }
5490     if ( aExp == 0 ) ++expDiff;
5491     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5492  bBigger:
5493     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5494     zExp = bExp;
5495     zSign ^= 1;
5496     goto normalizeRoundAndPack;
5497  aExpBigger:
5498     if ( aExp == 0x7FFF ) {
5499         if ((uint64_t)(aSig << 1)) {
5500             return propagateFloatx80NaN(a, b, status);
5501         }
5502         return a;
5503     }
5504     if ( bExp == 0 ) --expDiff;
5505     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5506  aBigger:
5507     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5508     zExp = aExp;
5509  normalizeRoundAndPack:
5510     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5511                                          zSign, zExp, zSig0, zSig1, status);
5512 }
5513 
5514 /*----------------------------------------------------------------------------
5515 | Returns the result of adding the extended double-precision floating-point
5516 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5517 | Standard for Binary Floating-Point Arithmetic.
5518 *----------------------------------------------------------------------------*/
5519 
5520 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5521 {
5522     flag aSign, bSign;
5523 
5524     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5525         float_raise(float_flag_invalid, status);
5526         return floatx80_default_nan(status);
5527     }
5528     aSign = extractFloatx80Sign( a );
5529     bSign = extractFloatx80Sign( b );
5530     if ( aSign == bSign ) {
5531         return addFloatx80Sigs(a, b, aSign, status);
5532     }
5533     else {
5534         return subFloatx80Sigs(a, b, aSign, status);
5535     }
5536 
5537 }
5538 
5539 /*----------------------------------------------------------------------------
5540 | Returns the result of subtracting the extended double-precision floating-
5541 | point values `a' and `b'.  The operation is performed according to the
5542 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5543 *----------------------------------------------------------------------------*/
5544 
5545 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5546 {
5547     flag aSign, bSign;
5548 
5549     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5550         float_raise(float_flag_invalid, status);
5551         return floatx80_default_nan(status);
5552     }
5553     aSign = extractFloatx80Sign( a );
5554     bSign = extractFloatx80Sign( b );
5555     if ( aSign == bSign ) {
5556         return subFloatx80Sigs(a, b, aSign, status);
5557     }
5558     else {
5559         return addFloatx80Sigs(a, b, aSign, status);
5560     }
5561 
5562 }
5563 
5564 /*----------------------------------------------------------------------------
5565 | Returns the result of multiplying the extended double-precision floating-
5566 | point values `a' and `b'.  The operation is performed according to the
5567 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5568 *----------------------------------------------------------------------------*/
5569 
5570 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5571 {
5572     flag aSign, bSign, zSign;
5573     int32_t aExp, bExp, zExp;
5574     uint64_t aSig, bSig, zSig0, zSig1;
5575 
5576     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5577         float_raise(float_flag_invalid, status);
5578         return floatx80_default_nan(status);
5579     }
5580     aSig = extractFloatx80Frac( a );
5581     aExp = extractFloatx80Exp( a );
5582     aSign = extractFloatx80Sign( a );
5583     bSig = extractFloatx80Frac( b );
5584     bExp = extractFloatx80Exp( b );
5585     bSign = extractFloatx80Sign( b );
5586     zSign = aSign ^ bSign;
5587     if ( aExp == 0x7FFF ) {
5588         if (    (uint64_t) ( aSig<<1 )
5589              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5590             return propagateFloatx80NaN(a, b, status);
5591         }
5592         if ( ( bExp | bSig ) == 0 ) goto invalid;
5593         return packFloatx80(zSign, floatx80_infinity_high,
5594                                    floatx80_infinity_low);
5595     }
5596     if ( bExp == 0x7FFF ) {
5597         if ((uint64_t)(bSig << 1)) {
5598             return propagateFloatx80NaN(a, b, status);
5599         }
5600         if ( ( aExp | aSig ) == 0 ) {
5601  invalid:
5602             float_raise(float_flag_invalid, status);
5603             return floatx80_default_nan(status);
5604         }
5605         return packFloatx80(zSign, floatx80_infinity_high,
5606                                    floatx80_infinity_low);
5607     }
5608     if ( aExp == 0 ) {
5609         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5610         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5611     }
5612     if ( bExp == 0 ) {
5613         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5614         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5615     }
5616     zExp = aExp + bExp - 0x3FFE;
5617     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5618     if ( 0 < (int64_t) zSig0 ) {
5619         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5620         --zExp;
5621     }
5622     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5623                                 zSign, zExp, zSig0, zSig1, status);
5624 }
5625 
5626 /*----------------------------------------------------------------------------
5627 | Returns the result of dividing the extended double-precision floating-point
5628 | value `a' by the corresponding value `b'.  The operation is performed
5629 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5630 *----------------------------------------------------------------------------*/
5631 
5632 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5633 {
5634     flag aSign, bSign, zSign;
5635     int32_t aExp, bExp, zExp;
5636     uint64_t aSig, bSig, zSig0, zSig1;
5637     uint64_t rem0, rem1, rem2, term0, term1, term2;
5638 
5639     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5640         float_raise(float_flag_invalid, status);
5641         return floatx80_default_nan(status);
5642     }
5643     aSig = extractFloatx80Frac( a );
5644     aExp = extractFloatx80Exp( a );
5645     aSign = extractFloatx80Sign( a );
5646     bSig = extractFloatx80Frac( b );
5647     bExp = extractFloatx80Exp( b );
5648     bSign = extractFloatx80Sign( b );
5649     zSign = aSign ^ bSign;
5650     if ( aExp == 0x7FFF ) {
5651         if ((uint64_t)(aSig << 1)) {
5652             return propagateFloatx80NaN(a, b, status);
5653         }
5654         if ( bExp == 0x7FFF ) {
5655             if ((uint64_t)(bSig << 1)) {
5656                 return propagateFloatx80NaN(a, b, status);
5657             }
5658             goto invalid;
5659         }
5660         return packFloatx80(zSign, floatx80_infinity_high,
5661                                    floatx80_infinity_low);
5662     }
5663     if ( bExp == 0x7FFF ) {
5664         if ((uint64_t)(bSig << 1)) {
5665             return propagateFloatx80NaN(a, b, status);
5666         }
5667         return packFloatx80( zSign, 0, 0 );
5668     }
5669     if ( bExp == 0 ) {
5670         if ( bSig == 0 ) {
5671             if ( ( aExp | aSig ) == 0 ) {
5672  invalid:
5673                 float_raise(float_flag_invalid, status);
5674                 return floatx80_default_nan(status);
5675             }
5676             float_raise(float_flag_divbyzero, status);
5677             return packFloatx80(zSign, floatx80_infinity_high,
5678                                        floatx80_infinity_low);
5679         }
5680         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5681     }
5682     if ( aExp == 0 ) {
5683         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5684         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5685     }
5686     zExp = aExp - bExp + 0x3FFE;
5687     rem1 = 0;
5688     if ( bSig <= aSig ) {
5689         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5690         ++zExp;
5691     }
5692     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5693     mul64To128( bSig, zSig0, &term0, &term1 );
5694     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5695     while ( (int64_t) rem0 < 0 ) {
5696         --zSig0;
5697         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5698     }
5699     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5700     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5701         mul64To128( bSig, zSig1, &term1, &term2 );
5702         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5703         while ( (int64_t) rem1 < 0 ) {
5704             --zSig1;
5705             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5706         }
5707         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5708     }
5709     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5710                                 zSign, zExp, zSig0, zSig1, status);
5711 }
5712 
5713 /*----------------------------------------------------------------------------
5714 | Returns the remainder of the extended double-precision floating-point value
5715 | `a' with respect to the corresponding value `b'.  The operation is performed
5716 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5717 *----------------------------------------------------------------------------*/
5718 
5719 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5720 {
5721     flag aSign, zSign;
5722     int32_t aExp, bExp, expDiff;
5723     uint64_t aSig0, aSig1, bSig;
5724     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5725 
5726     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5727         float_raise(float_flag_invalid, status);
5728         return floatx80_default_nan(status);
5729     }
5730     aSig0 = extractFloatx80Frac( a );
5731     aExp = extractFloatx80Exp( a );
5732     aSign = extractFloatx80Sign( a );
5733     bSig = extractFloatx80Frac( b );
5734     bExp = extractFloatx80Exp( b );
5735     if ( aExp == 0x7FFF ) {
5736         if (    (uint64_t) ( aSig0<<1 )
5737              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5738             return propagateFloatx80NaN(a, b, status);
5739         }
5740         goto invalid;
5741     }
5742     if ( bExp == 0x7FFF ) {
5743         if ((uint64_t)(bSig << 1)) {
5744             return propagateFloatx80NaN(a, b, status);
5745         }
5746         return a;
5747     }
5748     if ( bExp == 0 ) {
5749         if ( bSig == 0 ) {
5750  invalid:
5751             float_raise(float_flag_invalid, status);
5752             return floatx80_default_nan(status);
5753         }
5754         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5755     }
5756     if ( aExp == 0 ) {
5757         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5758         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5759     }
5760     bSig |= LIT64( 0x8000000000000000 );
5761     zSign = aSign;
5762     expDiff = aExp - bExp;
5763     aSig1 = 0;
5764     if ( expDiff < 0 ) {
5765         if ( expDiff < -1 ) return a;
5766         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5767         expDiff = 0;
5768     }
5769     q = ( bSig <= aSig0 );
5770     if ( q ) aSig0 -= bSig;
5771     expDiff -= 64;
5772     while ( 0 < expDiff ) {
5773         q = estimateDiv128To64( aSig0, aSig1, bSig );
5774         q = ( 2 < q ) ? q - 2 : 0;
5775         mul64To128( bSig, q, &term0, &term1 );
5776         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5777         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5778         expDiff -= 62;
5779     }
5780     expDiff += 64;
5781     if ( 0 < expDiff ) {
5782         q = estimateDiv128To64( aSig0, aSig1, bSig );
5783         q = ( 2 < q ) ? q - 2 : 0;
5784         q >>= 64 - expDiff;
5785         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5786         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5787         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5788         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5789             ++q;
5790             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5791         }
5792     }
5793     else {
5794         term1 = 0;
5795         term0 = bSig;
5796     }
5797     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5798     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5799          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5800               && ( q & 1 ) )
5801        ) {
5802         aSig0 = alternateASig0;
5803         aSig1 = alternateASig1;
5804         zSign = ! zSign;
5805     }
5806     return
5807         normalizeRoundAndPackFloatx80(
5808             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5809 
5810 }
5811 
5812 /*----------------------------------------------------------------------------
5813 | Returns the square root of the extended double-precision floating-point
5814 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5815 | for Binary Floating-Point Arithmetic.
5816 *----------------------------------------------------------------------------*/
5817 
5818 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5819 {
5820     flag aSign;
5821     int32_t aExp, zExp;
5822     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5823     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5824 
5825     if (floatx80_invalid_encoding(a)) {
5826         float_raise(float_flag_invalid, status);
5827         return floatx80_default_nan(status);
5828     }
5829     aSig0 = extractFloatx80Frac( a );
5830     aExp = extractFloatx80Exp( a );
5831     aSign = extractFloatx80Sign( a );
5832     if ( aExp == 0x7FFF ) {
5833         if ((uint64_t)(aSig0 << 1)) {
5834             return propagateFloatx80NaN(a, a, status);
5835         }
5836         if ( ! aSign ) return a;
5837         goto invalid;
5838     }
5839     if ( aSign ) {
5840         if ( ( aExp | aSig0 ) == 0 ) return a;
5841  invalid:
5842         float_raise(float_flag_invalid, status);
5843         return floatx80_default_nan(status);
5844     }
5845     if ( aExp == 0 ) {
5846         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5847         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5848     }
5849     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5850     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5851     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5852     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5853     doubleZSig0 = zSig0<<1;
5854     mul64To128( zSig0, zSig0, &term0, &term1 );
5855     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5856     while ( (int64_t) rem0 < 0 ) {
5857         --zSig0;
5858         doubleZSig0 -= 2;
5859         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5860     }
5861     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5862     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5863         if ( zSig1 == 0 ) zSig1 = 1;
5864         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5865         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5866         mul64To128( zSig1, zSig1, &term2, &term3 );
5867         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5868         while ( (int64_t) rem1 < 0 ) {
5869             --zSig1;
5870             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5871             term3 |= 1;
5872             term2 |= doubleZSig0;
5873             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5874         }
5875         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5876     }
5877     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5878     zSig0 |= doubleZSig0;
5879     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5880                                 0, zExp, zSig0, zSig1, status);
5881 }
5882 
5883 /*----------------------------------------------------------------------------
5884 | Returns 1 if the extended double-precision floating-point value `a' is equal
5885 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5886 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5887 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5888 *----------------------------------------------------------------------------*/
5889 
5890 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5891 {
5892 
5893     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5894         || (extractFloatx80Exp(a) == 0x7FFF
5895             && (uint64_t) (extractFloatx80Frac(a) << 1))
5896         || (extractFloatx80Exp(b) == 0x7FFF
5897             && (uint64_t) (extractFloatx80Frac(b) << 1))
5898        ) {
5899         float_raise(float_flag_invalid, status);
5900         return 0;
5901     }
5902     return
5903            ( a.low == b.low )
5904         && (    ( a.high == b.high )
5905              || (    ( a.low == 0 )
5906                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5907            );
5908 
5909 }
5910 
5911 /*----------------------------------------------------------------------------
5912 | Returns 1 if the extended double-precision floating-point value `a' is
5913 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5914 | invalid exception is raised if either operand is a NaN.  The comparison is
5915 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5916 | Arithmetic.
5917 *----------------------------------------------------------------------------*/
5918 
5919 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5920 {
5921     flag aSign, bSign;
5922 
5923     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5924         || (extractFloatx80Exp(a) == 0x7FFF
5925             && (uint64_t) (extractFloatx80Frac(a) << 1))
5926         || (extractFloatx80Exp(b) == 0x7FFF
5927             && (uint64_t) (extractFloatx80Frac(b) << 1))
5928        ) {
5929         float_raise(float_flag_invalid, status);
5930         return 0;
5931     }
5932     aSign = extractFloatx80Sign( a );
5933     bSign = extractFloatx80Sign( b );
5934     if ( aSign != bSign ) {
5935         return
5936                aSign
5937             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5938                  == 0 );
5939     }
5940     return
5941           aSign ? le128( b.high, b.low, a.high, a.low )
5942         : le128( a.high, a.low, b.high, b.low );
5943 
5944 }
5945 
5946 /*----------------------------------------------------------------------------
5947 | Returns 1 if the extended double-precision floating-point value `a' is
5948 | less than the corresponding value `b', and 0 otherwise.  The invalid
5949 | exception is raised if either operand is a NaN.  The comparison is performed
5950 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5951 *----------------------------------------------------------------------------*/
5952 
5953 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5954 {
5955     flag aSign, bSign;
5956 
5957     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5958         || (extractFloatx80Exp(a) == 0x7FFF
5959             && (uint64_t) (extractFloatx80Frac(a) << 1))
5960         || (extractFloatx80Exp(b) == 0x7FFF
5961             && (uint64_t) (extractFloatx80Frac(b) << 1))
5962        ) {
5963         float_raise(float_flag_invalid, status);
5964         return 0;
5965     }
5966     aSign = extractFloatx80Sign( a );
5967     bSign = extractFloatx80Sign( b );
5968     if ( aSign != bSign ) {
5969         return
5970                aSign
5971             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5972                  != 0 );
5973     }
5974     return
5975           aSign ? lt128( b.high, b.low, a.high, a.low )
5976         : lt128( a.high, a.low, b.high, b.low );
5977 
5978 }
5979 
5980 /*----------------------------------------------------------------------------
5981 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5982 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5983 | either operand is a NaN.   The comparison is performed according to the
5984 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5985 *----------------------------------------------------------------------------*/
5986 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5987 {
5988     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5989         || (extractFloatx80Exp(a) == 0x7FFF
5990             && (uint64_t) (extractFloatx80Frac(a) << 1))
5991         || (extractFloatx80Exp(b) == 0x7FFF
5992             && (uint64_t) (extractFloatx80Frac(b) << 1))
5993        ) {
5994         float_raise(float_flag_invalid, status);
5995         return 1;
5996     }
5997     return 0;
5998 }
5999 
6000 /*----------------------------------------------------------------------------
6001 | Returns 1 if the extended double-precision floating-point value `a' is
6002 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6003 | cause an exception.  The comparison is performed according to the IEC/IEEE
6004 | Standard for Binary Floating-Point Arithmetic.
6005 *----------------------------------------------------------------------------*/
6006 
6007 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
6008 {
6009 
6010     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6011         float_raise(float_flag_invalid, status);
6012         return 0;
6013     }
6014     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6015               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6016          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6017               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6018        ) {
6019         if (floatx80_is_signaling_nan(a, status)
6020          || floatx80_is_signaling_nan(b, status)) {
6021             float_raise(float_flag_invalid, status);
6022         }
6023         return 0;
6024     }
6025     return
6026            ( a.low == b.low )
6027         && (    ( a.high == b.high )
6028              || (    ( a.low == 0 )
6029                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6030            );
6031 
6032 }
6033 
6034 /*----------------------------------------------------------------------------
6035 | Returns 1 if the extended double-precision floating-point value `a' is less
6036 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
6037 | do not cause an exception.  Otherwise, the comparison is performed according
6038 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6039 *----------------------------------------------------------------------------*/
6040 
6041 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
6042 {
6043     flag aSign, bSign;
6044 
6045     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6046         float_raise(float_flag_invalid, status);
6047         return 0;
6048     }
6049     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6050               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6051          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6052               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6053        ) {
6054         if (floatx80_is_signaling_nan(a, status)
6055          || floatx80_is_signaling_nan(b, status)) {
6056             float_raise(float_flag_invalid, status);
6057         }
6058         return 0;
6059     }
6060     aSign = extractFloatx80Sign( a );
6061     bSign = extractFloatx80Sign( b );
6062     if ( aSign != bSign ) {
6063         return
6064                aSign
6065             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6066                  == 0 );
6067     }
6068     return
6069           aSign ? le128( b.high, b.low, a.high, a.low )
6070         : le128( a.high, a.low, b.high, b.low );
6071 
6072 }
6073 
6074 /*----------------------------------------------------------------------------
6075 | Returns 1 if the extended double-precision floating-point value `a' is less
6076 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
6077 | an exception.  Otherwise, the comparison is performed according to the
6078 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6079 *----------------------------------------------------------------------------*/
6080 
6081 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
6082 {
6083     flag aSign, bSign;
6084 
6085     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6086         float_raise(float_flag_invalid, status);
6087         return 0;
6088     }
6089     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6090               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6091          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6092               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6093        ) {
6094         if (floatx80_is_signaling_nan(a, status)
6095          || floatx80_is_signaling_nan(b, status)) {
6096             float_raise(float_flag_invalid, status);
6097         }
6098         return 0;
6099     }
6100     aSign = extractFloatx80Sign( a );
6101     bSign = extractFloatx80Sign( b );
6102     if ( aSign != bSign ) {
6103         return
6104                aSign
6105             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6106                  != 0 );
6107     }
6108     return
6109           aSign ? lt128( b.high, b.low, a.high, a.low )
6110         : lt128( a.high, a.low, b.high, b.low );
6111 
6112 }
6113 
6114 /*----------------------------------------------------------------------------
6115 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6116 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
6117 | The comparison is performed according to the IEC/IEEE Standard for Binary
6118 | Floating-Point Arithmetic.
6119 *----------------------------------------------------------------------------*/
6120 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
6121 {
6122     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6123         float_raise(float_flag_invalid, status);
6124         return 1;
6125     }
6126     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6127               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6128          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6129               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6130        ) {
6131         if (floatx80_is_signaling_nan(a, status)
6132          || floatx80_is_signaling_nan(b, status)) {
6133             float_raise(float_flag_invalid, status);
6134         }
6135         return 1;
6136     }
6137     return 0;
6138 }
6139 
6140 /*----------------------------------------------------------------------------
6141 | Returns the result of converting the quadruple-precision floating-point
6142 | value `a' to the 32-bit two's complement integer format.  The conversion
6143 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6144 | Arithmetic---which means in particular that the conversion is rounded
6145 | according to the current rounding mode.  If `a' is a NaN, the largest
6146 | positive integer is returned.  Otherwise, if the conversion overflows, the
6147 | largest integer with the same sign as `a' is returned.
6148 *----------------------------------------------------------------------------*/
6149 
6150 int32_t float128_to_int32(float128 a, float_status *status)
6151 {
6152     flag aSign;
6153     int32_t aExp, shiftCount;
6154     uint64_t aSig0, aSig1;
6155 
6156     aSig1 = extractFloat128Frac1( a );
6157     aSig0 = extractFloat128Frac0( a );
6158     aExp = extractFloat128Exp( a );
6159     aSign = extractFloat128Sign( a );
6160     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6161     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6162     aSig0 |= ( aSig1 != 0 );
6163     shiftCount = 0x4028 - aExp;
6164     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6165     return roundAndPackInt32(aSign, aSig0, status);
6166 
6167 }
6168 
6169 /*----------------------------------------------------------------------------
6170 | Returns the result of converting the quadruple-precision floating-point
6171 | value `a' to the 32-bit two's complement integer format.  The conversion
6172 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6173 | Arithmetic, except that the conversion is always rounded toward zero.  If
6174 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6175 | conversion overflows, the largest integer with the same sign as `a' is
6176 | returned.
6177 *----------------------------------------------------------------------------*/
6178 
6179 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6180 {
6181     flag aSign;
6182     int32_t aExp, shiftCount;
6183     uint64_t aSig0, aSig1, savedASig;
6184     int32_t z;
6185 
6186     aSig1 = extractFloat128Frac1( a );
6187     aSig0 = extractFloat128Frac0( a );
6188     aExp = extractFloat128Exp( a );
6189     aSign = extractFloat128Sign( a );
6190     aSig0 |= ( aSig1 != 0 );
6191     if ( 0x401E < aExp ) {
6192         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6193         goto invalid;
6194     }
6195     else if ( aExp < 0x3FFF ) {
6196         if (aExp || aSig0) {
6197             status->float_exception_flags |= float_flag_inexact;
6198         }
6199         return 0;
6200     }
6201     aSig0 |= LIT64( 0x0001000000000000 );
6202     shiftCount = 0x402F - aExp;
6203     savedASig = aSig0;
6204     aSig0 >>= shiftCount;
6205     z = aSig0;
6206     if ( aSign ) z = - z;
6207     if ( ( z < 0 ) ^ aSign ) {
6208  invalid:
6209         float_raise(float_flag_invalid, status);
6210         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
6211     }
6212     if ( ( aSig0<<shiftCount ) != savedASig ) {
6213         status->float_exception_flags |= float_flag_inexact;
6214     }
6215     return z;
6216 
6217 }
6218 
6219 /*----------------------------------------------------------------------------
6220 | Returns the result of converting the quadruple-precision floating-point
6221 | value `a' to the 64-bit two's complement integer format.  The conversion
6222 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6223 | Arithmetic---which means in particular that the conversion is rounded
6224 | according to the current rounding mode.  If `a' is a NaN, the largest
6225 | positive integer is returned.  Otherwise, if the conversion overflows, the
6226 | largest integer with the same sign as `a' is returned.
6227 *----------------------------------------------------------------------------*/
6228 
6229 int64_t float128_to_int64(float128 a, float_status *status)
6230 {
6231     flag aSign;
6232     int32_t aExp, shiftCount;
6233     uint64_t aSig0, aSig1;
6234 
6235     aSig1 = extractFloat128Frac1( a );
6236     aSig0 = extractFloat128Frac0( a );
6237     aExp = extractFloat128Exp( a );
6238     aSign = extractFloat128Sign( a );
6239     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6240     shiftCount = 0x402F - aExp;
6241     if ( shiftCount <= 0 ) {
6242         if ( 0x403E < aExp ) {
6243             float_raise(float_flag_invalid, status);
6244             if (    ! aSign
6245                  || (    ( aExp == 0x7FFF )
6246                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
6247                     )
6248                ) {
6249                 return LIT64( 0x7FFFFFFFFFFFFFFF );
6250             }
6251             return (int64_t) LIT64( 0x8000000000000000 );
6252         }
6253         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6254     }
6255     else {
6256         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6257     }
6258     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6259 
6260 }
6261 
6262 /*----------------------------------------------------------------------------
6263 | Returns the result of converting the quadruple-precision floating-point
6264 | value `a' to the 64-bit two's complement integer format.  The conversion
6265 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6266 | Arithmetic, except that the conversion is always rounded toward zero.
6267 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6268 | the conversion overflows, the largest integer with the same sign as `a' is
6269 | returned.
6270 *----------------------------------------------------------------------------*/
6271 
6272 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6273 {
6274     flag aSign;
6275     int32_t aExp, shiftCount;
6276     uint64_t aSig0, aSig1;
6277     int64_t z;
6278 
6279     aSig1 = extractFloat128Frac1( a );
6280     aSig0 = extractFloat128Frac0( a );
6281     aExp = extractFloat128Exp( a );
6282     aSign = extractFloat128Sign( a );
6283     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6284     shiftCount = aExp - 0x402F;
6285     if ( 0 < shiftCount ) {
6286         if ( 0x403E <= aExp ) {
6287             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
6288             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
6289                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
6290                 if (aSig1) {
6291                     status->float_exception_flags |= float_flag_inexact;
6292                 }
6293             }
6294             else {
6295                 float_raise(float_flag_invalid, status);
6296                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6297                     return LIT64( 0x7FFFFFFFFFFFFFFF );
6298                 }
6299             }
6300             return (int64_t) LIT64( 0x8000000000000000 );
6301         }
6302         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6303         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6304             status->float_exception_flags |= float_flag_inexact;
6305         }
6306     }
6307     else {
6308         if ( aExp < 0x3FFF ) {
6309             if ( aExp | aSig0 | aSig1 ) {
6310                 status->float_exception_flags |= float_flag_inexact;
6311             }
6312             return 0;
6313         }
6314         z = aSig0>>( - shiftCount );
6315         if (    aSig1
6316              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6317             status->float_exception_flags |= float_flag_inexact;
6318         }
6319     }
6320     if ( aSign ) z = - z;
6321     return z;
6322 
6323 }
6324 
6325 /*----------------------------------------------------------------------------
6326 | Returns the result of converting the quadruple-precision floating-point value
6327 | `a' to the 64-bit unsigned integer format.  The conversion is
6328 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6329 | Arithmetic---which means in particular that the conversion is rounded
6330 | according to the current rounding mode.  If `a' is a NaN, the largest
6331 | positive integer is returned.  If the conversion overflows, the
6332 | largest unsigned integer is returned.  If 'a' is negative, the value is
6333 | rounded and zero is returned; negative values that do not round to zero
6334 | will raise the inexact exception.
6335 *----------------------------------------------------------------------------*/
6336 
6337 uint64_t float128_to_uint64(float128 a, float_status *status)
6338 {
6339     flag aSign;
6340     int aExp;
6341     int shiftCount;
6342     uint64_t aSig0, aSig1;
6343 
6344     aSig0 = extractFloat128Frac0(a);
6345     aSig1 = extractFloat128Frac1(a);
6346     aExp = extractFloat128Exp(a);
6347     aSign = extractFloat128Sign(a);
6348     if (aSign && (aExp > 0x3FFE)) {
6349         float_raise(float_flag_invalid, status);
6350         if (float128_is_any_nan(a)) {
6351             return LIT64(0xFFFFFFFFFFFFFFFF);
6352         } else {
6353             return 0;
6354         }
6355     }
6356     if (aExp) {
6357         aSig0 |= LIT64(0x0001000000000000);
6358     }
6359     shiftCount = 0x402F - aExp;
6360     if (shiftCount <= 0) {
6361         if (0x403E < aExp) {
6362             float_raise(float_flag_invalid, status);
6363             return LIT64(0xFFFFFFFFFFFFFFFF);
6364         }
6365         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6366     } else {
6367         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6368     }
6369     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6370 }
6371 
6372 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6373 {
6374     uint64_t v;
6375     signed char current_rounding_mode = status->float_rounding_mode;
6376 
6377     set_float_rounding_mode(float_round_to_zero, status);
6378     v = float128_to_uint64(a, status);
6379     set_float_rounding_mode(current_rounding_mode, status);
6380 
6381     return v;
6382 }
6383 
6384 /*----------------------------------------------------------------------------
6385 | Returns the result of converting the quadruple-precision floating-point
6386 | value `a' to the 32-bit unsigned integer format.  The conversion
6387 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6388 | Arithmetic except that the conversion is always rounded toward zero.
6389 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6390 | if the conversion overflows, the largest unsigned integer is returned.
6391 | If 'a' is negative, the value is rounded and zero is returned; negative
6392 | values that do not round to zero will raise the inexact exception.
6393 *----------------------------------------------------------------------------*/
6394 
6395 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6396 {
6397     uint64_t v;
6398     uint32_t res;
6399     int old_exc_flags = get_float_exception_flags(status);
6400 
6401     v = float128_to_uint64_round_to_zero(a, status);
6402     if (v > 0xffffffff) {
6403         res = 0xffffffff;
6404     } else {
6405         return v;
6406     }
6407     set_float_exception_flags(old_exc_flags, status);
6408     float_raise(float_flag_invalid, status);
6409     return res;
6410 }
6411 
6412 /*----------------------------------------------------------------------------
6413 | Returns the result of converting the quadruple-precision floating-point
6414 | value `a' to the single-precision floating-point format.  The conversion
6415 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6416 | Arithmetic.
6417 *----------------------------------------------------------------------------*/
6418 
6419 float32 float128_to_float32(float128 a, float_status *status)
6420 {
6421     flag aSign;
6422     int32_t aExp;
6423     uint64_t aSig0, aSig1;
6424     uint32_t zSig;
6425 
6426     aSig1 = extractFloat128Frac1( a );
6427     aSig0 = extractFloat128Frac0( a );
6428     aExp = extractFloat128Exp( a );
6429     aSign = extractFloat128Sign( a );
6430     if ( aExp == 0x7FFF ) {
6431         if ( aSig0 | aSig1 ) {
6432             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6433         }
6434         return packFloat32( aSign, 0xFF, 0 );
6435     }
6436     aSig0 |= ( aSig1 != 0 );
6437     shift64RightJamming( aSig0, 18, &aSig0 );
6438     zSig = aSig0;
6439     if ( aExp || zSig ) {
6440         zSig |= 0x40000000;
6441         aExp -= 0x3F81;
6442     }
6443     return roundAndPackFloat32(aSign, aExp, zSig, status);
6444 
6445 }
6446 
6447 /*----------------------------------------------------------------------------
6448 | Returns the result of converting the quadruple-precision floating-point
6449 | value `a' to the double-precision floating-point format.  The conversion
6450 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6451 | Arithmetic.
6452 *----------------------------------------------------------------------------*/
6453 
6454 float64 float128_to_float64(float128 a, float_status *status)
6455 {
6456     flag aSign;
6457     int32_t aExp;
6458     uint64_t aSig0, aSig1;
6459 
6460     aSig1 = extractFloat128Frac1( a );
6461     aSig0 = extractFloat128Frac0( a );
6462     aExp = extractFloat128Exp( a );
6463     aSign = extractFloat128Sign( a );
6464     if ( aExp == 0x7FFF ) {
6465         if ( aSig0 | aSig1 ) {
6466             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6467         }
6468         return packFloat64( aSign, 0x7FF, 0 );
6469     }
6470     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6471     aSig0 |= ( aSig1 != 0 );
6472     if ( aExp || aSig0 ) {
6473         aSig0 |= LIT64( 0x4000000000000000 );
6474         aExp -= 0x3C01;
6475     }
6476     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6477 
6478 }
6479 
6480 /*----------------------------------------------------------------------------
6481 | Returns the result of converting the quadruple-precision floating-point
6482 | value `a' to the extended double-precision floating-point format.  The
6483 | conversion is performed according to the IEC/IEEE Standard for Binary
6484 | Floating-Point Arithmetic.
6485 *----------------------------------------------------------------------------*/
6486 
6487 floatx80 float128_to_floatx80(float128 a, float_status *status)
6488 {
6489     flag aSign;
6490     int32_t aExp;
6491     uint64_t aSig0, aSig1;
6492 
6493     aSig1 = extractFloat128Frac1( a );
6494     aSig0 = extractFloat128Frac0( a );
6495     aExp = extractFloat128Exp( a );
6496     aSign = extractFloat128Sign( a );
6497     if ( aExp == 0x7FFF ) {
6498         if ( aSig0 | aSig1 ) {
6499             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6500         }
6501         return packFloatx80(aSign, floatx80_infinity_high,
6502                                    floatx80_infinity_low);
6503     }
6504     if ( aExp == 0 ) {
6505         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6506         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6507     }
6508     else {
6509         aSig0 |= LIT64( 0x0001000000000000 );
6510     }
6511     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6512     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6513 
6514 }
6515 
6516 /*----------------------------------------------------------------------------
6517 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6518 | returns the result as a quadruple-precision floating-point value.  The
6519 | operation is performed according to the IEC/IEEE Standard for Binary
6520 | Floating-Point Arithmetic.
6521 *----------------------------------------------------------------------------*/
6522 
6523 float128 float128_round_to_int(float128 a, float_status *status)
6524 {
6525     flag aSign;
6526     int32_t aExp;
6527     uint64_t lastBitMask, roundBitsMask;
6528     float128 z;
6529 
6530     aExp = extractFloat128Exp( a );
6531     if ( 0x402F <= aExp ) {
6532         if ( 0x406F <= aExp ) {
6533             if (    ( aExp == 0x7FFF )
6534                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6535                ) {
6536                 return propagateFloat128NaN(a, a, status);
6537             }
6538             return a;
6539         }
6540         lastBitMask = 1;
6541         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6542         roundBitsMask = lastBitMask - 1;
6543         z = a;
6544         switch (status->float_rounding_mode) {
6545         case float_round_nearest_even:
6546             if ( lastBitMask ) {
6547                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6548                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6549             }
6550             else {
6551                 if ( (int64_t) z.low < 0 ) {
6552                     ++z.high;
6553                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6554                 }
6555             }
6556             break;
6557         case float_round_ties_away:
6558             if (lastBitMask) {
6559                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6560             } else {
6561                 if ((int64_t) z.low < 0) {
6562                     ++z.high;
6563                 }
6564             }
6565             break;
6566         case float_round_to_zero:
6567             break;
6568         case float_round_up:
6569             if (!extractFloat128Sign(z)) {
6570                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6571             }
6572             break;
6573         case float_round_down:
6574             if (extractFloat128Sign(z)) {
6575                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6576             }
6577             break;
6578         default:
6579             abort();
6580         }
6581         z.low &= ~ roundBitsMask;
6582     }
6583     else {
6584         if ( aExp < 0x3FFF ) {
6585             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6586             status->float_exception_flags |= float_flag_inexact;
6587             aSign = extractFloat128Sign( a );
6588             switch (status->float_rounding_mode) {
6589              case float_round_nearest_even:
6590                 if (    ( aExp == 0x3FFE )
6591                      && (   extractFloat128Frac0( a )
6592                           | extractFloat128Frac1( a ) )
6593                    ) {
6594                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6595                 }
6596                 break;
6597             case float_round_ties_away:
6598                 if (aExp == 0x3FFE) {
6599                     return packFloat128(aSign, 0x3FFF, 0, 0);
6600                 }
6601                 break;
6602              case float_round_down:
6603                 return
6604                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6605                     : packFloat128( 0, 0, 0, 0 );
6606              case float_round_up:
6607                 return
6608                       aSign ? packFloat128( 1, 0, 0, 0 )
6609                     : packFloat128( 0, 0x3FFF, 0, 0 );
6610             }
6611             return packFloat128( aSign, 0, 0, 0 );
6612         }
6613         lastBitMask = 1;
6614         lastBitMask <<= 0x402F - aExp;
6615         roundBitsMask = lastBitMask - 1;
6616         z.low = 0;
6617         z.high = a.high;
6618         switch (status->float_rounding_mode) {
6619         case float_round_nearest_even:
6620             z.high += lastBitMask>>1;
6621             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6622                 z.high &= ~ lastBitMask;
6623             }
6624             break;
6625         case float_round_ties_away:
6626             z.high += lastBitMask>>1;
6627             break;
6628         case float_round_to_zero:
6629             break;
6630         case float_round_up:
6631             if (!extractFloat128Sign(z)) {
6632                 z.high |= ( a.low != 0 );
6633                 z.high += roundBitsMask;
6634             }
6635             break;
6636         case float_round_down:
6637             if (extractFloat128Sign(z)) {
6638                 z.high |= (a.low != 0);
6639                 z.high += roundBitsMask;
6640             }
6641             break;
6642         default:
6643             abort();
6644         }
6645         z.high &= ~ roundBitsMask;
6646     }
6647     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6648         status->float_exception_flags |= float_flag_inexact;
6649     }
6650     return z;
6651 
6652 }
6653 
6654 /*----------------------------------------------------------------------------
6655 | Returns the result of adding the absolute values of the quadruple-precision
6656 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6657 | before being returned.  `zSign' is ignored if the result is a NaN.
6658 | The addition is performed according to the IEC/IEEE Standard for Binary
6659 | Floating-Point Arithmetic.
6660 *----------------------------------------------------------------------------*/
6661 
6662 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6663                                 float_status *status)
6664 {
6665     int32_t aExp, bExp, zExp;
6666     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6667     int32_t expDiff;
6668 
6669     aSig1 = extractFloat128Frac1( a );
6670     aSig0 = extractFloat128Frac0( a );
6671     aExp = extractFloat128Exp( a );
6672     bSig1 = extractFloat128Frac1( b );
6673     bSig0 = extractFloat128Frac0( b );
6674     bExp = extractFloat128Exp( b );
6675     expDiff = aExp - bExp;
6676     if ( 0 < expDiff ) {
6677         if ( aExp == 0x7FFF ) {
6678             if (aSig0 | aSig1) {
6679                 return propagateFloat128NaN(a, b, status);
6680             }
6681             return a;
6682         }
6683         if ( bExp == 0 ) {
6684             --expDiff;
6685         }
6686         else {
6687             bSig0 |= LIT64( 0x0001000000000000 );
6688         }
6689         shift128ExtraRightJamming(
6690             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6691         zExp = aExp;
6692     }
6693     else if ( expDiff < 0 ) {
6694         if ( bExp == 0x7FFF ) {
6695             if (bSig0 | bSig1) {
6696                 return propagateFloat128NaN(a, b, status);
6697             }
6698             return packFloat128( zSign, 0x7FFF, 0, 0 );
6699         }
6700         if ( aExp == 0 ) {
6701             ++expDiff;
6702         }
6703         else {
6704             aSig0 |= LIT64( 0x0001000000000000 );
6705         }
6706         shift128ExtraRightJamming(
6707             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6708         zExp = bExp;
6709     }
6710     else {
6711         if ( aExp == 0x7FFF ) {
6712             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6713                 return propagateFloat128NaN(a, b, status);
6714             }
6715             return a;
6716         }
6717         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6718         if ( aExp == 0 ) {
6719             if (status->flush_to_zero) {
6720                 if (zSig0 | zSig1) {
6721                     float_raise(float_flag_output_denormal, status);
6722                 }
6723                 return packFloat128(zSign, 0, 0, 0);
6724             }
6725             return packFloat128( zSign, 0, zSig0, zSig1 );
6726         }
6727         zSig2 = 0;
6728         zSig0 |= LIT64( 0x0002000000000000 );
6729         zExp = aExp;
6730         goto shiftRight1;
6731     }
6732     aSig0 |= LIT64( 0x0001000000000000 );
6733     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6734     --zExp;
6735     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6736     ++zExp;
6737  shiftRight1:
6738     shift128ExtraRightJamming(
6739         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6740  roundAndPack:
6741     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6742 
6743 }
6744 
6745 /*----------------------------------------------------------------------------
6746 | Returns the result of subtracting the absolute values of the quadruple-
6747 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6748 | difference is negated before being returned.  `zSign' is ignored if the
6749 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6750 | Standard for Binary Floating-Point Arithmetic.
6751 *----------------------------------------------------------------------------*/
6752 
6753 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6754                                 float_status *status)
6755 {
6756     int32_t aExp, bExp, zExp;
6757     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6758     int32_t expDiff;
6759 
6760     aSig1 = extractFloat128Frac1( a );
6761     aSig0 = extractFloat128Frac0( a );
6762     aExp = extractFloat128Exp( a );
6763     bSig1 = extractFloat128Frac1( b );
6764     bSig0 = extractFloat128Frac0( b );
6765     bExp = extractFloat128Exp( b );
6766     expDiff = aExp - bExp;
6767     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6768     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6769     if ( 0 < expDiff ) goto aExpBigger;
6770     if ( expDiff < 0 ) goto bExpBigger;
6771     if ( aExp == 0x7FFF ) {
6772         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6773             return propagateFloat128NaN(a, b, status);
6774         }
6775         float_raise(float_flag_invalid, status);
6776         return float128_default_nan(status);
6777     }
6778     if ( aExp == 0 ) {
6779         aExp = 1;
6780         bExp = 1;
6781     }
6782     if ( bSig0 < aSig0 ) goto aBigger;
6783     if ( aSig0 < bSig0 ) goto bBigger;
6784     if ( bSig1 < aSig1 ) goto aBigger;
6785     if ( aSig1 < bSig1 ) goto bBigger;
6786     return packFloat128(status->float_rounding_mode == float_round_down,
6787                         0, 0, 0);
6788  bExpBigger:
6789     if ( bExp == 0x7FFF ) {
6790         if (bSig0 | bSig1) {
6791             return propagateFloat128NaN(a, b, status);
6792         }
6793         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6794     }
6795     if ( aExp == 0 ) {
6796         ++expDiff;
6797     }
6798     else {
6799         aSig0 |= LIT64( 0x4000000000000000 );
6800     }
6801     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6802     bSig0 |= LIT64( 0x4000000000000000 );
6803  bBigger:
6804     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6805     zExp = bExp;
6806     zSign ^= 1;
6807     goto normalizeRoundAndPack;
6808  aExpBigger:
6809     if ( aExp == 0x7FFF ) {
6810         if (aSig0 | aSig1) {
6811             return propagateFloat128NaN(a, b, status);
6812         }
6813         return a;
6814     }
6815     if ( bExp == 0 ) {
6816         --expDiff;
6817     }
6818     else {
6819         bSig0 |= LIT64( 0x4000000000000000 );
6820     }
6821     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6822     aSig0 |= LIT64( 0x4000000000000000 );
6823  aBigger:
6824     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6825     zExp = aExp;
6826  normalizeRoundAndPack:
6827     --zExp;
6828     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6829                                          status);
6830 
6831 }
6832 
6833 /*----------------------------------------------------------------------------
6834 | Returns the result of adding the quadruple-precision floating-point values
6835 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6836 | for Binary Floating-Point Arithmetic.
6837 *----------------------------------------------------------------------------*/
6838 
6839 float128 float128_add(float128 a, float128 b, float_status *status)
6840 {
6841     flag aSign, bSign;
6842 
6843     aSign = extractFloat128Sign( a );
6844     bSign = extractFloat128Sign( b );
6845     if ( aSign == bSign ) {
6846         return addFloat128Sigs(a, b, aSign, status);
6847     }
6848     else {
6849         return subFloat128Sigs(a, b, aSign, status);
6850     }
6851 
6852 }
6853 
6854 /*----------------------------------------------------------------------------
6855 | Returns the result of subtracting the quadruple-precision floating-point
6856 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6857 | Standard for Binary Floating-Point Arithmetic.
6858 *----------------------------------------------------------------------------*/
6859 
6860 float128 float128_sub(float128 a, float128 b, float_status *status)
6861 {
6862     flag aSign, bSign;
6863 
6864     aSign = extractFloat128Sign( a );
6865     bSign = extractFloat128Sign( b );
6866     if ( aSign == bSign ) {
6867         return subFloat128Sigs(a, b, aSign, status);
6868     }
6869     else {
6870         return addFloat128Sigs(a, b, aSign, status);
6871     }
6872 
6873 }
6874 
6875 /*----------------------------------------------------------------------------
6876 | Returns the result of multiplying the quadruple-precision floating-point
6877 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6878 | Standard for Binary Floating-Point Arithmetic.
6879 *----------------------------------------------------------------------------*/
6880 
6881 float128 float128_mul(float128 a, float128 b, float_status *status)
6882 {
6883     flag aSign, bSign, zSign;
6884     int32_t aExp, bExp, zExp;
6885     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6886 
6887     aSig1 = extractFloat128Frac1( a );
6888     aSig0 = extractFloat128Frac0( a );
6889     aExp = extractFloat128Exp( a );
6890     aSign = extractFloat128Sign( a );
6891     bSig1 = extractFloat128Frac1( b );
6892     bSig0 = extractFloat128Frac0( b );
6893     bExp = extractFloat128Exp( b );
6894     bSign = extractFloat128Sign( b );
6895     zSign = aSign ^ bSign;
6896     if ( aExp == 0x7FFF ) {
6897         if (    ( aSig0 | aSig1 )
6898              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6899             return propagateFloat128NaN(a, b, status);
6900         }
6901         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6902         return packFloat128( zSign, 0x7FFF, 0, 0 );
6903     }
6904     if ( bExp == 0x7FFF ) {
6905         if (bSig0 | bSig1) {
6906             return propagateFloat128NaN(a, b, status);
6907         }
6908         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6909  invalid:
6910             float_raise(float_flag_invalid, status);
6911             return float128_default_nan(status);
6912         }
6913         return packFloat128( zSign, 0x7FFF, 0, 0 );
6914     }
6915     if ( aExp == 0 ) {
6916         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6917         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6918     }
6919     if ( bExp == 0 ) {
6920         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6921         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6922     }
6923     zExp = aExp + bExp - 0x4000;
6924     aSig0 |= LIT64( 0x0001000000000000 );
6925     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6926     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6927     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6928     zSig2 |= ( zSig3 != 0 );
6929     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6930         shift128ExtraRightJamming(
6931             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6932         ++zExp;
6933     }
6934     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6935 
6936 }
6937 
6938 /*----------------------------------------------------------------------------
6939 | Returns the result of dividing the quadruple-precision floating-point value
6940 | `a' by the corresponding value `b'.  The operation is performed according to
6941 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6942 *----------------------------------------------------------------------------*/
6943 
6944 float128 float128_div(float128 a, float128 b, float_status *status)
6945 {
6946     flag aSign, bSign, zSign;
6947     int32_t aExp, bExp, zExp;
6948     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6949     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6950 
6951     aSig1 = extractFloat128Frac1( a );
6952     aSig0 = extractFloat128Frac0( a );
6953     aExp = extractFloat128Exp( a );
6954     aSign = extractFloat128Sign( a );
6955     bSig1 = extractFloat128Frac1( b );
6956     bSig0 = extractFloat128Frac0( b );
6957     bExp = extractFloat128Exp( b );
6958     bSign = extractFloat128Sign( b );
6959     zSign = aSign ^ bSign;
6960     if ( aExp == 0x7FFF ) {
6961         if (aSig0 | aSig1) {
6962             return propagateFloat128NaN(a, b, status);
6963         }
6964         if ( bExp == 0x7FFF ) {
6965             if (bSig0 | bSig1) {
6966                 return propagateFloat128NaN(a, b, status);
6967             }
6968             goto invalid;
6969         }
6970         return packFloat128( zSign, 0x7FFF, 0, 0 );
6971     }
6972     if ( bExp == 0x7FFF ) {
6973         if (bSig0 | bSig1) {
6974             return propagateFloat128NaN(a, b, status);
6975         }
6976         return packFloat128( zSign, 0, 0, 0 );
6977     }
6978     if ( bExp == 0 ) {
6979         if ( ( bSig0 | bSig1 ) == 0 ) {
6980             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6981  invalid:
6982                 float_raise(float_flag_invalid, status);
6983                 return float128_default_nan(status);
6984             }
6985             float_raise(float_flag_divbyzero, status);
6986             return packFloat128( zSign, 0x7FFF, 0, 0 );
6987         }
6988         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6989     }
6990     if ( aExp == 0 ) {
6991         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6992         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6993     }
6994     zExp = aExp - bExp + 0x3FFD;
6995     shortShift128Left(
6996         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6997     shortShift128Left(
6998         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6999     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
7000         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
7001         ++zExp;
7002     }
7003     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
7004     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
7005     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
7006     while ( (int64_t) rem0 < 0 ) {
7007         --zSig0;
7008         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
7009     }
7010     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
7011     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
7012         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
7013         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
7014         while ( (int64_t) rem1 < 0 ) {
7015             --zSig1;
7016             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
7017         }
7018         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7019     }
7020     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
7021     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7022 
7023 }
7024 
7025 /*----------------------------------------------------------------------------
7026 | Returns the remainder of the quadruple-precision floating-point value `a'
7027 | with respect to the corresponding value `b'.  The operation is performed
7028 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7029 *----------------------------------------------------------------------------*/
7030 
7031 float128 float128_rem(float128 a, float128 b, float_status *status)
7032 {
7033     flag aSign, zSign;
7034     int32_t aExp, bExp, expDiff;
7035     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
7036     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
7037     int64_t sigMean0;
7038 
7039     aSig1 = extractFloat128Frac1( a );
7040     aSig0 = extractFloat128Frac0( a );
7041     aExp = extractFloat128Exp( a );
7042     aSign = extractFloat128Sign( a );
7043     bSig1 = extractFloat128Frac1( b );
7044     bSig0 = extractFloat128Frac0( b );
7045     bExp = extractFloat128Exp( b );
7046     if ( aExp == 0x7FFF ) {
7047         if (    ( aSig0 | aSig1 )
7048              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7049             return propagateFloat128NaN(a, b, status);
7050         }
7051         goto invalid;
7052     }
7053     if ( bExp == 0x7FFF ) {
7054         if (bSig0 | bSig1) {
7055             return propagateFloat128NaN(a, b, status);
7056         }
7057         return a;
7058     }
7059     if ( bExp == 0 ) {
7060         if ( ( bSig0 | bSig1 ) == 0 ) {
7061  invalid:
7062             float_raise(float_flag_invalid, status);
7063             return float128_default_nan(status);
7064         }
7065         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7066     }
7067     if ( aExp == 0 ) {
7068         if ( ( aSig0 | aSig1 ) == 0 ) return a;
7069         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7070     }
7071     expDiff = aExp - bExp;
7072     if ( expDiff < -1 ) return a;
7073     shortShift128Left(
7074         aSig0 | LIT64( 0x0001000000000000 ),
7075         aSig1,
7076         15 - ( expDiff < 0 ),
7077         &aSig0,
7078         &aSig1
7079     );
7080     shortShift128Left(
7081         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
7082     q = le128( bSig0, bSig1, aSig0, aSig1 );
7083     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7084     expDiff -= 64;
7085     while ( 0 < expDiff ) {
7086         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7087         q = ( 4 < q ) ? q - 4 : 0;
7088         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7089         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
7090         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
7091         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
7092         expDiff -= 61;
7093     }
7094     if ( -64 < expDiff ) {
7095         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7096         q = ( 4 < q ) ? q - 4 : 0;
7097         q >>= - expDiff;
7098         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7099         expDiff += 52;
7100         if ( expDiff < 0 ) {
7101             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7102         }
7103         else {
7104             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
7105         }
7106         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7107         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
7108     }
7109     else {
7110         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
7111         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7112     }
7113     do {
7114         alternateASig0 = aSig0;
7115         alternateASig1 = aSig1;
7116         ++q;
7117         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7118     } while ( 0 <= (int64_t) aSig0 );
7119     add128(
7120         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
7121     if (    ( sigMean0 < 0 )
7122          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
7123         aSig0 = alternateASig0;
7124         aSig1 = alternateASig1;
7125     }
7126     zSign = ( (int64_t) aSig0 < 0 );
7127     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
7128     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
7129                                          status);
7130 }
7131 
7132 /*----------------------------------------------------------------------------
7133 | Returns the square root of the quadruple-precision floating-point value `a'.
7134 | The operation is performed according to the IEC/IEEE Standard for Binary
7135 | Floating-Point Arithmetic.
7136 *----------------------------------------------------------------------------*/
7137 
7138 float128 float128_sqrt(float128 a, float_status *status)
7139 {
7140     flag aSign;
7141     int32_t aExp, zExp;
7142     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
7143     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7144 
7145     aSig1 = extractFloat128Frac1( a );
7146     aSig0 = extractFloat128Frac0( a );
7147     aExp = extractFloat128Exp( a );
7148     aSign = extractFloat128Sign( a );
7149     if ( aExp == 0x7FFF ) {
7150         if (aSig0 | aSig1) {
7151             return propagateFloat128NaN(a, a, status);
7152         }
7153         if ( ! aSign ) return a;
7154         goto invalid;
7155     }
7156     if ( aSign ) {
7157         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
7158  invalid:
7159         float_raise(float_flag_invalid, status);
7160         return float128_default_nan(status);
7161     }
7162     if ( aExp == 0 ) {
7163         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7164         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7165     }
7166     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7167     aSig0 |= LIT64( 0x0001000000000000 );
7168     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7169     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7170     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7171     doubleZSig0 = zSig0<<1;
7172     mul64To128( zSig0, zSig0, &term0, &term1 );
7173     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7174     while ( (int64_t) rem0 < 0 ) {
7175         --zSig0;
7176         doubleZSig0 -= 2;
7177         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7178     }
7179     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7180     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7181         if ( zSig1 == 0 ) zSig1 = 1;
7182         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7183         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7184         mul64To128( zSig1, zSig1, &term2, &term3 );
7185         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7186         while ( (int64_t) rem1 < 0 ) {
7187             --zSig1;
7188             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7189             term3 |= 1;
7190             term2 |= doubleZSig0;
7191             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7192         }
7193         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7194     }
7195     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7196     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7197 
7198 }
7199 
7200 /*----------------------------------------------------------------------------
7201 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7202 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7203 | raised if either operand is a NaN.  Otherwise, the comparison is performed
7204 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7205 *----------------------------------------------------------------------------*/
7206 
7207 int float128_eq(float128 a, float128 b, float_status *status)
7208 {
7209 
7210     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7211               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7212          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7213               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7214        ) {
7215         float_raise(float_flag_invalid, status);
7216         return 0;
7217     }
7218     return
7219            ( a.low == b.low )
7220         && (    ( a.high == b.high )
7221              || (    ( a.low == 0 )
7222                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7223            );
7224 
7225 }
7226 
7227 /*----------------------------------------------------------------------------
7228 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7229 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
7230 | exception is raised if either operand is a NaN.  The comparison is performed
7231 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7232 *----------------------------------------------------------------------------*/
7233 
7234 int float128_le(float128 a, float128 b, float_status *status)
7235 {
7236     flag aSign, bSign;
7237 
7238     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7239               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7240          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7241               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7242        ) {
7243         float_raise(float_flag_invalid, status);
7244         return 0;
7245     }
7246     aSign = extractFloat128Sign( a );
7247     bSign = extractFloat128Sign( b );
7248     if ( aSign != bSign ) {
7249         return
7250                aSign
7251             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7252                  == 0 );
7253     }
7254     return
7255           aSign ? le128( b.high, b.low, a.high, a.low )
7256         : le128( a.high, a.low, b.high, b.low );
7257 
7258 }
7259 
7260 /*----------------------------------------------------------------------------
7261 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7262 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7263 | raised if either operand is a NaN.  The comparison is performed according
7264 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7265 *----------------------------------------------------------------------------*/
7266 
7267 int float128_lt(float128 a, float128 b, float_status *status)
7268 {
7269     flag aSign, bSign;
7270 
7271     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7272               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7273          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7274               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7275        ) {
7276         float_raise(float_flag_invalid, status);
7277         return 0;
7278     }
7279     aSign = extractFloat128Sign( a );
7280     bSign = extractFloat128Sign( b );
7281     if ( aSign != bSign ) {
7282         return
7283                aSign
7284             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7285                  != 0 );
7286     }
7287     return
7288           aSign ? lt128( b.high, b.low, a.high, a.low )
7289         : lt128( a.high, a.low, b.high, b.low );
7290 
7291 }
7292 
7293 /*----------------------------------------------------------------------------
7294 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7295 | be compared, and 0 otherwise.  The invalid exception is raised if either
7296 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7297 | Standard for Binary Floating-Point Arithmetic.
7298 *----------------------------------------------------------------------------*/
7299 
7300 int float128_unordered(float128 a, float128 b, float_status *status)
7301 {
7302     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7303               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7304          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7305               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7306        ) {
7307         float_raise(float_flag_invalid, status);
7308         return 1;
7309     }
7310     return 0;
7311 }
7312 
7313 /*----------------------------------------------------------------------------
7314 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7315 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7316 | exception.  The comparison is performed according to the IEC/IEEE Standard
7317 | for Binary Floating-Point Arithmetic.
7318 *----------------------------------------------------------------------------*/
7319 
7320 int float128_eq_quiet(float128 a, float128 b, float_status *status)
7321 {
7322 
7323     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7324               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7325          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7326               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7327        ) {
7328         if (float128_is_signaling_nan(a, status)
7329          || float128_is_signaling_nan(b, status)) {
7330             float_raise(float_flag_invalid, status);
7331         }
7332         return 0;
7333     }
7334     return
7335            ( a.low == b.low )
7336         && (    ( a.high == b.high )
7337              || (    ( a.low == 0 )
7338                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7339            );
7340 
7341 }
7342 
7343 /*----------------------------------------------------------------------------
7344 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7345 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
7346 | cause an exception.  Otherwise, the comparison is performed according to the
7347 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7348 *----------------------------------------------------------------------------*/
7349 
7350 int float128_le_quiet(float128 a, float128 b, float_status *status)
7351 {
7352     flag aSign, bSign;
7353 
7354     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7355               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7356          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7357               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7358        ) {
7359         if (float128_is_signaling_nan(a, status)
7360          || float128_is_signaling_nan(b, status)) {
7361             float_raise(float_flag_invalid, status);
7362         }
7363         return 0;
7364     }
7365     aSign = extractFloat128Sign( a );
7366     bSign = extractFloat128Sign( b );
7367     if ( aSign != bSign ) {
7368         return
7369                aSign
7370             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7371                  == 0 );
7372     }
7373     return
7374           aSign ? le128( b.high, b.low, a.high, a.low )
7375         : le128( a.high, a.low, b.high, b.low );
7376 
7377 }
7378 
7379 /*----------------------------------------------------------------------------
7380 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7381 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7382 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7383 | Standard for Binary Floating-Point Arithmetic.
7384 *----------------------------------------------------------------------------*/
7385 
7386 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7387 {
7388     flag aSign, bSign;
7389 
7390     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7391               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7392          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7393               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7394        ) {
7395         if (float128_is_signaling_nan(a, status)
7396          || float128_is_signaling_nan(b, status)) {
7397             float_raise(float_flag_invalid, status);
7398         }
7399         return 0;
7400     }
7401     aSign = extractFloat128Sign( a );
7402     bSign = extractFloat128Sign( b );
7403     if ( aSign != bSign ) {
7404         return
7405                aSign
7406             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7407                  != 0 );
7408     }
7409     return
7410           aSign ? lt128( b.high, b.low, a.high, a.low )
7411         : lt128( a.high, a.low, b.high, b.low );
7412 
7413 }
7414 
7415 /*----------------------------------------------------------------------------
7416 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7417 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7418 | comparison is performed according to the IEC/IEEE Standard for Binary
7419 | Floating-Point Arithmetic.
7420 *----------------------------------------------------------------------------*/
7421 
7422 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7423 {
7424     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7425               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7426          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7427               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7428        ) {
7429         if (float128_is_signaling_nan(a, status)
7430          || float128_is_signaling_nan(b, status)) {
7431             float_raise(float_flag_invalid, status);
7432         }
7433         return 1;
7434     }
7435     return 0;
7436 }
7437 
7438 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7439                                             int is_quiet, float_status *status)
7440 {
7441     flag aSign, bSign;
7442 
7443     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7444         float_raise(float_flag_invalid, status);
7445         return float_relation_unordered;
7446     }
7447     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7448           ( extractFloatx80Frac( a )<<1 ) ) ||
7449         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7450           ( extractFloatx80Frac( b )<<1 ) )) {
7451         if (!is_quiet ||
7452             floatx80_is_signaling_nan(a, status) ||
7453             floatx80_is_signaling_nan(b, status)) {
7454             float_raise(float_flag_invalid, status);
7455         }
7456         return float_relation_unordered;
7457     }
7458     aSign = extractFloatx80Sign( a );
7459     bSign = extractFloatx80Sign( b );
7460     if ( aSign != bSign ) {
7461 
7462         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7463              ( ( a.low | b.low ) == 0 ) ) {
7464             /* zero case */
7465             return float_relation_equal;
7466         } else {
7467             return 1 - (2 * aSign);
7468         }
7469     } else {
7470         if (a.low == b.low && a.high == b.high) {
7471             return float_relation_equal;
7472         } else {
7473             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7474         }
7475     }
7476 }
7477 
7478 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7479 {
7480     return floatx80_compare_internal(a, b, 0, status);
7481 }
7482 
7483 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7484 {
7485     return floatx80_compare_internal(a, b, 1, status);
7486 }
7487 
7488 static inline int float128_compare_internal(float128 a, float128 b,
7489                                             int is_quiet, float_status *status)
7490 {
7491     flag aSign, bSign;
7492 
7493     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7494           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7495         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7496           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7497         if (!is_quiet ||
7498             float128_is_signaling_nan(a, status) ||
7499             float128_is_signaling_nan(b, status)) {
7500             float_raise(float_flag_invalid, status);
7501         }
7502         return float_relation_unordered;
7503     }
7504     aSign = extractFloat128Sign( a );
7505     bSign = extractFloat128Sign( b );
7506     if ( aSign != bSign ) {
7507         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7508             /* zero case */
7509             return float_relation_equal;
7510         } else {
7511             return 1 - (2 * aSign);
7512         }
7513     } else {
7514         if (a.low == b.low && a.high == b.high) {
7515             return float_relation_equal;
7516         } else {
7517             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7518         }
7519     }
7520 }
7521 
7522 int float128_compare(float128 a, float128 b, float_status *status)
7523 {
7524     return float128_compare_internal(a, b, 0, status);
7525 }
7526 
7527 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7528 {
7529     return float128_compare_internal(a, b, 1, status);
7530 }
7531 
7532 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7533 {
7534     flag aSign;
7535     int32_t aExp;
7536     uint64_t aSig;
7537 
7538     if (floatx80_invalid_encoding(a)) {
7539         float_raise(float_flag_invalid, status);
7540         return floatx80_default_nan(status);
7541     }
7542     aSig = extractFloatx80Frac( a );
7543     aExp = extractFloatx80Exp( a );
7544     aSign = extractFloatx80Sign( a );
7545 
7546     if ( aExp == 0x7FFF ) {
7547         if ( aSig<<1 ) {
7548             return propagateFloatx80NaN(a, a, status);
7549         }
7550         return a;
7551     }
7552 
7553     if (aExp == 0) {
7554         if (aSig == 0) {
7555             return a;
7556         }
7557         aExp++;
7558     }
7559 
7560     if (n > 0x10000) {
7561         n = 0x10000;
7562     } else if (n < -0x10000) {
7563         n = -0x10000;
7564     }
7565 
7566     aExp += n;
7567     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7568                                          aSign, aExp, aSig, 0, status);
7569 }
7570 
7571 float128 float128_scalbn(float128 a, int n, float_status *status)
7572 {
7573     flag aSign;
7574     int32_t aExp;
7575     uint64_t aSig0, aSig1;
7576 
7577     aSig1 = extractFloat128Frac1( a );
7578     aSig0 = extractFloat128Frac0( a );
7579     aExp = extractFloat128Exp( a );
7580     aSign = extractFloat128Sign( a );
7581     if ( aExp == 0x7FFF ) {
7582         if ( aSig0 | aSig1 ) {
7583             return propagateFloat128NaN(a, a, status);
7584         }
7585         return a;
7586     }
7587     if (aExp != 0) {
7588         aSig0 |= LIT64( 0x0001000000000000 );
7589     } else if (aSig0 == 0 && aSig1 == 0) {
7590         return a;
7591     } else {
7592         aExp++;
7593     }
7594 
7595     if (n > 0x10000) {
7596         n = 0x10000;
7597     } else if (n < -0x10000) {
7598         n = -0x10000;
7599     }
7600 
7601     aExp += n - 1;
7602     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7603                                          , status);
7604 
7605 }
7606