xref: /qemu/fpu/softfloat.c (revision 2dfabc86e656e835c67954c60e143ecd33e15817)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89 
90 /* We only need stdlib for abort() */
91 
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations.  (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98 
99 /*
100  * Hardfloat
101  *
102  * Fast emulation of guest FP instructions is challenging for two reasons.
103  * First, FP instruction semantics are similar but not identical, particularly
104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
105  * exception flags is not trivial: reading the host's flags register with a
106  * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107  * and trapping on every FP exception is not fast nor pleasant to work with.
108  *
109  * We address these challenges by leveraging the host FPU for a subset of the
110  * operations. To do this we expand on the idea presented in this paper:
111  *
112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114  *
115  * The idea is thus to leverage the host FPU to (1) compute FP operations
116  * and (2) identify whether FP exceptions occurred while avoiding
117  * expensive exception flag register accesses.
118  *
119  * An important optimization shown in the paper is that given that exception
120  * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121  * This is particularly useful for the inexact flag, which is very frequently
122  * raised in floating-point workloads.
123  *
124  * We optimize the code further by deferring to soft-fp whenever FP exception
125  * detection might get hairy. Two examples: (1) when at least one operand is
126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127  * and the result is < the minimum normal.
128  */
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
130     static inline void name(soft_t *a, float_status *s)                 \
131     {                                                                   \
132         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
133             *a = soft_t ## _set_sign(soft_t ## _zero,                   \
134                                      soft_t ## _is_neg(*a));            \
135             s->float_exception_flags |= float_flag_input_denormal;      \
136         }                                                               \
137     }
138 
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
141 #undef GEN_INPUT_FLUSH__NOCHECK
142 
143 #define GEN_INPUT_FLUSH1(name, soft_t)                  \
144     static inline void name(soft_t *a, float_status *s) \
145     {                                                   \
146         if (likely(!s->flush_inputs_to_zero)) {         \
147             return;                                     \
148         }                                               \
149         soft_t ## _input_flush__nocheck(a, s);          \
150     }
151 
152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
154 #undef GEN_INPUT_FLUSH1
155 
156 #define GEN_INPUT_FLUSH2(name, soft_t)                                  \
157     static inline void name(soft_t *a, soft_t *b, float_status *s)      \
158     {                                                                   \
159         if (likely(!s->flush_inputs_to_zero)) {                         \
160             return;                                                     \
161         }                                                               \
162         soft_t ## _input_flush__nocheck(a, s);                          \
163         soft_t ## _input_flush__nocheck(b, s);                          \
164     }
165 
166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
168 #undef GEN_INPUT_FLUSH2
169 
170 #define GEN_INPUT_FLUSH3(name, soft_t)                                  \
171     static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
172     {                                                                   \
173         if (likely(!s->flush_inputs_to_zero)) {                         \
174             return;                                                     \
175         }                                                               \
176         soft_t ## _input_flush__nocheck(a, s);                          \
177         soft_t ## _input_flush__nocheck(b, s);                          \
178         soft_t ## _input_flush__nocheck(c, s);                          \
179     }
180 
181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
183 #undef GEN_INPUT_FLUSH3
184 
/*
 * Choose whether to use fpclassify or float32/64_* primitives in the generated
 * hardfloat functions. Each combination of number of inputs and float size
 * gets its own value.
 *
 * The macros are named QEMU_HARDFLOAT_<n>F<bits>_USE_FP, where <n> is the
 * number of inputs and <bits> the float width. A value of 1 selects the
 * fpclassify() based checks; 0 selects the float{32,64}_* primitives.
 * Only x86_64 hosts enable fpclassify(), and only for the 64-bit variants.
 */
#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
/* Every other host: always use the softfloat primitives. */
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif
205 
/*
 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 * float{32,64}_is_infinity when !USE_FP.
 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 *
 * Hence isinf() is enabled only on x86_64 and aarch64 hosts; all other hosts
 * use the softfloat primitive.
 */
#if defined(__x86_64__) || defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF   1
#else
# define QEMU_HARDFLOAT_USE_ISINF   0
#endif
217 
/*
 * Some targets clear the FP flags before most FP operations. This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set.
 *
 * Hardfloat is also disabled, with a build warning, when compiling with
 * -ffast-math (__FAST_MATH__).
 *
 * QEMU_NO_HARDFLOAT: 1 when hardfloat must never be used, 0 otherwise.
 * QEMU_SOFTFLOAT_ATTR: QEMU_FLATTEN alone when hardfloat is disabled,
 * QEMU_FLATTEN plus noinline when it is enabled.
 * NOTE(review): presumably noinline keeps the softfloat fallbacks out of
 * line from the hardfloat fast path -- confirm against the users of this
 * attribute.
 */
#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# if defined(__FAST_MATH__)
#  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
# endif
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif
234 
235 static inline bool can_use_fpu(const float_status *s)
236 {
237     if (QEMU_NO_HARDFLOAT) {
238         return false;
239     }
240     return likely(s->float_exception_flags & float_flag_inexact &&
241                   s->float_rounding_mode == float_round_nearest_even);
242 }
243 
244 /*
245  * Hardfloat generation functions. Each operation can have two flavors:
246  * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247  * most condition checks, or native ones (e.g. fpclassify).
248  *
249  * The flavor is chosen by the callers. Instead of using macros, we rely on the
250  * compiler to propagate constants and inline everything into the callers.
251  *
252  * We only generate functions for operations with two inputs, since only
253  * these are common enough to justify consolidating them into common code.
254  */
255 
/*
 * Overlay of a softfloat float32 (s) and the host's float (h), so the same
 * value can be handed to either the softfloat or the host implementation.
 */
typedef union {
    float32 s;
    float h;
} union_float32;

/* Same overlay for float64 (s) and the host's double (h). */
typedef union {
    float64 s;
    double h;
} union_float64;

/* Predicates over two operands; used as the pre/post/fast_test hooks. */
typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

/* Two-operand operations: softfloat flavor (takes the float_status) ... */
typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
/* ... and host flavor, operating directly on native float/double. */
typedef float   (*hard_f32_op2_fn)(float a, float b);
typedef double  (*hard_f64_op2_fn)(double a, double b);
273 
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
278         /*
279          * Not using a temp variable for consecutive fpclassify calls ends up
280          * generating faster code.
281          */
282         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284     }
285     return float32_is_zero_or_normal(a.s) &&
286            float32_is_zero_or_normal(b.s);
287 }
288 
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291     if (QEMU_HARDFLOAT_2F64_USE_FP) {
292         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294     }
295     return float64_is_zero_or_normal(a.s) &&
296            float64_is_zero_or_normal(b.s);
297 }
298 
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303     if (QEMU_HARDFLOAT_3F32_USE_FP) {
304         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307     }
308     return float32_is_zero_or_normal(a.s) &&
309            float32_is_zero_or_normal(b.s) &&
310            float32_is_zero_or_normal(c.s);
311 }
312 
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316     if (QEMU_HARDFLOAT_3F64_USE_FP) {
317         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320     }
321     return float64_is_zero_or_normal(a.s) &&
322            float64_is_zero_or_normal(b.s) &&
323            float64_is_zero_or_normal(c.s);
324 }
325 
326 static inline bool f32_is_inf(union_float32 a)
327 {
328     if (QEMU_HARDFLOAT_USE_ISINF) {
329         return isinf(a.h);
330     }
331     return float32_is_infinity(a.s);
332 }
333 
334 static inline bool f64_is_inf(union_float64 a)
335 {
336     if (QEMU_HARDFLOAT_USE_ISINF) {
337         return isinf(a.h);
338     }
339     return float64_is_infinity(a.s);
340 }
341 
342 /* Note: @fast_test and @post can be NULL */
343 static inline float32
344 float32_gen2(float32 xa, float32 xb, float_status *s,
345              hard_f32_op2_fn hard, soft_f32_op2_fn soft,
346              f32_check_fn pre, f32_check_fn post,
347              f32_check_fn fast_test, soft_f32_op2_fn fast_op)
348 {
349     union_float32 ua, ub, ur;
350 
351     ua.s = xa;
352     ub.s = xb;
353 
354     if (unlikely(!can_use_fpu(s))) {
355         goto soft;
356     }
357 
358     float32_input_flush2(&ua.s, &ub.s, s);
359     if (unlikely(!pre(ua, ub))) {
360         goto soft;
361     }
362     if (fast_test && fast_test(ua, ub)) {
363         return fast_op(ua.s, ub.s, s);
364     }
365 
366     ur.h = hard(ua.h, ub.h);
367     if (unlikely(f32_is_inf(ur))) {
368         s->float_exception_flags |= float_flag_overflow;
369     } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
370         if (post == NULL || post(ua, ub)) {
371             goto soft;
372         }
373     }
374     return ur.s;
375 
376  soft:
377     return soft(ua.s, ub.s, s);
378 }
379 
380 static inline float64
381 float64_gen2(float64 xa, float64 xb, float_status *s,
382              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
383              f64_check_fn pre, f64_check_fn post,
384              f64_check_fn fast_test, soft_f64_op2_fn fast_op)
385 {
386     union_float64 ua, ub, ur;
387 
388     ua.s = xa;
389     ub.s = xb;
390 
391     if (unlikely(!can_use_fpu(s))) {
392         goto soft;
393     }
394 
395     float64_input_flush2(&ua.s, &ub.s, s);
396     if (unlikely(!pre(ua, ub))) {
397         goto soft;
398     }
399     if (fast_test && fast_test(ua, ub)) {
400         return fast_op(ua.s, ub.s, s);
401     }
402 
403     ur.h = hard(ua.h, ub.h);
404     if (unlikely(f64_is_inf(ur))) {
405         s->float_exception_flags |= float_flag_overflow;
406     } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
407         if (post == NULL || post(ua, ub)) {
408             goto soft;
409         }
410     }
411     return ur.s;
412 
413  soft:
414     return soft(ua.s, ub.s, s);
415 }
416 
417 /*----------------------------------------------------------------------------
418 | Returns the fraction bits of the half-precision floating-point value `a'.
419 *----------------------------------------------------------------------------*/
420 
421 static inline uint32_t extractFloat16Frac(float16 a)
422 {
423     return float16_val(a) & 0x3ff;
424 }
425 
426 /*----------------------------------------------------------------------------
427 | Returns the exponent bits of the half-precision floating-point value `a'.
428 *----------------------------------------------------------------------------*/
429 
430 static inline int extractFloat16Exp(float16 a)
431 {
432     return (float16_val(a) >> 10) & 0x1f;
433 }
434 
435 /*----------------------------------------------------------------------------
436 | Returns the fraction bits of the single-precision floating-point value `a'.
437 *----------------------------------------------------------------------------*/
438 
439 static inline uint32_t extractFloat32Frac(float32 a)
440 {
441     return float32_val(a) & 0x007FFFFF;
442 }
443 
444 /*----------------------------------------------------------------------------
445 | Returns the exponent bits of the single-precision floating-point value `a'.
446 *----------------------------------------------------------------------------*/
447 
448 static inline int extractFloat32Exp(float32 a)
449 {
450     return (float32_val(a) >> 23) & 0xFF;
451 }
452 
453 /*----------------------------------------------------------------------------
454 | Returns the sign bit of the single-precision floating-point value `a'.
455 *----------------------------------------------------------------------------*/
456 
457 static inline flag extractFloat32Sign(float32 a)
458 {
459     return float32_val(a) >> 31;
460 }
461 
462 /*----------------------------------------------------------------------------
463 | Returns the fraction bits of the double-precision floating-point value `a'.
464 *----------------------------------------------------------------------------*/
465 
466 static inline uint64_t extractFloat64Frac(float64 a)
467 {
468     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
469 }
470 
471 /*----------------------------------------------------------------------------
472 | Returns the exponent bits of the double-precision floating-point value `a'.
473 *----------------------------------------------------------------------------*/
474 
475 static inline int extractFloat64Exp(float64 a)
476 {
477     return (float64_val(a) >> 52) & 0x7FF;
478 }
479 
480 /*----------------------------------------------------------------------------
481 | Returns the sign bit of the double-precision floating-point value `a'.
482 *----------------------------------------------------------------------------*/
483 
484 static inline flag extractFloat64Sign(float64 a)
485 {
486     return float64_val(a) >> 63;
487 }
488 
489 /*
490  * Classify a floating point number. Everything above float_class_qnan
491  * is a NaN so cls >= float_class_qnan is any NaN.
492  */
493 
/*
 * The enumerator order matters: is_nan() relies on both NaN classes sorting
 * at or after float_class_qnan.
 */
typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,  /* as produced by unpack_raw() */
    float_class_zero,
    float_class_normal,        /* sf_canonicalize() also maps denormals here */
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;
502 
503 /* Simple helpers for checking if, or what kind of, NaN we have */
504 static inline __attribute__((unused)) bool is_nan(FloatClass c)
505 {
506     return unlikely(c >= float_class_qnan);
507 }
508 
509 static inline __attribute__((unused)) bool is_snan(FloatClass c)
510 {
511     return c == float_class_snan;
512 }
513 
514 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
515 {
516     return c == float_class_qnan;
517 }
518 
519 /*
520  * Structure holding all of the decomposed parts of a float. The
521  * exponent is unbiased and the fraction is normalized. All
522  * calculations are done with a 64 bit fraction and then rounded as
523  * appropriate for the final format.
524  *
525  * Thanks to the packed FloatClass a decent compiler should be able to
526  * fit the whole structure into registers and avoid using the stack
527  * for parameter passing.
528  */
529 
typedef struct {
    uint64_t frac;   /* fraction bits; raw field after unpack_raw() */
    int32_t  exp;    /* exponent; raw field after unpack_raw() */
    FloatClass cls;  /* classification, float_class_unclassified until set */
    bool sign;       /* value of the sign bit */
} FloatParts;
536 
/* Bit position (62) that the fraction is normalized against. */
#define DECOMPOSED_BINARY_POINT    (64 - 2)
/* Bit 62: position of the leading 1 of a canonical normal fraction. */
#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
/* Bit 63: set when rounding carries out of the implicit bit. */
#define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
540 
/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the largest value of the exponent field (all ones); it encodes
 *            Inf/NaN unless arm_althp is set
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;      /* all bits below frac_lsb */
    uint64_t roundeven_mask;  /* round_mask plus the frac_lsb bit itself */
    bool arm_althp;
} FloatFmt;
566 
/*
 * Expand the FloatFmt fields that follow from the size of the exponent
 * field (E) and of the fraction field (F). Meant to be used inside a
 * designated initializer for a FloatFmt.
 *
 * Both arguments are parenthesized at every use so that expression
 * arguments (e.g. FLOAT_PARAMS(5, 5 + 5)) expand with the intended
 * precedence.
 */
#define FLOAT_PARAMS(E, F)                                             \
    .exp_size       = (E),                                             \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                           \
    .exp_max        = (1 << (E)) - 1,                                  \
    .frac_size      = (F),                                             \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                   \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),         \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1),   \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1,   \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1
578 
579 static const FloatFmt float16_params = {
580     FLOAT_PARAMS(5, 10)
581 };
582 
583 static const FloatFmt float16_params_ahp = {
584     FLOAT_PARAMS(5, 10),
585     .arm_althp = true
586 };
587 
588 static const FloatFmt float32_params = {
589     FLOAT_PARAMS(8, 23)
590 };
591 
592 static const FloatFmt float64_params = {
593     FLOAT_PARAMS(11, 52)
594 };
595 
596 /* Unpack a float to parts, but do not canonicalize.  */
597 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
598 {
599     const int sign_pos = fmt.frac_size + fmt.exp_size;
600 
601     return (FloatParts) {
602         .cls = float_class_unclassified,
603         .sign = extract64(raw, sign_pos, 1),
604         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
605         .frac = extract64(raw, 0, fmt.frac_size),
606     };
607 }
608 
609 static inline FloatParts float16_unpack_raw(float16 f)
610 {
611     return unpack_raw(float16_params, f);
612 }
613 
614 static inline FloatParts float32_unpack_raw(float32 f)
615 {
616     return unpack_raw(float32_params, f);
617 }
618 
619 static inline FloatParts float64_unpack_raw(float64 f)
620 {
621     return unpack_raw(float64_params, f);
622 }
623 
624 /* Pack a float from parts, but do not canonicalize.  */
625 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
626 {
627     const int sign_pos = fmt.frac_size + fmt.exp_size;
628     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
629     return deposit64(ret, sign_pos, 1, p.sign);
630 }
631 
632 static inline float16 float16_pack_raw(FloatParts p)
633 {
634     return make_float16(pack_raw(float16_params, p));
635 }
636 
637 static inline float32 float32_pack_raw(FloatParts p)
638 {
639     return make_float32(pack_raw(float32_params, p));
640 }
641 
642 static inline float64 float64_pack_raw(FloatParts p)
643 {
644     return make_float64(pack_raw(float64_params, p));
645 }
646 
647 /*----------------------------------------------------------------------------
648 | Functions and definitions to determine:  (1) whether tininess for underflow
649 | is detected before or after rounding by default, (2) what (if anything)
650 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
651 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
652 | are propagated from function inputs to output.  These details are target-
653 | specific.
654 *----------------------------------------------------------------------------*/
655 #include "softfloat-specialize.h"
656 
657 /* Canonicalize EXP and FRAC, setting CLS.  */
658 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
659                                   float_status *status)
660 {
661     if (part.exp == parm->exp_max && !parm->arm_althp) {
662         if (part.frac == 0) {
663             part.cls = float_class_inf;
664         } else {
665             part.frac <<= parm->frac_shift;
666             part.cls = (parts_is_snan_frac(part.frac, status)
667                         ? float_class_snan : float_class_qnan);
668         }
669     } else if (part.exp == 0) {
670         if (likely(part.frac == 0)) {
671             part.cls = float_class_zero;
672         } else if (status->flush_inputs_to_zero) {
673             float_raise(float_flag_input_denormal, status);
674             part.cls = float_class_zero;
675             part.frac = 0;
676         } else {
677             int shift = clz64(part.frac) - 1;
678             part.cls = float_class_normal;
679             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
680             part.frac <<= shift;
681         }
682     } else {
683         part.cls = float_class_normal;
684         part.exp -= parm->exp_bias;
685         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
686     }
687     return part;
688 }
689 
/* Round and uncanonicalize a floating-point number by parts. There
 * are FRAC_SHIFT bits that may require rounding at the bottom of the
 * fraction; these bits will be removed. The exponent will be biased
 * by EXP_BIAS; a biased exponent that falls outside the range of normal
 * numbers is handled below as overflow or as a denormal/underflow.
 *
 * @p:    canonical parts (cls set, exp unbiased, frac normalized)
 * @s:    supplies the rounding mode, flush_to_zero and tininess detection
 *        settings; accumulated exception flags are raised on it
 * @parm: format to round to
 *
 * Returns the parts with exp and frac converted to the raw field values
 * of the format. p.cls may be changed to zero or inf by rounding.
 */

static FloatParts round_canonical(FloatParts p, float_status *s,
                                  const FloatFmt *parm)
{
    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
    const uint64_t round_mask = parm->round_mask;
    const uint64_t roundeven_mask = parm->roundeven_mask;
    const int exp_max = parm->exp_max;
    const int frac_shift = parm->frac_shift;
    uint64_t frac, inc;
    int exp, flags = 0;
    bool overflow_norm;

    frac = p.frac;
    exp = p.exp;

    switch (p.cls) {
    case float_class_normal:
        /*
         * inc is the amount added to frac before the low bits are dropped;
         * overflow_norm selects the largest normal number rather than
         * infinity as the result of an overflow.
         */
        switch (s->float_rounding_mode) {
        case float_round_nearest_even:
            overflow_norm = false;
            /*
             * Add half an LSB, except on an exact tie whose LSB is already
             * even (LSB clear, half bit set, nothing below).
             */
            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
            break;
        case float_round_ties_away:
            overflow_norm = false;
            inc = frac_lsbm1;
            break;
        case float_round_to_zero:
            overflow_norm = true;
            inc = 0;
            break;
        case float_round_up:
            /* Positive values round away from zero, negative ones truncate. */
            inc = p.sign ? 0 : round_mask;
            overflow_norm = p.sign;
            break;
        case float_round_down:
            /* Negative values round away from zero, positive ones truncate. */
            inc = p.sign ? round_mask : 0;
            overflow_norm = !p.sign;
            break;
        default:
            g_assert_not_reached();
        }

        exp += parm->exp_bias;
        if (likely(exp > 0)) {
            /* Biased exponent is positive: result is normal or overflows. */
            if (frac & round_mask) {
                flags |= float_flag_inexact;
                frac += inc;
                if (frac & DECOMPOSED_OVERFLOW_BIT) {
                    /* Rounding carried out of the implicit bit. */
                    frac >>= 1;
                    exp++;
                }
            }
            frac >>= frac_shift;

            if (parm->arm_althp) {
                /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
                if (unlikely(exp > exp_max)) {
                    /* Overflow.  Return the maximum normal.  */
                    /* Plain assignment: any inexact flag set above is dropped. */
                    flags = float_flag_invalid;
                    exp = exp_max;
                    frac = -1;
                }
            } else if (unlikely(exp >= exp_max)) {
                flags |= float_flag_overflow | float_flag_inexact;
                if (overflow_norm) {
                    /* Largest normal: all fraction bits set. */
                    exp = exp_max - 1;
                    frac = -1;
                } else {
                    p.cls = float_class_inf;
                    goto do_inf;
                }
            }
        } else if (s->flush_to_zero) {
            flags |= float_flag_output_denormal;
            p.cls = float_class_zero;
            goto do_zero;
        } else {
            /*
             * Denormal result. The value is tiny if tininess is detected
             * before rounding, if the biased exponent is negative, or if
             * adding inc does not carry into the overflow bit.
             */
            bool is_tiny = (s->float_detect_tininess
                            == float_tininess_before_rounding)
                        || (exp < 0)
                        || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);

            /*
             * NOTE(review): presumably ORs the shifted-out bits into the
             * LSB (sticky) -- confirm in softfloat-macros.h.
             */
            shift64RightJamming(frac, 1 - exp, &frac);
            if (frac & round_mask) {
                /* Need to recompute round-to-even.  */
                if (s->float_rounding_mode == float_round_nearest_even) {
                    inc = ((frac & roundeven_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                }
                flags |= float_flag_inexact;
                frac += inc;
            }

            /* Exponent field is 1 if the implicit bit is now set, else 0. */
            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
            frac >>= frac_shift;

            if (is_tiny && (flags & float_flag_inexact)) {
                flags |= float_flag_underflow;
            }
            if (exp == 0 && frac == 0) {
                p.cls = float_class_zero;
            }
        }
        break;

    case float_class_zero:
    do_zero:
        exp = 0;
        frac = 0;
        break;

    case float_class_inf:
    do_inf:
        assert(!parm->arm_althp);
        exp = exp_max;
        frac = 0;
        break;

    case float_class_qnan:
    case float_class_snan:
        assert(!parm->arm_althp);
        exp = exp_max;
        /* Move the NaN payload back down into the fraction field. */
        frac >>= parm->frac_shift;
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    p.exp = exp;
    p.frac = frac;
    return p;
}
830 
831 /* Explicit FloatFmt version */
832 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
833                                             const FloatFmt *params)
834 {
835     return sf_canonicalize(float16_unpack_raw(f), params, s);
836 }
837 
838 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
839 {
840     return float16a_unpack_canonical(f, s, &float16_params);
841 }
842 
843 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
844                                              const FloatFmt *params)
845 {
846     return float16_pack_raw(round_canonical(p, s, params));
847 }
848 
849 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
850 {
851     return float16a_round_pack_canonical(p, s, &float16_params);
852 }
853 
854 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
855 {
856     return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
857 }
858 
859 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
860 {
861     return float32_pack_raw(round_canonical(p, s, &float32_params));
862 }
863 
864 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
865 {
866     return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
867 }
868 
869 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
870 {
871     return float64_pack_raw(round_canonical(p, s, &float64_params));
872 }
873 
874 static FloatParts return_nan(FloatParts a, float_status *s)
875 {
876     switch (a.cls) {
877     case float_class_snan:
878         s->float_exception_flags |= float_flag_invalid;
879         a = parts_silence_nan(a, s);
880         /* fall through */
881     case float_class_qnan:
882         if (s->default_nan_mode) {
883             return parts_default_nan(s);
884         }
885         break;
886 
887     default:
888         g_assert_not_reached();
889     }
890     return a;
891 }
892 
893 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
894 {
895     if (is_snan(a.cls) || is_snan(b.cls)) {
896         s->float_exception_flags |= float_flag_invalid;
897     }
898 
899     if (s->default_nan_mode) {
900         return parts_default_nan(s);
901     } else {
902         if (pickNaN(a.cls, b.cls,
903                     a.frac > b.frac ||
904                     (a.frac == b.frac && a.sign < b.sign))) {
905             a = b;
906         }
907         if (is_snan(a.cls)) {
908             return parts_silence_nan(a, s);
909         }
910     }
911     return a;
912 }
913 
914 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
915                                   bool inf_zero, float_status *s)
916 {
917     int which;
918 
919     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
920         s->float_exception_flags |= float_flag_invalid;
921     }
922 
923     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
924 
925     if (s->default_nan_mode) {
926         /* Note that this check is after pickNaNMulAdd so that function
927          * has an opportunity to set the Invalid flag.
928          */
929         which = 3;
930     }
931 
932     switch (which) {
933     case 0:
934         break;
935     case 1:
936         a = b;
937         break;
938     case 2:
939         a = c;
940         break;
941     case 3:
942         return parts_default_nan(s);
943     default:
944         g_assert_not_reached();
945     }
946 
947     if (is_snan(a.cls)) {
948         return parts_silence_nan(a, s);
949     }
950     return a;
951 }
952 
953 /*
954  * Returns the result of adding or subtracting the values of the
955  * floating-point values `a' and `b'. The operation is performed
956  * according to the IEC/IEEE Standard for Binary Floating-Point
957  * Arithmetic.
958  */
959 
960 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
961                                 float_status *s)
962 {
963     bool a_sign = a.sign;
964     bool b_sign = b.sign ^ subtract;
965 
966     if (a_sign != b_sign) {
967         /* Subtraction */
968 
969         if (a.cls == float_class_normal && b.cls == float_class_normal) {
970             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
971                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
972                 a.frac = a.frac - b.frac;
973             } else {
974                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
975                 a.frac = b.frac - a.frac;
976                 a.exp = b.exp;
977                 a_sign ^= 1;
978             }
979 
980             if (a.frac == 0) {
981                 a.cls = float_class_zero;
982                 a.sign = s->float_rounding_mode == float_round_down;
983             } else {
984                 int shift = clz64(a.frac) - 1;
985                 a.frac = a.frac << shift;
986                 a.exp = a.exp - shift;
987                 a.sign = a_sign;
988             }
989             return a;
990         }
991         if (is_nan(a.cls) || is_nan(b.cls)) {
992             return pick_nan(a, b, s);
993         }
994         if (a.cls == float_class_inf) {
995             if (b.cls == float_class_inf) {
996                 float_raise(float_flag_invalid, s);
997                 return parts_default_nan(s);
998             }
999             return a;
1000         }
1001         if (a.cls == float_class_zero && b.cls == float_class_zero) {
1002             a.sign = s->float_rounding_mode == float_round_down;
1003             return a;
1004         }
1005         if (a.cls == float_class_zero || b.cls == float_class_inf) {
1006             b.sign = a_sign ^ 1;
1007             return b;
1008         }
1009         if (b.cls == float_class_zero) {
1010             return a;
1011         }
1012     } else {
1013         /* Addition */
1014         if (a.cls == float_class_normal && b.cls == float_class_normal) {
1015             if (a.exp > b.exp) {
1016                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1017             } else if (a.exp < b.exp) {
1018                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1019                 a.exp = b.exp;
1020             }
1021             a.frac += b.frac;
1022             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1023                 shift64RightJamming(a.frac, 1, &a.frac);
1024                 a.exp += 1;
1025             }
1026             return a;
1027         }
1028         if (is_nan(a.cls) || is_nan(b.cls)) {
1029             return pick_nan(a, b, s);
1030         }
1031         if (a.cls == float_class_inf || b.cls == float_class_zero) {
1032             return a;
1033         }
1034         if (b.cls == float_class_inf || a.cls == float_class_zero) {
1035             b.sign = b_sign;
1036             return b;
1037         }
1038     }
1039     g_assert_not_reached();
1040 }
1041 
1042 /*
1043  * Returns the result of adding or subtracting the floating-point
1044  * values `a' and `b'. The operation is performed according to the
1045  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1046  */
1047 
1048 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1049 {
1050     FloatParts pa = float16_unpack_canonical(a, status);
1051     FloatParts pb = float16_unpack_canonical(b, status);
1052     FloatParts pr = addsub_floats(pa, pb, false, status);
1053 
1054     return float16_round_pack_canonical(pr, status);
1055 }
1056 
1057 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1058 {
1059     FloatParts pa = float16_unpack_canonical(a, status);
1060     FloatParts pb = float16_unpack_canonical(b, status);
1061     FloatParts pr = addsub_floats(pa, pb, true, status);
1062 
1063     return float16_round_pack_canonical(pr, status);
1064 }
1065 
1066 static float32 QEMU_SOFTFLOAT_ATTR
1067 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1068 {
1069     FloatParts pa = float32_unpack_canonical(a, status);
1070     FloatParts pb = float32_unpack_canonical(b, status);
1071     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1072 
1073     return float32_round_pack_canonical(pr, status);
1074 }
1075 
1076 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1077 {
1078     return soft_f32_addsub(a, b, false, status);
1079 }
1080 
1081 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1082 {
1083     return soft_f32_addsub(a, b, true, status);
1084 }
1085 
1086 static float64 QEMU_SOFTFLOAT_ATTR
1087 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1088 {
1089     FloatParts pa = float64_unpack_canonical(a, status);
1090     FloatParts pb = float64_unpack_canonical(b, status);
1091     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1092 
1093     return float64_round_pack_canonical(pr, status);
1094 }
1095 
1096 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1097 {
1098     return soft_f64_addsub(a, b, false, status);
1099 }
1100 
1101 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1102 {
1103     return soft_f64_addsub(a, b, true, status);
1104 }
1105 
/* Host single-precision addition. */
static float hard_f32_add(float a, float b)
{
    float sum = a + b;

    return sum;
}
1110 
/* Host single-precision subtraction. */
static float hard_f32_sub(float a, float b)
{
    float diff = a - b;

    return diff;
}
1115 
/* Host double-precision addition. */
static double hard_f64_add(double a, double b)
{
    double sum = a + b;

    return sum;
}
1120 
/* Host double-precision subtraction. */
static double hard_f64_sub(double a, double b)
{
    double diff = a - b;

    return diff;
}
1125 
1126 static bool f32_addsub_post(union_float32 a, union_float32 b)
1127 {
1128     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1129         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1130     }
1131     return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1132 }
1133 
1134 static bool f64_addsub_post(union_float64 a, union_float64 b)
1135 {
1136     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1137         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1138     } else {
1139         return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1140     }
1141 }
1142 
1143 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1144                               hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1145 {
1146     return float32_gen2(a, b, s, hard, soft,
1147                         f32_is_zon2, f32_addsub_post, NULL, NULL);
1148 }
1149 
1150 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1151                               hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1152 {
1153     return float64_gen2(a, b, s, hard, soft,
1154                         f64_is_zon2, f64_addsub_post, NULL, NULL);
1155 }
1156 
1157 float32 QEMU_FLATTEN
1158 float32_add(float32 a, float32 b, float_status *s)
1159 {
1160     return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1161 }
1162 
1163 float32 QEMU_FLATTEN
1164 float32_sub(float32 a, float32 b, float_status *s)
1165 {
1166     return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1167 }
1168 
1169 float64 QEMU_FLATTEN
1170 float64_add(float64 a, float64 b, float_status *s)
1171 {
1172     return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1173 }
1174 
1175 float64 QEMU_FLATTEN
1176 float64_sub(float64 a, float64 b, float_status *s)
1177 {
1178     return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1179 }
1180 
1181 /*
1182  * Returns the result of multiplying the floating-point values `a' and
1183  * `b'. The operation is performed according to the IEC/IEEE Standard
1184  * for Binary Floating-Point Arithmetic.
1185  */
1186 
1187 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
1188 {
1189     bool sign = a.sign ^ b.sign;
1190 
1191     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1192         uint64_t hi, lo;
1193         int exp = a.exp + b.exp;
1194 
1195         mul64To128(a.frac, b.frac, &hi, &lo);
1196         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1197         if (lo & DECOMPOSED_OVERFLOW_BIT) {
1198             shift64RightJamming(lo, 1, &lo);
1199             exp += 1;
1200         }
1201 
1202         /* Re-use a */
1203         a.exp = exp;
1204         a.sign = sign;
1205         a.frac = lo;
1206         return a;
1207     }
1208     /* handle all the NaN cases */
1209     if (is_nan(a.cls) || is_nan(b.cls)) {
1210         return pick_nan(a, b, s);
1211     }
1212     /* Inf * Zero == NaN */
1213     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1214         (a.cls == float_class_zero && b.cls == float_class_inf)) {
1215         s->float_exception_flags |= float_flag_invalid;
1216         return parts_default_nan(s);
1217     }
1218     /* Multiply by 0 or Inf */
1219     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1220         a.sign = sign;
1221         return a;
1222     }
1223     if (b.cls == float_class_inf || b.cls == float_class_zero) {
1224         b.sign = sign;
1225         return b;
1226     }
1227     g_assert_not_reached();
1228 }
1229 
1230 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1231 {
1232     FloatParts pa = float16_unpack_canonical(a, status);
1233     FloatParts pb = float16_unpack_canonical(b, status);
1234     FloatParts pr = mul_floats(pa, pb, status);
1235 
1236     return float16_round_pack_canonical(pr, status);
1237 }
1238 
1239 static float32 QEMU_SOFTFLOAT_ATTR
1240 soft_f32_mul(float32 a, float32 b, float_status *status)
1241 {
1242     FloatParts pa = float32_unpack_canonical(a, status);
1243     FloatParts pb = float32_unpack_canonical(b, status);
1244     FloatParts pr = mul_floats(pa, pb, status);
1245 
1246     return float32_round_pack_canonical(pr, status);
1247 }
1248 
1249 static float64 QEMU_SOFTFLOAT_ATTR
1250 soft_f64_mul(float64 a, float64 b, float_status *status)
1251 {
1252     FloatParts pa = float64_unpack_canonical(a, status);
1253     FloatParts pb = float64_unpack_canonical(b, status);
1254     FloatParts pr = mul_floats(pa, pb, status);
1255 
1256     return float64_round_pack_canonical(pr, status);
1257 }
1258 
/* Host single-precision multiplication. */
static float hard_f32_mul(float a, float b)
{
    float prod = a * b;

    return prod;
}
1263 
/* Host double-precision multiplication. */
static double hard_f64_mul(double a, double b)
{
    double prod = a * b;

    return prod;
}
1268 
1269 static bool f32_mul_fast_test(union_float32 a, union_float32 b)
1270 {
1271     return float32_is_zero(a.s) || float32_is_zero(b.s);
1272 }
1273 
1274 static bool f64_mul_fast_test(union_float64 a, union_float64 b)
1275 {
1276     return float64_is_zero(a.s) || float64_is_zero(b.s);
1277 }
1278 
1279 static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s)
1280 {
1281     bool signbit = float32_is_neg(a) ^ float32_is_neg(b);
1282 
1283     return float32_set_sign(float32_zero, signbit);
1284 }
1285 
1286 static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s)
1287 {
1288     bool signbit = float64_is_neg(a) ^ float64_is_neg(b);
1289 
1290     return float64_set_sign(float64_zero, signbit);
1291 }
1292 
1293 float32 QEMU_FLATTEN
1294 float32_mul(float32 a, float32 b, float_status *s)
1295 {
1296     return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1297                         f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op);
1298 }
1299 
1300 float64 QEMU_FLATTEN
1301 float64_mul(float64 a, float64 b, float_status *s)
1302 {
1303     return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1304                         f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op);
1305 }
1306 
1307 /*
1308  * Returns the result of multiplying the floating-point values `a' and
1309  * `b' then adding 'c', with no intermediate rounding step after the
1310  * multiplication. The operation is performed according to the
1311  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1312  * The flags argument allows the caller to select negation of the
1313  * addend, the intermediate product, or the final result. (The
1314  * difference between this and having the caller do a separate
1315  * negation is that negating externally will flip the sign bit on
1316  * NaNs.)
1317  */
1318 
static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
                                int flags, float_status *s)
{
    /* True when one of a/b is Inf and the other is Zero. */
    bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
                    ((1 << float_class_inf) | (1 << float_class_zero));
    bool p_sign;
    /* Applied to the sign of every non-NaN result returned below. */
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
        return pick_nan_muladd(a, b, c, inf_zero, s);
    }

    /* No NaN operand, but the product itself is Inf * 0: invalid. */
    if (inf_zero) {
        s->float_exception_flags |= float_flag_invalid;
        return parts_default_nan(s);
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Classify the product a * b without computing it. */
    if (a.cls == float_class_inf || b.cls == float_class_inf) {
        p_class = float_class_inf;
    } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    /*
     * Infinite addend: Inf + (-Inf) is invalid; otherwise the result is
     * an infinity carrying c's sign.
     */
    if (c.cls == float_class_inf) {
        if (p_class == float_class_inf && p_sign != c.sign) {
            s->float_exception_flags |= float_flag_invalid;
            return parts_default_nan(s);
        } else {
            a.cls = float_class_inf;
            a.sign = c.sign ^ sign_flip;
            return a;
        }
    }

    /* Infinite product with a finite addend: infinity with product sign. */
    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    /*
     * Zero product: the result is c.  When c is also zero and the signs
     * differ, the sign follows the rounding mode (negative only for
     * round-down).  A non-zero c is halved on request by decrementing
     * its exponent.
     */
    if (p_class == float_class_zero) {
        if (c.cls == float_class_zero) {
            if (p_sign != c.sign) {
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
     * result.
     */
    mul64To128(a.frac, b.frac, &hi, &lo);
    /* binary point now at bit 124 */

    /* check for overflow */
    if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
        shift128RightJamming(hi, lo, 1, &hi, &lo);
        p_exp += 1;
    }

    /* + add/sub */
    if (c.cls == float_class_zero) {
        /* move binary point back to 62 */
        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
    } else {
        /* Positive when the product has the larger exponent. */
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                /*
                 * c has the larger (or equal) exponent: bring the product
                 * down to c's scale and to a 62-bit binary point in one
                 * shift, then add in 64 bits.
                 */
                shift128RightJamming(hi, lo,
                                     DECOMPOSED_BINARY_POINT - exp_diff,
                                     &hi, &lo);
                lo += c.frac;
                p_exp = c.exp;
            } else {
                uint64_t c_hi, c_lo;
                /* shift c to the same binary point as the product (124) */
                c_hi = c.frac >> 2;
                c_lo = 0;
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                add128(hi, lo, c_hi, c_lo, &hi, &lo);
                /* move binary point back to 62 */
                shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
            }

            /* Carry out of the sum: renormalize by one bit. */
            if (lo & DECOMPOSED_OVERFLOW_BIT) {
                shift64RightJamming(lo, 1, &lo);
                p_exp += 1;
            }

        } else {
            /* Subtraction */
            uint64_t c_hi, c_lo;
            /* make C binary point match product at bit 124 */
            c_hi = c.frac >> 2;
            c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    /* Equal exponents and product >= c: product - c. */
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    /* Otherwise compute c - product; sign and exp from c. */
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            if (hi == 0 && lo == 0) {
                /* Exact cancellation: zero, negative only for round-down. */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                int shift;
                /* Count leading zeros across the 128-bit difference. */
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Normalizing to a binary point of 124 is the
                   correct adjust for the exponent.  However since we're
                   shifting, we might as well put the binary point back
                   at 62 where we really want it.  Therefore shift as
                   if we're leaving 1 bit at the top of the word, but
                   adjust the exponent as if we're leaving 3 bits.  */
                shift -= 1;
                if (shift >= 64) {
                    lo = lo << (shift - 64);
                } else {
                    hi = (hi << shift) | (lo >> (64 - shift));
                    /* Fold any bits shifted out of lo into a sticky lsb. */
                    lo = hi | ((lo << shift) != 0);
                }
                p_exp -= shift - 2;
            }
        }
    }

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = lo;

    return a;
}
1509 
1510 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1511                                                 int flags, float_status *status)
1512 {
1513     FloatParts pa = float16_unpack_canonical(a, status);
1514     FloatParts pb = float16_unpack_canonical(b, status);
1515     FloatParts pc = float16_unpack_canonical(c, status);
1516     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1517 
1518     return float16_round_pack_canonical(pr, status);
1519 }
1520 
1521 float32 QEMU_FLATTEN float32_muladd(float32 a, float32 b, float32 c,
1522                                                 int flags, float_status *status)
1523 {
1524     FloatParts pa = float32_unpack_canonical(a, status);
1525     FloatParts pb = float32_unpack_canonical(b, status);
1526     FloatParts pc = float32_unpack_canonical(c, status);
1527     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1528 
1529     return float32_round_pack_canonical(pr, status);
1530 }
1531 
1532 float64 QEMU_FLATTEN float64_muladd(float64 a, float64 b, float64 c,
1533                                                 int flags, float_status *status)
1534 {
1535     FloatParts pa = float64_unpack_canonical(a, status);
1536     FloatParts pb = float64_unpack_canonical(b, status);
1537     FloatParts pc = float64_unpack_canonical(c, status);
1538     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1539 
1540     return float64_round_pack_canonical(pr, status);
1541 }
1542 
1543 /*
1544  * Returns the result of dividing the floating-point value `a' by the
1545  * corresponding value `b'. The operation is performed according to
1546  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1547  */
1548 
1549 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1550 {
1551     bool sign = a.sign ^ b.sign;
1552 
1553     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1554         uint64_t n0, n1, q, r;
1555         int exp = a.exp - b.exp;
1556 
1557         /*
1558          * We want a 2*N / N-bit division to produce exactly an N-bit
1559          * result, so that we do not lose any precision and so that we
1560          * do not have to renormalize afterward.  If A.frac < B.frac,
1561          * then division would produce an (N-1)-bit result; shift A left
1562          * by one to produce the an N-bit result, and decrement the
1563          * exponent to match.
1564          *
1565          * The udiv_qrnnd algorithm that we're using requires normalization,
1566          * i.e. the msb of the denominator must be set.  Since we know that
1567          * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1568          * by one (more), and the remainder must be shifted right by one.
1569          */
1570         if (a.frac < b.frac) {
1571             exp -= 1;
1572             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
1573         } else {
1574             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1575         }
1576         q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1577 
1578         /*
1579          * Set lsb if there is a remainder, to set inexact.
1580          * As mentioned above, to find the actual value of the remainder we
1581          * would need to shift right, but (1) we are only concerned about
1582          * non-zero-ness, and (2) the remainder will always be even because
1583          * both inputs to the division primitive are even.
1584          */
1585         a.frac = q | (r != 0);
1586         a.sign = sign;
1587         a.exp = exp;
1588         return a;
1589     }
1590     /* handle all the NaN cases */
1591     if (is_nan(a.cls) || is_nan(b.cls)) {
1592         return pick_nan(a, b, s);
1593     }
1594     /* 0/0 or Inf/Inf */
1595     if (a.cls == b.cls
1596         &&
1597         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1598         s->float_exception_flags |= float_flag_invalid;
1599         return parts_default_nan(s);
1600     }
1601     /* Inf / x or 0 / x */
1602     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1603         a.sign = sign;
1604         return a;
1605     }
1606     /* Div 0 => Inf */
1607     if (b.cls == float_class_zero) {
1608         s->float_exception_flags |= float_flag_divbyzero;
1609         a.cls = float_class_inf;
1610         a.sign = sign;
1611         return a;
1612     }
1613     /* Div by Inf */
1614     if (b.cls == float_class_inf) {
1615         a.cls = float_class_zero;
1616         a.sign = sign;
1617         return a;
1618     }
1619     g_assert_not_reached();
1620 }
1621 
1622 float16 float16_div(float16 a, float16 b, float_status *status)
1623 {
1624     FloatParts pa = float16_unpack_canonical(a, status);
1625     FloatParts pb = float16_unpack_canonical(b, status);
1626     FloatParts pr = div_floats(pa, pb, status);
1627 
1628     return float16_round_pack_canonical(pr, status);
1629 }
1630 
1631 float32 float32_div(float32 a, float32 b, float_status *status)
1632 {
1633     FloatParts pa = float32_unpack_canonical(a, status);
1634     FloatParts pb = float32_unpack_canonical(b, status);
1635     FloatParts pr = div_floats(pa, pb, status);
1636 
1637     return float32_round_pack_canonical(pr, status);
1638 }
1639 
1640 float64 float64_div(float64 a, float64 b, float_status *status)
1641 {
1642     FloatParts pa = float64_unpack_canonical(a, status);
1643     FloatParts pb = float64_unpack_canonical(b, status);
1644     FloatParts pr = div_floats(pa, pb, status);
1645 
1646     return float64_round_pack_canonical(pr, status);
1647 }
1648 
1649 /*
1650  * Float to Float conversions
1651  *
1652  * Returns the result of converting one float format to another. The
1653  * conversion is performed according to the IEC/IEEE Standard for
1654  * Binary Floating-Point Arithmetic.
1655  *
1656  * The float_to_float helper only needs to take care of raising
1657  * invalid exceptions and handling the conversion on NaNs.
1658  */
1659 
1660 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1661                                  float_status *s)
1662 {
1663     if (dstf->arm_althp) {
1664         switch (a.cls) {
1665         case float_class_qnan:
1666         case float_class_snan:
1667             /* There is no NaN in the destination format.  Raise Invalid
1668              * and return a zero with the sign of the input NaN.
1669              */
1670             s->float_exception_flags |= float_flag_invalid;
1671             a.cls = float_class_zero;
1672             a.frac = 0;
1673             a.exp = 0;
1674             break;
1675 
1676         case float_class_inf:
1677             /* There is no Inf in the destination format.  Raise Invalid
1678              * and return the maximum normal with the correct sign.
1679              */
1680             s->float_exception_flags |= float_flag_invalid;
1681             a.cls = float_class_normal;
1682             a.exp = dstf->exp_max;
1683             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1684             break;
1685 
1686         default:
1687             break;
1688         }
1689     } else if (is_nan(a.cls)) {
1690         if (is_snan(a.cls)) {
1691             s->float_exception_flags |= float_flag_invalid;
1692             a = parts_silence_nan(a, s);
1693         }
1694         if (s->default_nan_mode) {
1695             return parts_default_nan(s);
1696         }
1697     }
1698     return a;
1699 }
1700 
1701 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1702 {
1703     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1704     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1705     FloatParts pr = float_to_float(p, &float32_params, s);
1706     return float32_round_pack_canonical(pr, s);
1707 }
1708 
1709 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1710 {
1711     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1712     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1713     FloatParts pr = float_to_float(p, &float64_params, s);
1714     return float64_round_pack_canonical(pr, s);
1715 }
1716 
1717 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1718 {
1719     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1720     FloatParts p = float32_unpack_canonical(a, s);
1721     FloatParts pr = float_to_float(p, fmt16, s);
1722     return float16a_round_pack_canonical(pr, s, fmt16);
1723 }
1724 
1725 float64 float32_to_float64(float32 a, float_status *s)
1726 {
1727     FloatParts p = float32_unpack_canonical(a, s);
1728     FloatParts pr = float_to_float(p, &float64_params, s);
1729     return float64_round_pack_canonical(pr, s);
1730 }
1731 
1732 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1733 {
1734     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1735     FloatParts p = float64_unpack_canonical(a, s);
1736     FloatParts pr = float_to_float(p, fmt16, s);
1737     return float16a_round_pack_canonical(pr, s, fmt16);
1738 }
1739 
1740 float32 float64_to_float32(float64 a, float_status *s)
1741 {
1742     FloatParts p = float64_unpack_canonical(a, s);
1743     FloatParts pr = float_to_float(p, &float32_params, s);
1744     return float32_round_pack_canonical(pr, s);
1745 }
1746 
1747 /*
1748  * Rounds the floating-point value `a' to an integer, and returns the
1749  * result as a floating-point value. The operation is performed
1750  * according to the IEC/IEEE Standard for Binary Floating-Point
1751  * Arithmetic.
1752  */
1753 
1754 static FloatParts round_to_int(FloatParts a, int rmode,
1755                                int scale, float_status *s)
1756 {
1757     switch (a.cls) {
1758     case float_class_qnan:
1759     case float_class_snan:
1760         return return_nan(a, s);
1761 
1762     case float_class_zero:
1763     case float_class_inf:
1764         /* already "integral" */
1765         break;
1766 
1767     case float_class_normal:
1768         scale = MIN(MAX(scale, -0x10000), 0x10000);
1769         a.exp += scale;
1770 
1771         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1772             /* already integral */
1773             break;
1774         }
1775         if (a.exp < 0) {
1776             bool one;
1777             /* all fractional */
1778             s->float_exception_flags |= float_flag_inexact;
1779             switch (rmode) {
1780             case float_round_nearest_even:
1781                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1782                 break;
1783             case float_round_ties_away:
1784                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1785                 break;
1786             case float_round_to_zero:
1787                 one = false;
1788                 break;
1789             case float_round_up:
1790                 one = !a.sign;
1791                 break;
1792             case float_round_down:
1793                 one = a.sign;
1794                 break;
1795             default:
1796                 g_assert_not_reached();
1797             }
1798 
1799             if (one) {
1800                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1801                 a.exp = 0;
1802             } else {
1803                 a.cls = float_class_zero;
1804             }
1805         } else {
1806             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1807             uint64_t frac_lsbm1 = frac_lsb >> 1;
1808             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1809             uint64_t rnd_mask = rnd_even_mask >> 1;
1810             uint64_t inc;
1811 
1812             switch (rmode) {
1813             case float_round_nearest_even:
1814                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1815                 break;
1816             case float_round_ties_away:
1817                 inc = frac_lsbm1;
1818                 break;
1819             case float_round_to_zero:
1820                 inc = 0;
1821                 break;
1822             case float_round_up:
1823                 inc = a.sign ? 0 : rnd_mask;
1824                 break;
1825             case float_round_down:
1826                 inc = a.sign ? rnd_mask : 0;
1827                 break;
1828             default:
1829                 g_assert_not_reached();
1830             }
1831 
1832             if (a.frac & rnd_mask) {
1833                 s->float_exception_flags |= float_flag_inexact;
1834                 a.frac += inc;
1835                 a.frac &= ~rnd_mask;
1836                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1837                     a.frac >>= 1;
1838                     a.exp++;
1839                 }
1840             }
1841         }
1842         break;
1843     default:
1844         g_assert_not_reached();
1845     }
1846     return a;
1847 }
1848 
1849 float16 float16_round_to_int(float16 a, float_status *s)
1850 {
1851     FloatParts pa = float16_unpack_canonical(a, s);
1852     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1853     return float16_round_pack_canonical(pr, s);
1854 }
1855 
1856 float32 float32_round_to_int(float32 a, float_status *s)
1857 {
1858     FloatParts pa = float32_unpack_canonical(a, s);
1859     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1860     return float32_round_pack_canonical(pr, s);
1861 }
1862 
1863 float64 float64_round_to_int(float64 a, float_status *s)
1864 {
1865     FloatParts pa = float64_unpack_canonical(a, s);
1866     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1867     return float64_round_pack_canonical(pr, s);
1868 }
1869 
1870 /*
1871  * Returns the result of converting the floating-point value `a' to
1872  * the two's complement integer format. The conversion is performed
1873  * according to the IEC/IEEE Standard for Binary Floating-Point
1874  * Arithmetic---which means in particular that the conversion is
1875  * rounded according to the current rounding mode. If `a' is a NaN,
1876  * the largest positive integer is returned. Otherwise, if the
1877  * conversion overflows, the largest integer with the same sign as `a'
1878  * is returned.
1879 */
1880 
1881 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
1882                                      int64_t min, int64_t max,
1883                                      float_status *s)
1884 {
1885     uint64_t r;
1886     int orig_flags = get_float_exception_flags(s);
1887     FloatParts p = round_to_int(in, rmode, scale, s);
1888 
1889     switch (p.cls) {
1890     case float_class_snan:
1891     case float_class_qnan:
1892         s->float_exception_flags = orig_flags | float_flag_invalid;
1893         return max;
1894     case float_class_inf:
1895         s->float_exception_flags = orig_flags | float_flag_invalid;
1896         return p.sign ? min : max;
1897     case float_class_zero:
1898         return 0;
1899     case float_class_normal:
1900         if (p.exp < DECOMPOSED_BINARY_POINT) {
1901             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1902         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1903             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1904         } else {
1905             r = UINT64_MAX;
1906         }
1907         if (p.sign) {
1908             if (r <= -(uint64_t) min) {
1909                 return -r;
1910             } else {
1911                 s->float_exception_flags = orig_flags | float_flag_invalid;
1912                 return min;
1913             }
1914         } else {
1915             if (r <= max) {
1916                 return r;
1917             } else {
1918                 s->float_exception_flags = orig_flags | float_flag_invalid;
1919                 return max;
1920             }
1921         }
1922     default:
1923         g_assert_not_reached();
1924     }
1925 }
1926 
1927 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
1928                                 float_status *s)
1929 {
1930     return round_to_int_and_pack(float16_unpack_canonical(a, s),
1931                                  rmode, scale, INT16_MIN, INT16_MAX, s);
1932 }
1933 
1934 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
1935                                 float_status *s)
1936 {
1937     return round_to_int_and_pack(float16_unpack_canonical(a, s),
1938                                  rmode, scale, INT32_MIN, INT32_MAX, s);
1939 }
1940 
1941 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
1942                                 float_status *s)
1943 {
1944     return round_to_int_and_pack(float16_unpack_canonical(a, s),
1945                                  rmode, scale, INT64_MIN, INT64_MAX, s);
1946 }
1947 
1948 int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale,
1949                                 float_status *s)
1950 {
1951     return round_to_int_and_pack(float32_unpack_canonical(a, s),
1952                                  rmode, scale, INT16_MIN, INT16_MAX, s);
1953 }
1954 
1955 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale,
1956                                 float_status *s)
1957 {
1958     return round_to_int_and_pack(float32_unpack_canonical(a, s),
1959                                  rmode, scale, INT32_MIN, INT32_MAX, s);
1960 }
1961 
1962 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale,
1963                                 float_status *s)
1964 {
1965     return round_to_int_and_pack(float32_unpack_canonical(a, s),
1966                                  rmode, scale, INT64_MIN, INT64_MAX, s);
1967 }
1968 
1969 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale,
1970                                 float_status *s)
1971 {
1972     return round_to_int_and_pack(float64_unpack_canonical(a, s),
1973                                  rmode, scale, INT16_MIN, INT16_MAX, s);
1974 }
1975 
1976 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale,
1977                                 float_status *s)
1978 {
1979     return round_to_int_and_pack(float64_unpack_canonical(a, s),
1980                                  rmode, scale, INT32_MIN, INT32_MAX, s);
1981 }
1982 
1983 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale,
1984                                 float_status *s)
1985 {
1986     return round_to_int_and_pack(float64_unpack_canonical(a, s),
1987                                  rmode, scale, INT64_MIN, INT64_MAX, s);
1988 }
1989 
1990 int16_t float16_to_int16(float16 a, float_status *s)
1991 {
1992     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
1993 }
1994 
1995 int32_t float16_to_int32(float16 a, float_status *s)
1996 {
1997     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
1998 }
1999 
2000 int64_t float16_to_int64(float16 a, float_status *s)
2001 {
2002     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2003 }
2004 
2005 int16_t float32_to_int16(float32 a, float_status *s)
2006 {
2007     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2008 }
2009 
2010 int32_t float32_to_int32(float32 a, float_status *s)
2011 {
2012     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2013 }
2014 
2015 int64_t float32_to_int64(float32 a, float_status *s)
2016 {
2017     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2018 }
2019 
2020 int16_t float64_to_int16(float64 a, float_status *s)
2021 {
2022     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2023 }
2024 
2025 int32_t float64_to_int32(float64 a, float_status *s)
2026 {
2027     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2028 }
2029 
2030 int64_t float64_to_int64(float64 a, float_status *s)
2031 {
2032     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2033 }
2034 
2035 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2036 {
2037     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2038 }
2039 
2040 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2041 {
2042     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2043 }
2044 
2045 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2046 {
2047     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2048 }
2049 
2050 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2051 {
2052     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2053 }
2054 
2055 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2056 {
2057     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2058 }
2059 
2060 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2061 {
2062     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2063 }
2064 
2065 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2066 {
2067     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2068 }
2069 
2070 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2071 {
2072     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2073 }
2074 
2075 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2076 {
2077     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2078 }
2079 
/*
 *  Returns the result of converting the floating-point value `a' to
 *  the unsigned integer format. The conversion is performed according
 *  to the IEC/IEEE Standard for Binary Floating-Point
 *  Arithmetic---which means in particular that the conversion is
 *  rounded according to the current rounding mode. If `a' is a NaN,
 *  the largest unsigned integer is returned. Otherwise, if the
 *  conversion overflows, the largest unsigned integer is returned. If
 *  `a' is negative, the value is rounded and zero is returned;
 *  negative values that do not round to zero raise the invalid
 *  exception flag.
 */
2092 
2093 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale,
2094                                        uint64_t max, float_status *s)
2095 {
2096     int orig_flags = get_float_exception_flags(s);
2097     FloatParts p = round_to_int(in, rmode, scale, s);
2098     uint64_t r;
2099 
2100     switch (p.cls) {
2101     case float_class_snan:
2102     case float_class_qnan:
2103         s->float_exception_flags = orig_flags | float_flag_invalid;
2104         return max;
2105     case float_class_inf:
2106         s->float_exception_flags = orig_flags | float_flag_invalid;
2107         return p.sign ? 0 : max;
2108     case float_class_zero:
2109         return 0;
2110     case float_class_normal:
2111         if (p.sign) {
2112             s->float_exception_flags = orig_flags | float_flag_invalid;
2113             return 0;
2114         }
2115 
2116         if (p.exp < DECOMPOSED_BINARY_POINT) {
2117             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2118         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2119             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2120         } else {
2121             s->float_exception_flags = orig_flags | float_flag_invalid;
2122             return max;
2123         }
2124 
2125         /* For uint64 this will never trip, but if p.exp is too large
2126          * to shift a decomposed fraction we shall have exited via the
2127          * 3rd leg above.
2128          */
2129         if (r > max) {
2130             s->float_exception_flags = orig_flags | float_flag_invalid;
2131             return max;
2132         }
2133         return r;
2134     default:
2135         g_assert_not_reached();
2136     }
2137 }
2138 
2139 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale,
2140                                   float_status *s)
2141 {
2142     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2143                                   rmode, scale, UINT16_MAX, s);
2144 }
2145 
2146 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale,
2147                                   float_status *s)
2148 {
2149     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2150                                   rmode, scale, UINT32_MAX, s);
2151 }
2152 
2153 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale,
2154                                   float_status *s)
2155 {
2156     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2157                                   rmode, scale, UINT64_MAX, s);
2158 }
2159 
2160 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale,
2161                                   float_status *s)
2162 {
2163     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2164                                   rmode, scale, UINT16_MAX, s);
2165 }
2166 
2167 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale,
2168                                   float_status *s)
2169 {
2170     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2171                                   rmode, scale, UINT32_MAX, s);
2172 }
2173 
2174 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale,
2175                                   float_status *s)
2176 {
2177     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2178                                   rmode, scale, UINT64_MAX, s);
2179 }
2180 
2181 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale,
2182                                   float_status *s)
2183 {
2184     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2185                                   rmode, scale, UINT16_MAX, s);
2186 }
2187 
2188 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale,
2189                                   float_status *s)
2190 {
2191     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2192                                   rmode, scale, UINT32_MAX, s);
2193 }
2194 
2195 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale,
2196                                   float_status *s)
2197 {
2198     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2199                                   rmode, scale, UINT64_MAX, s);
2200 }
2201 
2202 uint16_t float16_to_uint16(float16 a, float_status *s)
2203 {
2204     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2205 }
2206 
2207 uint32_t float16_to_uint32(float16 a, float_status *s)
2208 {
2209     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2210 }
2211 
2212 uint64_t float16_to_uint64(float16 a, float_status *s)
2213 {
2214     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2215 }
2216 
2217 uint16_t float32_to_uint16(float32 a, float_status *s)
2218 {
2219     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2220 }
2221 
2222 uint32_t float32_to_uint32(float32 a, float_status *s)
2223 {
2224     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2225 }
2226 
2227 uint64_t float32_to_uint64(float32 a, float_status *s)
2228 {
2229     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2230 }
2231 
2232 uint16_t float64_to_uint16(float64 a, float_status *s)
2233 {
2234     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2235 }
2236 
2237 uint32_t float64_to_uint32(float64 a, float_status *s)
2238 {
2239     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2240 }
2241 
2242 uint64_t float64_to_uint64(float64 a, float_status *s)
2243 {
2244     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2245 }
2246 
2247 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2248 {
2249     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2250 }
2251 
2252 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2253 {
2254     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2255 }
2256 
2257 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2258 {
2259     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2260 }
2261 
2262 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2263 {
2264     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2265 }
2266 
2267 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2268 {
2269     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2270 }
2271 
2272 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2273 {
2274     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2275 }
2276 
2277 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2278 {
2279     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2280 }
2281 
2282 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2283 {
2284     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2285 }
2286 
2287 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2288 {
2289     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2290 }
2291 
2292 /*
2293  * Integer to float conversions
2294  *
2295  * Returns the result of converting the two's complement integer `a'
2296  * to the floating-point format. The conversion is performed according
2297  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2298  */
2299 
2300 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
2301 {
2302     FloatParts r = { .sign = false };
2303 
2304     if (a == 0) {
2305         r.cls = float_class_zero;
2306     } else {
2307         uint64_t f = a;
2308         int shift;
2309 
2310         r.cls = float_class_normal;
2311         if (a < 0) {
2312             f = -f;
2313             r.sign = true;
2314         }
2315         shift = clz64(f) - 1;
2316         scale = MIN(MAX(scale, -0x10000), 0x10000);
2317 
2318         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2319         r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
2320     }
2321 
2322     return r;
2323 }
2324 
2325 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2326 {
2327     FloatParts pa = int_to_float(a, scale, status);
2328     return float16_round_pack_canonical(pa, status);
2329 }
2330 
2331 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2332 {
2333     return int64_to_float16_scalbn(a, scale, status);
2334 }
2335 
2336 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2337 {
2338     return int64_to_float16_scalbn(a, scale, status);
2339 }
2340 
2341 float16 int64_to_float16(int64_t a, float_status *status)
2342 {
2343     return int64_to_float16_scalbn(a, 0, status);
2344 }
2345 
2346 float16 int32_to_float16(int32_t a, float_status *status)
2347 {
2348     return int64_to_float16_scalbn(a, 0, status);
2349 }
2350 
2351 float16 int16_to_float16(int16_t a, float_status *status)
2352 {
2353     return int64_to_float16_scalbn(a, 0, status);
2354 }
2355 
2356 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2357 {
2358     FloatParts pa = int_to_float(a, scale, status);
2359     return float32_round_pack_canonical(pa, status);
2360 }
2361 
2362 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2363 {
2364     return int64_to_float32_scalbn(a, scale, status);
2365 }
2366 
2367 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2368 {
2369     return int64_to_float32_scalbn(a, scale, status);
2370 }
2371 
2372 float32 int64_to_float32(int64_t a, float_status *status)
2373 {
2374     return int64_to_float32_scalbn(a, 0, status);
2375 }
2376 
2377 float32 int32_to_float32(int32_t a, float_status *status)
2378 {
2379     return int64_to_float32_scalbn(a, 0, status);
2380 }
2381 
2382 float32 int16_to_float32(int16_t a, float_status *status)
2383 {
2384     return int64_to_float32_scalbn(a, 0, status);
2385 }
2386 
2387 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2388 {
2389     FloatParts pa = int_to_float(a, scale, status);
2390     return float64_round_pack_canonical(pa, status);
2391 }
2392 
2393 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2394 {
2395     return int64_to_float64_scalbn(a, scale, status);
2396 }
2397 
2398 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2399 {
2400     return int64_to_float64_scalbn(a, scale, status);
2401 }
2402 
2403 float64 int64_to_float64(int64_t a, float_status *status)
2404 {
2405     return int64_to_float64_scalbn(a, 0, status);
2406 }
2407 
2408 float64 int32_to_float64(int32_t a, float_status *status)
2409 {
2410     return int64_to_float64_scalbn(a, 0, status);
2411 }
2412 
2413 float64 int16_to_float64(int16_t a, float_status *status)
2414 {
2415     return int64_to_float64_scalbn(a, 0, status);
2416 }
2417 
2418 
2419 /*
2420  * Unsigned Integer to float conversions
2421  *
2422  * Returns the result of converting the unsigned integer `a' to the
2423  * floating-point format. The conversion is performed according to the
2424  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2425  */
2426 
2427 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
2428 {
2429     FloatParts r = { .sign = false };
2430 
2431     if (a == 0) {
2432         r.cls = float_class_zero;
2433     } else {
2434         scale = MIN(MAX(scale, -0x10000), 0x10000);
2435         r.cls = float_class_normal;
2436         if ((int64_t)a < 0) {
2437             r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
2438             shift64RightJamming(a, 1, &a);
2439             r.frac = a;
2440         } else {
2441             int shift = clz64(a) - 1;
2442             r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2443             r.frac = a << shift;
2444         }
2445     }
2446 
2447     return r;
2448 }
2449 
2450 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2451 {
2452     FloatParts pa = uint_to_float(a, scale, status);
2453     return float16_round_pack_canonical(pa, status);
2454 }
2455 
2456 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2457 {
2458     return uint64_to_float16_scalbn(a, scale, status);
2459 }
2460 
2461 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2462 {
2463     return uint64_to_float16_scalbn(a, scale, status);
2464 }
2465 
2466 float16 uint64_to_float16(uint64_t a, float_status *status)
2467 {
2468     return uint64_to_float16_scalbn(a, 0, status);
2469 }
2470 
2471 float16 uint32_to_float16(uint32_t a, float_status *status)
2472 {
2473     return uint64_to_float16_scalbn(a, 0, status);
2474 }
2475 
2476 float16 uint16_to_float16(uint16_t a, float_status *status)
2477 {
2478     return uint64_to_float16_scalbn(a, 0, status);
2479 }
2480 
2481 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2482 {
2483     FloatParts pa = uint_to_float(a, scale, status);
2484     return float32_round_pack_canonical(pa, status);
2485 }
2486 
2487 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2488 {
2489     return uint64_to_float32_scalbn(a, scale, status);
2490 }
2491 
2492 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2493 {
2494     return uint64_to_float32_scalbn(a, scale, status);
2495 }
2496 
2497 float32 uint64_to_float32(uint64_t a, float_status *status)
2498 {
2499     return uint64_to_float32_scalbn(a, 0, status);
2500 }
2501 
2502 float32 uint32_to_float32(uint32_t a, float_status *status)
2503 {
2504     return uint64_to_float32_scalbn(a, 0, status);
2505 }
2506 
2507 float32 uint16_to_float32(uint16_t a, float_status *status)
2508 {
2509     return uint64_to_float32_scalbn(a, 0, status);
2510 }
2511 
2512 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2513 {
2514     FloatParts pa = uint_to_float(a, scale, status);
2515     return float64_round_pack_canonical(pa, status);
2516 }
2517 
2518 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2519 {
2520     return uint64_to_float64_scalbn(a, scale, status);
2521 }
2522 
2523 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2524 {
2525     return uint64_to_float64_scalbn(a, scale, status);
2526 }
2527 
2528 float64 uint64_to_float64(uint64_t a, float_status *status)
2529 {
2530     return uint64_to_float64_scalbn(a, 0, status);
2531 }
2532 
2533 float64 uint32_to_float64(uint32_t a, float_status *status)
2534 {
2535     return uint64_to_float64_scalbn(a, 0, status);
2536 }
2537 
2538 float64 uint16_to_float64(uint16_t a, float_status *status)
2539 {
2540     return uint64_to_float64_scalbn(a, 0, status);
2541 }
2542 
2543 /* Float Min/Max */
2544 /* min() and max() functions. These can't be implemented as
2545  * 'compare and pick one input' because that would mishandle
2546  * NaNs and +0 vs -0.
2547  *
2548  * minnum() and maxnum() functions. These are similar to the min()
2549  * and max() functions but if one of the arguments is a QNaN and
2550  * the other is numerical then the numerical argument is returned.
2551  * SNaNs will get quietened before being returned.
2552  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2553  * and maxNum() operations. min() and max() are the typical min/max
2554  * semantics provided by many CPUs which predate that specification.
2555  *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from IEEE 754-2008.
2558  */
2559 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2560                                 bool ieee, bool ismag, float_status *s)
2561 {
2562     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2563         if (ieee) {
2564             /* Takes two floating-point values `a' and `b', one of
2565              * which is a NaN, and returns the appropriate NaN
2566              * result. If either `a' or `b' is a signaling NaN,
2567              * the invalid exception is raised.
2568              */
2569             if (is_snan(a.cls) || is_snan(b.cls)) {
2570                 return pick_nan(a, b, s);
2571             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2572                 return b;
2573             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2574                 return a;
2575             }
2576         }
2577         return pick_nan(a, b, s);
2578     } else {
2579         int a_exp, b_exp;
2580 
2581         switch (a.cls) {
2582         case float_class_normal:
2583             a_exp = a.exp;
2584             break;
2585         case float_class_inf:
2586             a_exp = INT_MAX;
2587             break;
2588         case float_class_zero:
2589             a_exp = INT_MIN;
2590             break;
2591         default:
2592             g_assert_not_reached();
2593             break;
2594         }
2595         switch (b.cls) {
2596         case float_class_normal:
2597             b_exp = b.exp;
2598             break;
2599         case float_class_inf:
2600             b_exp = INT_MAX;
2601             break;
2602         case float_class_zero:
2603             b_exp = INT_MIN;
2604             break;
2605         default:
2606             g_assert_not_reached();
2607             break;
2608         }
2609 
2610         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2611             bool a_less = a_exp < b_exp;
2612             if (a_exp == b_exp) {
2613                 a_less = a.frac < b.frac;
2614             }
2615             return a_less ^ ismin ? b : a;
2616         }
2617 
2618         if (a.sign == b.sign) {
2619             bool a_less = a_exp < b_exp;
2620             if (a_exp == b_exp) {
2621                 a_less = a.frac < b.frac;
2622             }
2623             return a.sign ^ a_less ^ ismin ? b : a;
2624         } else {
2625             return a.sign ^ ismin ? b : a;
2626         }
2627     }
2628 }
2629 
2630 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
2631 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
2632                                      float_status *s)                   \
2633 {                                                                       \
2634     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2635     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2636     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
2637                                                                         \
2638     return float ## sz ## _round_pack_canonical(pr, s);                 \
2639 }
2640 
2641 MINMAX(16, min, true, false, false)
2642 MINMAX(16, minnum, true, true, false)
2643 MINMAX(16, minnummag, true, true, true)
2644 MINMAX(16, max, false, false, false)
2645 MINMAX(16, maxnum, false, true, false)
2646 MINMAX(16, maxnummag, false, true, true)
2647 
2648 MINMAX(32, min, true, false, false)
2649 MINMAX(32, minnum, true, true, false)
2650 MINMAX(32, minnummag, true, true, true)
2651 MINMAX(32, max, false, false, false)
2652 MINMAX(32, maxnum, false, true, false)
2653 MINMAX(32, maxnummag, false, true, true)
2654 
2655 MINMAX(64, min, true, false, false)
2656 MINMAX(64, minnum, true, true, false)
2657 MINMAX(64, minnummag, true, true, true)
2658 MINMAX(64, max, false, false, false)
2659 MINMAX(64, maxnum, false, true, false)
2660 MINMAX(64, maxnummag, false, true, true)
2661 
2662 #undef MINMAX
2663 
2664 /* Floating point compare */
2665 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2666                           float_status *s)
2667 {
2668     if (is_nan(a.cls) || is_nan(b.cls)) {
2669         if (!is_quiet ||
2670             a.cls == float_class_snan ||
2671             b.cls == float_class_snan) {
2672             s->float_exception_flags |= float_flag_invalid;
2673         }
2674         return float_relation_unordered;
2675     }
2676 
2677     if (a.cls == float_class_zero) {
2678         if (b.cls == float_class_zero) {
2679             return float_relation_equal;
2680         }
2681         return b.sign ? float_relation_greater : float_relation_less;
2682     } else if (b.cls == float_class_zero) {
2683         return a.sign ? float_relation_less : float_relation_greater;
2684     }
2685 
2686     /* The only really important thing about infinity is its sign. If
2687      * both are infinities the sign marks the smallest of the two.
2688      */
2689     if (a.cls == float_class_inf) {
2690         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2691             return float_relation_equal;
2692         }
2693         return a.sign ? float_relation_less : float_relation_greater;
2694     } else if (b.cls == float_class_inf) {
2695         return b.sign ? float_relation_greater : float_relation_less;
2696     }
2697 
2698     if (a.sign != b.sign) {
2699         return a.sign ? float_relation_less : float_relation_greater;
2700     }
2701 
2702     if (a.exp == b.exp) {
2703         if (a.frac == b.frac) {
2704             return float_relation_equal;
2705         }
2706         if (a.sign) {
2707             return a.frac > b.frac ?
2708                 float_relation_less : float_relation_greater;
2709         } else {
2710             return a.frac > b.frac ?
2711                 float_relation_greater : float_relation_less;
2712         }
2713     } else {
2714         if (a.sign) {
2715             return a.exp > b.exp ? float_relation_less : float_relation_greater;
2716         } else {
2717             return a.exp > b.exp ? float_relation_greater : float_relation_less;
2718         }
2719     }
2720 }
2721 
2722 #define COMPARE(sz)                                                     \
2723 int float ## sz ## _compare(float ## sz a, float ## sz b,               \
2724                             float_status *s)                            \
2725 {                                                                       \
2726     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2727     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2728     return compare_floats(pa, pb, false, s);                            \
2729 }                                                                       \
2730 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b,         \
2731                                   float_status *s)                      \
2732 {                                                                       \
2733     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2734     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2735     return compare_floats(pa, pb, true, s);                             \
2736 }
2737 
2738 COMPARE(16)
2739 COMPARE(32)
2740 COMPARE(64)
2741 
2742 #undef COMPARE
2743 
2744 /* Multiply A by 2 raised to the power N.  */
2745 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
2746 {
2747     if (unlikely(is_nan(a.cls))) {
2748         return return_nan(a, s);
2749     }
2750     if (a.cls == float_class_normal) {
2751         /* The largest float type (even though not supported by FloatParts)
2752          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
2753          * still allows rounding to infinity, without allowing overflow
2754          * within the int32_t that backs FloatParts.exp.
2755          */
2756         n = MIN(MAX(n, -0x10000), 0x10000);
2757         a.exp += n;
2758     }
2759     return a;
2760 }
2761 
2762 float16 float16_scalbn(float16 a, int n, float_status *status)
2763 {
2764     FloatParts pa = float16_unpack_canonical(a, status);
2765     FloatParts pr = scalbn_decomposed(pa, n, status);
2766     return float16_round_pack_canonical(pr, status);
2767 }
2768 
2769 float32 float32_scalbn(float32 a, int n, float_status *status)
2770 {
2771     FloatParts pa = float32_unpack_canonical(a, status);
2772     FloatParts pr = scalbn_decomposed(pa, n, status);
2773     return float32_round_pack_canonical(pr, status);
2774 }
2775 
2776 float64 float64_scalbn(float64 a, int n, float_status *status)
2777 {
2778     FloatParts pa = float64_unpack_canonical(a, status);
2779     FloatParts pr = scalbn_decomposed(pa, n, status);
2780     return float64_round_pack_canonical(pr, status);
2781 }
2782 
2783 /*
2784  * Square Root
2785  *
2786  * The old softfloat code did an approximation step before zeroing in
2787  * on the final result. However for simpleness we just compute the
2788  * square root by iterating down from the implicit bit to enough extra
2789  * bits to ensure we get a correctly rounded result.
2790  *
2791  * This does mean however the calculation is slower than before,
2792  * especially for 64 bit floats.
2793  */
2794 
2795 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
2796 {
2797     uint64_t a_frac, r_frac, s_frac;
2798     int bit, last_bit;
2799 
2800     if (is_nan(a.cls)) {
2801         return return_nan(a, s);
2802     }
2803     if (a.cls == float_class_zero) {
2804         return a;  /* sqrt(+-0) = +-0 */
2805     }
2806     if (a.sign) {
2807         s->float_exception_flags |= float_flag_invalid;
2808         return parts_default_nan(s);
2809     }
2810     if (a.cls == float_class_inf) {
2811         return a;  /* sqrt(+inf) = +inf */
2812     }
2813 
2814     assert(a.cls == float_class_normal);
2815 
2816     /* We need two overflow bits at the top. Adding room for that is a
2817      * right shift. If the exponent is odd, we can discard the low bit
2818      * by multiplying the fraction by 2; that's a left shift. Combine
2819      * those and we shift right if the exponent is even.
2820      */
2821     a_frac = a.frac;
2822     if (!(a.exp & 1)) {
2823         a_frac >>= 1;
2824     }
2825     a.exp >>= 1;
2826 
2827     /* Bit-by-bit computation of sqrt.  */
2828     r_frac = 0;
2829     s_frac = 0;
2830 
2831     /* Iterate from implicit bit down to the 3 extra bits to compute a
2832      * properly rounded result. Remember we've inserted one more bit
2833      * at the top, so these positions are one less.
2834      */
2835     bit = DECOMPOSED_BINARY_POINT - 1;
2836     last_bit = MAX(p->frac_shift - 4, 0);
2837     do {
2838         uint64_t q = 1ULL << bit;
2839         uint64_t t_frac = s_frac + q;
2840         if (t_frac <= a_frac) {
2841             s_frac = t_frac + q;
2842             a_frac -= t_frac;
2843             r_frac += q;
2844         }
2845         a_frac <<= 1;
2846     } while (--bit >= last_bit);
2847 
2848     /* Undo the right shift done above. If there is any remaining
2849      * fraction, the result is inexact. Set the sticky bit.
2850      */
2851     a.frac = (r_frac << 1) + (a_frac != 0);
2852 
2853     return a;
2854 }
2855 
2856 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
2857 {
2858     FloatParts pa = float16_unpack_canonical(a, status);
2859     FloatParts pr = sqrt_float(pa, status, &float16_params);
2860     return float16_round_pack_canonical(pr, status);
2861 }
2862 
2863 float32 QEMU_FLATTEN float32_sqrt(float32 a, float_status *status)
2864 {
2865     FloatParts pa = float32_unpack_canonical(a, status);
2866     FloatParts pr = sqrt_float(pa, status, &float32_params);
2867     return float32_round_pack_canonical(pr, status);
2868 }
2869 
2870 float64 QEMU_FLATTEN float64_sqrt(float64 a, float_status *status)
2871 {
2872     FloatParts pa = float64_unpack_canonical(a, status);
2873     FloatParts pr = sqrt_float(pa, status, &float64_params);
2874     return float64_round_pack_canonical(pr, status);
2875 }
2876 
2877 /*----------------------------------------------------------------------------
2878 | The pattern for a default generated NaN.
2879 *----------------------------------------------------------------------------*/
2880 
2881 float16 float16_default_nan(float_status *status)
2882 {
2883     FloatParts p = parts_default_nan(status);
2884     p.frac >>= float16_params.frac_shift;
2885     return float16_pack_raw(p);
2886 }
2887 
2888 float32 float32_default_nan(float_status *status)
2889 {
2890     FloatParts p = parts_default_nan(status);
2891     p.frac >>= float32_params.frac_shift;
2892     return float32_pack_raw(p);
2893 }
2894 
2895 float64 float64_default_nan(float_status *status)
2896 {
2897     FloatParts p = parts_default_nan(status);
2898     p.frac >>= float64_params.frac_shift;
2899     return float64_pack_raw(p);
2900 }
2901 
2902 float128 float128_default_nan(float_status *status)
2903 {
2904     FloatParts p = parts_default_nan(status);
2905     float128 r;
2906 
2907     /* Extrapolate from the choices made by parts_default_nan to fill
2908      * in the quad-floating format.  If the low bit is set, assume we
2909      * want to set all non-snan bits.
2910      */
2911     r.low = -(p.frac & 1);
2912     r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
2913     r.high |= LIT64(0x7FFF000000000000);
2914     r.high |= (uint64_t)p.sign << 63;
2915 
2916     return r;
2917 }
2918 
2919 /*----------------------------------------------------------------------------
2920 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
2921 *----------------------------------------------------------------------------*/
2922 
2923 float16 float16_silence_nan(float16 a, float_status *status)
2924 {
2925     FloatParts p = float16_unpack_raw(a);
2926     p.frac <<= float16_params.frac_shift;
2927     p = parts_silence_nan(p, status);
2928     p.frac >>= float16_params.frac_shift;
2929     return float16_pack_raw(p);
2930 }
2931 
2932 float32 float32_silence_nan(float32 a, float_status *status)
2933 {
2934     FloatParts p = float32_unpack_raw(a);
2935     p.frac <<= float32_params.frac_shift;
2936     p = parts_silence_nan(p, status);
2937     p.frac >>= float32_params.frac_shift;
2938     return float32_pack_raw(p);
2939 }
2940 
2941 float64 float64_silence_nan(float64 a, float_status *status)
2942 {
2943     FloatParts p = float64_unpack_raw(a);
2944     p.frac <<= float64_params.frac_shift;
2945     p = parts_silence_nan(p, status);
2946     p.frac >>= float64_params.frac_shift;
2947     return float64_pack_raw(p);
2948 }
2949 
2950 /*----------------------------------------------------------------------------
2951 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2952 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2953 | input.  If `zSign' is 1, the input is negated before being converted to an
2954 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
2955 | is simply rounded to an integer, with the inexact exception raised if the
2956 | input cannot be represented exactly as an integer.  However, if the fixed-
2957 | point input is too large, the invalid exception is raised and the largest
2958 | positive or negative integer is returned.
2959 *----------------------------------------------------------------------------*/
2960 
2961 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2962 {
2963     int8_t roundingMode;
2964     flag roundNearestEven;
2965     int8_t roundIncrement, roundBits;
2966     int32_t z;
2967 
2968     roundingMode = status->float_rounding_mode;
2969     roundNearestEven = ( roundingMode == float_round_nearest_even );
2970     switch (roundingMode) {
2971     case float_round_nearest_even:
2972     case float_round_ties_away:
2973         roundIncrement = 0x40;
2974         break;
2975     case float_round_to_zero:
2976         roundIncrement = 0;
2977         break;
2978     case float_round_up:
2979         roundIncrement = zSign ? 0 : 0x7f;
2980         break;
2981     case float_round_down:
2982         roundIncrement = zSign ? 0x7f : 0;
2983         break;
2984     default:
2985         abort();
2986     }
2987     roundBits = absZ & 0x7F;
2988     absZ = ( absZ + roundIncrement )>>7;
2989     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2990     z = absZ;
2991     if ( zSign ) z = - z;
2992     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2993         float_raise(float_flag_invalid, status);
2994         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2995     }
2996     if (roundBits) {
2997         status->float_exception_flags |= float_flag_inexact;
2998     }
2999     return z;
3000 
3001 }
3002 
3003 /*----------------------------------------------------------------------------
3004 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3005 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3006 | and returns the properly rounded 64-bit integer corresponding to the input.
3007 | If `zSign' is 1, the input is negated before being converted to an integer.
3008 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3009 | the inexact exception raised if the input cannot be represented exactly as
3010 | an integer.  However, if the fixed-point input is too large, the invalid
3011 | exception is raised and the largest positive or negative integer is
3012 | returned.
3013 *----------------------------------------------------------------------------*/
3014 
3015 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
3016                                float_status *status)
3017 {
3018     int8_t roundingMode;
3019     flag roundNearestEven, increment;
3020     int64_t z;
3021 
3022     roundingMode = status->float_rounding_mode;
3023     roundNearestEven = ( roundingMode == float_round_nearest_even );
3024     switch (roundingMode) {
3025     case float_round_nearest_even:
3026     case float_round_ties_away:
3027         increment = ((int64_t) absZ1 < 0);
3028         break;
3029     case float_round_to_zero:
3030         increment = 0;
3031         break;
3032     case float_round_up:
3033         increment = !zSign && absZ1;
3034         break;
3035     case float_round_down:
3036         increment = zSign && absZ1;
3037         break;
3038     default:
3039         abort();
3040     }
3041     if ( increment ) {
3042         ++absZ0;
3043         if ( absZ0 == 0 ) goto overflow;
3044         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
3045     }
3046     z = absZ0;
3047     if ( zSign ) z = - z;
3048     if ( z && ( ( z < 0 ) ^ zSign ) ) {
3049  overflow:
3050         float_raise(float_flag_invalid, status);
3051         return
3052               zSign ? (int64_t) LIT64( 0x8000000000000000 )
3053             : LIT64( 0x7FFFFFFFFFFFFFFF );
3054     }
3055     if (absZ1) {
3056         status->float_exception_flags |= float_flag_inexact;
3057     }
3058     return z;
3059 
3060 }
3061 
3062 /*----------------------------------------------------------------------------
3063 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3064 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3065 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3066 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
3067 | with the inexact exception raised if the input cannot be represented exactly
3068 | as an integer.  However, if the fixed-point input is too large, the invalid
3069 | exception is raised and the largest unsigned integer is returned.
3070 *----------------------------------------------------------------------------*/
3071 
3072 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
3073                                 uint64_t absZ1, float_status *status)
3074 {
3075     int8_t roundingMode;
3076     flag roundNearestEven, increment;
3077 
3078     roundingMode = status->float_rounding_mode;
3079     roundNearestEven = (roundingMode == float_round_nearest_even);
3080     switch (roundingMode) {
3081     case float_round_nearest_even:
3082     case float_round_ties_away:
3083         increment = ((int64_t)absZ1 < 0);
3084         break;
3085     case float_round_to_zero:
3086         increment = 0;
3087         break;
3088     case float_round_up:
3089         increment = !zSign && absZ1;
3090         break;
3091     case float_round_down:
3092         increment = zSign && absZ1;
3093         break;
3094     default:
3095         abort();
3096     }
3097     if (increment) {
3098         ++absZ0;
3099         if (absZ0 == 0) {
3100             float_raise(float_flag_invalid, status);
3101             return LIT64(0xFFFFFFFFFFFFFFFF);
3102         }
3103         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
3104     }
3105 
3106     if (zSign && absZ0) {
3107         float_raise(float_flag_invalid, status);
3108         return 0;
3109     }
3110 
3111     if (absZ1) {
3112         status->float_exception_flags |= float_flag_inexact;
3113     }
3114     return absZ0;
3115 }
3116 
3117 /*----------------------------------------------------------------------------
3118 | If `a' is denormal and we are in flush-to-zero mode then set the
3119 | input-denormal exception and return zero. Otherwise just return the value.
3120 *----------------------------------------------------------------------------*/
3121 float32 float32_squash_input_denormal(float32 a, float_status *status)
3122 {
3123     if (status->flush_inputs_to_zero) {
3124         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
3125             float_raise(float_flag_input_denormal, status);
3126             return make_float32(float32_val(a) & 0x80000000);
3127         }
3128     }
3129     return a;
3130 }
3131 
/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision floating-point value represented
| by the denormalized significand `aSig'.  The normalized exponent and
| significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /* Shift that leaves exactly 8 leading zero bits in the significand. */
    const int8_t shift = clz32(aSig) - 8;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
3149 
3150 /*----------------------------------------------------------------------------
3151 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3152 | and significand `zSig', and returns the proper single-precision floating-
3153 | point value corresponding to the abstract input.  Ordinarily, the abstract
3154 | value is simply rounded and packed into the single-precision format, with
3155 | the inexact exception raised if the abstract input cannot be represented
3156 | exactly.  However, if the abstract value is too large, the overflow and
3157 | inexact exceptions are raised and an infinity or maximal finite value is
3158 | returned.  If the abstract value is too small, the input value is rounded to
3159 | a subnormal number, and the underflow and inexact exceptions are raised if
3160 | the abstract input cannot be represented exactly as a subnormal single-
3161 | precision floating-point number.
3162 |     The input significand `zSig' has its binary point between bits 30
3163 | and 29, which is 7 bits to the left of the usual location.  This shifted
3164 | significand must be normalized or smaller.  If `zSig' is not normalized,
3165 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3166 | and it must not require rounding.  In the usual case that `zSig' is
3167 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3168 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3169 | Binary Floating-Point Arithmetic.
3170 *----------------------------------------------------------------------------*/
3171 
3172 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3173                                    float_status *status)
3174 {
3175     int8_t roundingMode;
3176     flag roundNearestEven;
3177     int8_t roundIncrement, roundBits;
3178     flag isTiny;
3179 
3180     roundingMode = status->float_rounding_mode;
3181     roundNearestEven = ( roundingMode == float_round_nearest_even );
3182     switch (roundingMode) {
3183     case float_round_nearest_even:
3184     case float_round_ties_away:
3185         roundIncrement = 0x40;
3186         break;
3187     case float_round_to_zero:
3188         roundIncrement = 0;
3189         break;
3190     case float_round_up:
3191         roundIncrement = zSign ? 0 : 0x7f;
3192         break;
3193     case float_round_down:
3194         roundIncrement = zSign ? 0x7f : 0;
3195         break;
3196     default:
3197         abort();
3198         break;
3199     }
3200     roundBits = zSig & 0x7F;
3201     if ( 0xFD <= (uint16_t) zExp ) {
3202         if (    ( 0xFD < zExp )
3203              || (    ( zExp == 0xFD )
3204                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
3205            ) {
3206             float_raise(float_flag_overflow | float_flag_inexact, status);
3207             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
3208         }
3209         if ( zExp < 0 ) {
3210             if (status->flush_to_zero) {
3211                 float_raise(float_flag_output_denormal, status);
3212                 return packFloat32(zSign, 0, 0);
3213             }
3214             isTiny =
3215                 (status->float_detect_tininess
3216                  == float_tininess_before_rounding)
3217                 || ( zExp < -1 )
3218                 || ( zSig + roundIncrement < 0x80000000 );
3219             shift32RightJamming( zSig, - zExp, &zSig );
3220             zExp = 0;
3221             roundBits = zSig & 0x7F;
3222             if (isTiny && roundBits) {
3223                 float_raise(float_flag_underflow, status);
3224             }
3225         }
3226     }
3227     if (roundBits) {
3228         status->float_exception_flags |= float_flag_inexact;
3229     }
3230     zSig = ( zSig + roundIncrement )>>7;
3231     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3232     if ( zSig == 0 ) zExp = 0;
3233     return packFloat32( zSign, zExp, zSig );
3234 
3235 }
3236 
3237 /*----------------------------------------------------------------------------
3238 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3239 | and significand `zSig', and returns the proper single-precision floating-
3240 | point value corresponding to the abstract input.  This routine is just like
3241 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3242 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3243 | floating-point exponent.
3244 *----------------------------------------------------------------------------*/
3245 
3246 static float32
3247  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3248                               float_status *status)
3249 {
3250     int8_t shiftCount;
3251 
3252     shiftCount = clz32(zSig) - 1;
3253     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
3254                                status);
3255 
3256 }
3257 
3258 /*----------------------------------------------------------------------------
3259 | If `a' is denormal and we are in flush-to-zero mode then set the
3260 | input-denormal exception and return zero. Otherwise just return the value.
3261 *----------------------------------------------------------------------------*/
3262 float64 float64_squash_input_denormal(float64 a, float_status *status)
3263 {
3264     if (status->flush_inputs_to_zero) {
3265         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
3266             float_raise(float_flag_input_denormal, status);
3267             return make_float64(float64_val(a) & (1ULL << 63));
3268         }
3269     }
3270     return a;
3271 }
3272 
/*----------------------------------------------------------------------------
| Normalizes the subnormal double-precision floating-point value represented
| by the denormalized significand `aSig'.  The normalized exponent and
| significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    /* Shift that leaves exactly 11 leading zero bits in the significand. */
    const int8_t shift = clz64(aSig) - 11;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
3290 
3291 /*----------------------------------------------------------------------------
3292 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3293 | double-precision floating-point value, returning the result.  After being
3294 | shifted into the proper positions, the three fields are simply added
3295 | together to form the result.  This means that any integer portion of `zSig'
3296 | will be added into the exponent.  Since a properly normalized significand
3297 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3298 | than the desired result exponent whenever `zSig' is a complete, normalized
3299 | significand.
3300 *----------------------------------------------------------------------------*/
3301 
3302 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
3303 {
3304 
3305     return make_float64(
3306         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
3307 
3308 }
3309 
3310 /*----------------------------------------------------------------------------
3311 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3312 | and significand `zSig', and returns the proper double-precision floating-
3313 | point value corresponding to the abstract input.  Ordinarily, the abstract
3314 | value is simply rounded and packed into the double-precision format, with
3315 | the inexact exception raised if the abstract input cannot be represented
3316 | exactly.  However, if the abstract value is too large, the overflow and
3317 | inexact exceptions are raised and an infinity or maximal finite value is
3318 | returned.  If the abstract value is too small, the input value is rounded to
3319 | a subnormal number, and the underflow and inexact exceptions are raised if
3320 | the abstract input cannot be represented exactly as a subnormal double-
3321 | precision floating-point number.
3322 |     The input significand `zSig' has its binary point between bits 62
3323 | and 61, which is 10 bits to the left of the usual location.  This shifted
3324 | significand must be normalized or smaller.  If `zSig' is not normalized,
3325 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3326 | and it must not require rounding.  In the usual case that `zSig' is
3327 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3328 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3329 | Binary Floating-Point Arithmetic.
3330 *----------------------------------------------------------------------------*/
3331 
3332 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3333                                    float_status *status)
3334 {
         /*
          * Rounds zSig according to status->float_rounding_mode and packs the
          * result together with zSign and zExp via packFloat64().  The low 10
          * bits of zSig (mask 0x3FF) are the bits that get rounded away; 0x200
          * is exactly half of the lowest kept bit (0x400).
          */
3335     int8_t roundingMode;
3336     flag roundNearestEven;
3337     int roundIncrement, roundBits;
3338     flag isTiny;
3339 
3340     roundingMode = status->float_rounding_mode;
3341     roundNearestEven = ( roundingMode == float_round_nearest_even );
         /*
          * roundIncrement is added to zSig before the low 10 bits are dropped:
          * half a unit for the round-to-nearest modes, zero to truncate, and
          * 0x3ff (just under one unit) to round away from zero whenever any
          * discarded bit is set.
          */
3342     switch (roundingMode) {
3343     case float_round_nearest_even:
3344     case float_round_ties_away:
3345         roundIncrement = 0x200;
3346         break;
3347     case float_round_to_zero:
3348         roundIncrement = 0;
3349         break;
3350     case float_round_up:
3351         roundIncrement = zSign ? 0 : 0x3ff;
3352         break;
3353     case float_round_down:
3354         roundIncrement = zSign ? 0x3ff : 0;
3355         break;
3356     case float_round_to_odd:
             /*
              * Lowest kept bit (0x400) already set: truncate.  Otherwise add
              * 0x3ff so that any nonzero discarded bits carry into it.
              */
3357         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3358         break;
3359     default:
3360         abort();
3361     }
3362     roundBits = zSig & 0x3FF;
         /*
          * The uint16_t cast makes a small negative zExp wrap to a large value,
          * so this single test selects both the overflow candidates
          * (zExp >= 0x7FD) and the underflow candidates (zExp < 0).
          */
3363     if ( 0x7FD <= (uint16_t) zExp ) {
             /*
              * Overflow: exponent too large, or exactly 0x7FD with the rounding
              * addition carrying into bit 63 of the significand.
              */
3364         if (    ( 0x7FD < zExp )
3365              || (    ( zExp == 0x7FD )
3366                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
3367            ) {
3368             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3369                                    roundIncrement != 0;
3370             float_raise(float_flag_overflow | float_flag_inexact, status);
                 /*
                  * -(!overflow_to_inf) passes 0 as the significand when the
                  * result overflows to infinity and all ones otherwise.
                  * NOTE(review): packFloat64 is defined outside this chunk;
                  * presumably it sums its fields so that the all-ones
                  * significand yields the largest finite value - confirm.
                  */
3371             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
3372         }
3373         if ( zExp < 0 ) {
                 /*
                  * With flush-to-zero enabled, return a signed zero and raise
                  * float_flag_output_denormal instead of building a subnormal.
                  */
3374             if (status->flush_to_zero) {
3375                 float_raise(float_flag_output_denormal, status);
3376                 return packFloat64(zSign, 0, 0);
3377             }
                 /*
                  * The value is treated as tiny when tininess is detected
                  * before rounding, when zExp < -1, or when adding
                  * roundIncrement does not carry into bit 63.
                  */
3378             isTiny =
3379                    (status->float_detect_tininess
3380                     == float_tininess_before_rounding)
3381                 || ( zExp < -1 )
3382                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
                 /*
                  * Denormalize by shifting right -zExp places, then recompute
                  * the rounding bits from the shifted significand.
                  * NOTE(review): shift64RightJamming is defined elsewhere; its
                  * name suggests shifted-out bits are ORed into the low bit of
                  * the result - confirm.
                  */
3383             shift64RightJamming( zSig, - zExp, &zSig );
3384             zExp = 0;
3385             roundBits = zSig & 0x3FF;
                 /* Underflow is raised only if the tiny result is also inexact. */
3386             if (isTiny && roundBits) {
3387                 float_raise(float_flag_underflow, status);
3388             }
3389             if (roundingMode == float_round_to_odd) {
3390                 /*
3391                  * For round-to-odd case, the roundIncrement depends on
3392                  * zSig which just changed.
3393                  */
3394                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3395             }
3396         }
3397     }
         /* Any nonzero discarded bit makes the result inexact. */
3398     if (roundBits) {
3399         status->float_exception_flags |= float_flag_inexact;
3400     }
         /* Apply the rounding increment and drop the 10 rounding bits. */
3401     zSig = ( zSig + roundIncrement )>>10;
         /* Exact tie (roundBits == 0x200) in nearest-even mode: clear bit 0. */
3402     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
         /* A zero significand is packed with a zero exponent field. */
3403     if ( zSig == 0 ) zExp = 0;
3404     return packFloat64( zSign, zExp, zSig );
3405 
3406 }
3407 
3408 /*----------------------------------------------------------------------------
3409 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3410 | and significand `zSig', and returns the proper double-precision floating-
3411 | point value corresponding to the abstract input.  This routine is just like
3412 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3413 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3414 | floating-point exponent.
3415 *----------------------------------------------------------------------------*/
3416 
3417 static float64
3418  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3419                               float_status *status)
3420 {
3421     int8_t shiftCount;
3422 
3423     shiftCount = clz64(zSig) - 1;
3424     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
3425                                status);
3426 
3427 }
3428 
/*----------------------------------------------------------------------------
| Normalizes the denormalized significand `aSig' of a subnormal extended
| double-precision value.  The significand, shifted left by its count of
| leading zero bits, is written to `*zSigPtr', and the matching exponent
| (1 minus that count) is written to `*zExpPtr'.
*----------------------------------------------------------------------------*/

void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    const int8_t leadingZeros = clz64(aSig);

    *zExpPtr = 1 - leadingZeros;
    *zSigPtr = aSig << leadingZeros;
}
3445 
3446 /*----------------------------------------------------------------------------
3447 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3448 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3449 | and returns the proper extended double-precision floating-point value
3450 | corresponding to the abstract input.  Ordinarily, the abstract value is
3451 | rounded and packed into the extended double-precision format, with the
3452 | inexact exception raised if the abstract input cannot be represented
3453 | exactly.  However, if the abstract value is too large, the overflow and
3454 | inexact exceptions are raised and an infinity or maximal finite value is
3455 | returned.  If the abstract value is too small, the input value is rounded to
3456 | a subnormal number, and the underflow and inexact exceptions are raised if
3457 | the abstract input cannot be represented exactly as a subnormal extended
3458 | double-precision floating-point number.
3459 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
3460 | number of bits as single or double precision, respectively.  Otherwise, the
3461 | result is rounded to the full precision of the extended double-precision
3462 | format.
3463 |     The input significand must be normalized or smaller.  If the input
3464 | significand is not normalized, `zExp' must be 0; in that case, the result
3465 | returned is a subnormal number, and it must not require rounding.  The
3466 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3467 | Floating-Point Arithmetic.
3468 *----------------------------------------------------------------------------*/
3469 
3470 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
3471                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
3472                               float_status *status)
3473 {
         /*
          * Two code paths share this function.  For roundingPrecision 64 or
          * 32 the rounding is done inside zSig0 using roundMask (the bits to
          * discard) and roundIncrement (initially half of the lowest kept
          * bit).  Every other roundingPrecision value jumps to `precision80',
          * where zSig0 is kept whole and zSig1 holds the bits rounded away.
          * Both paths share the `overflow' label.
          */
3474     int8_t roundingMode;
3475     flag roundNearestEven, increment, isTiny;
3476     int64_t roundIncrement, roundMask, roundBits;
3477 
3478     roundingMode = status->float_rounding_mode;
3479     roundNearestEven = ( roundingMode == float_round_nearest_even );
3480     if ( roundingPrecision == 80 ) goto precision80;
3481     if ( roundingPrecision == 64 ) {
             /* Low 11 bits of zSig0 are discarded; 0x400 is half a unit. */
3482         roundIncrement = LIT64( 0x0000000000000400 );
3483         roundMask = LIT64( 0x00000000000007FF );
3484     }
3485     else if ( roundingPrecision == 32 ) {
             /* Low 40 bits of zSig0 are discarded; bit 39 is half a unit. */
3486         roundIncrement = LIT64( 0x0000008000000000 );
3487         roundMask = LIT64( 0x000000FFFFFFFFFF );
3488     }
3489     else {
3490         goto precision80;
3491     }
         /* Fold any nonzero zSig1 into bit 0 of zSig0 as a sticky bit. */
3492     zSig0 |= ( zSig1 != 0 );
         /*
          * Nearest modes keep the half-unit increment chosen above; directed
          * modes use either 0 or roundMask.  float_round_to_odd has no case
          * here and therefore reaches abort().
          */
3493     switch (roundingMode) {
3494     case float_round_nearest_even:
3495     case float_round_ties_away:
3496         break;
3497     case float_round_to_zero:
3498         roundIncrement = 0;
3499         break;
3500     case float_round_up:
3501         roundIncrement = zSign ? 0 : roundMask;
3502         break;
3503     case float_round_down:
3504         roundIncrement = zSign ? roundMask : 0;
3505         break;
3506     default:
3507         abort();
3508     }
3509     roundBits = zSig0 & roundMask;
         /*
          * Unsigned compare on zExp - 1: true when zExp <= 0 (the subtraction
          * wraps to a large value) or when zExp >= 0x7FFE.
          */
3510     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
             /*
              * Overflow: exponent too large, or exactly 0x7FFE with the
              * rounding addition wrapping past 64 bits.
              */
3511         if (    ( 0x7FFE < zExp )
3512              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
3513            ) {
3514             goto overflow;
3515         }
3516         if ( zExp <= 0 ) {
                 /*
                  * With flush-to-zero enabled, return a signed zero and raise
                  * float_flag_output_denormal instead of building a subnormal.
                  */
3517             if (status->flush_to_zero) {
3518                 float_raise(float_flag_output_denormal, status);
3519                 return packFloatx80(zSign, 0, 0);
3520             }
                 /*
                  * Tiny when tininess is detected before rounding, when
                  * zExp < 0, or when adding roundIncrement does not wrap
                  * zSig0 past 64 bits.
                  */
3521             isTiny =
3522                    (status->float_detect_tininess
3523                     == float_tininess_before_rounding)
3524                 || ( zExp < 0 )
3525                 || ( zSig0 <= zSig0 + roundIncrement );
                 /*
                  * Denormalize by 1 - zExp places and recompute the rounding
                  * bits.  NOTE(review): shift64RightJamming is defined
                  * elsewhere; its name suggests shifted-out bits are ORed into
                  * the low bit of the result - confirm.
                  */
3526             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
3527             zExp = 0;
3528             roundBits = zSig0 & roundMask;
3529             if (isTiny && roundBits) {
3530                 float_raise(float_flag_underflow, status);
3531             }
3532             if (roundBits) {
3533                 status->float_exception_flags |= float_flag_inexact;
3534             }
3535             zSig0 += roundIncrement;
                 /* Rounding carried into bit 63: the exponent becomes 1. */
3536             if ( (int64_t) zSig0 < 0 ) zExp = 1;
                 /*
                  * roundIncrement now holds the lowest kept bit.  On an exact
                  * tie in nearest-even mode that bit is added to the mask so
                  * it is cleared together with the discarded bits.
                  */
3537             roundIncrement = roundMask + 1;
3538             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3539                 roundMask |= roundIncrement;
3540             }
3541             zSig0 &= ~ roundMask;
3542             return packFloatx80( zSign, zExp, zSig0 );
3543         }
3544     }
3545     if (roundBits) {
3546         status->float_exception_flags |= float_flag_inexact;
3547     }
3548     zSig0 += roundIncrement;
         /*
          * The addition wrapped past 64 bits: bump the exponent and restart
          * the significand at 0x8000000000000000.
          */
3549     if ( zSig0 < roundIncrement ) {
3550         ++zExp;
3551         zSig0 = LIT64( 0x8000000000000000 );
3552     }
         /* Same tie handling as in the subnormal branch above. */
3553     roundIncrement = roundMask + 1;
3554     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3555         roundMask |= roundIncrement;
3556     }
3557     zSig0 &= ~ roundMask;
3558     if ( zSig0 == 0 ) zExp = 0;
3559     return packFloatx80( zSign, zExp, zSig0 );
3560  precision80:
         /*
          * Full-precision path: `increment' says whether zSig0 is bumped by
          * one.  Nearest modes look at the top bit of zSig1; directed modes
          * increment when zSig1 is nonzero and the sign matches the
          * direction.  float_round_to_odd again falls into abort().
          */
3561     switch (roundingMode) {
3562     case float_round_nearest_even:
3563     case float_round_ties_away:
3564         increment = ((int64_t)zSig1 < 0);
3565         break;
3566     case float_round_to_zero:
3567         increment = 0;
3568         break;
3569     case float_round_up:
3570         increment = !zSign && zSig1;
3571         break;
3572     case float_round_down:
3573         increment = zSign && zSig1;
3574         break;
3575     default:
3576         abort();
3577     }
3578     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3579         if (    ( 0x7FFE < zExp )
3580              || (    ( zExp == 0x7FFE )
3581                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
3582                   && increment
3583                 )
3584            ) {
                 /*
                  * roundMask = 0 makes ~roundMask below all ones for this
                  * path.  When `overflow' is entered by goto from the
                  * reduced-precision path, roundMask still holds that path's
                  * mask, so the discarded low bits stay clear in the value
                  * returned.
                  */
3585             roundMask = 0;
3586  overflow:
3587             float_raise(float_flag_overflow | float_flag_inexact, status);
                 /*
                  * Modes that round toward zero for this sign return exponent
                  * 0x7FFE with significand ~roundMask; all others return
                  * infinity.
                  */
3588             if (    ( roundingMode == float_round_to_zero )
3589                  || ( zSign && ( roundingMode == float_round_up ) )
3590                  || ( ! zSign && ( roundingMode == float_round_down ) )
3591                ) {
3592                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
3593             }
3594             return packFloatx80(zSign,
3595                                 floatx80_infinity_high,
3596                                 floatx80_infinity_low);
3597         }
3598         if ( zExp <= 0 ) {
                 /*
                  * NOTE(review): unlike the reduced-precision branch above,
                  * this branch does not test status->flush_to_zero - confirm
                  * that this is intended.
                  */
3599             isTiny =
3600                    (status->float_detect_tininess
3601                     == float_tininess_before_rounding)
3602                 || ( zExp < 0 )
3603                 || ! increment
3604                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
                 /*
                  * Denormalize the zSig0:zSig1 pair by 1 - zExp places.
                  * shift64ExtraRightJamming is defined elsewhere.
                  */
3605             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
3606             zExp = 0;
3607             if (isTiny && zSig1) {
3608                 float_raise(float_flag_underflow, status);
3609             }
3610             if (zSig1) {
3611                 status->float_exception_flags |= float_flag_inexact;
3612             }
                 /* zSig1 changed, so the increment decision is recomputed. */
3613             switch (roundingMode) {
3614             case float_round_nearest_even:
3615             case float_round_ties_away:
3616                 increment = ((int64_t)zSig1 < 0);
3617                 break;
3618             case float_round_to_zero:
3619                 increment = 0;
3620                 break;
3621             case float_round_up:
3622                 increment = !zSign && zSig1;
3623                 break;
3624             case float_round_down:
3625                 increment = zSign && zSig1;
3626                 break;
3627             default:
3628                 abort();
3629             }
3630             if ( increment ) {
3631                 ++zSig0;
                     /*
                      * zSig1<<1 == 0 means no bit below the top bit of zSig1 is
                      * set; in nearest-even mode that is an exact tie, so bit 0
                      * of zSig0 is cleared.
                      */
3632                 zSig0 &=
3633                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
                     /* Rounding carried into bit 63: the exponent becomes 1. */
3634                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
3635             }
3636             return packFloatx80( zSign, zExp, zSig0 );
3637         }
3638     }
3639     if (zSig1) {
3640         status->float_exception_flags |= float_flag_inexact;
3641     }
3642     if ( increment ) {
3643         ++zSig0;
             /*
              * zSig0 wrapped to zero: bump the exponent and restart the
              * significand at 0x8000000000000000.  Otherwise apply the
              * nearest-even tie fix-up.
              */
3644         if ( zSig0 == 0 ) {
3645             ++zExp;
3646             zSig0 = LIT64( 0x8000000000000000 );
3647         }
3648         else {
3649             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3650         }
3651     }
3652     else {
             /* A zero significand is packed with a zero exponent field. */
3653         if ( zSig0 == 0 ) zExp = 0;
3654     }
3655     return packFloatx80( zSign, zExp, zSig0 );
3656 
3657 }
3658 
3659 /*----------------------------------------------------------------------------
3660 | Takes an abstract floating-point value having sign `zSign', exponent
3661 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
3662 | and returns the proper extended double-precision floating-point value
3663 | corresponding to the abstract input.  This routine is just like
3664 | `roundAndPackFloatx80' except that the input significand does not have to be
3665 | normalized.
3666 *----------------------------------------------------------------------------*/
3667 
3668 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
3669                                        flag zSign, int32_t zExp,
3670                                        uint64_t zSig0, uint64_t zSig1,
3671                                        float_status *status)
3672 {
3673     int8_t shiftCount;
3674 
3675     if ( zSig0 == 0 ) {
3676         zSig0 = zSig1;
3677         zSig1 = 0;
3678         zExp -= 64;
3679     }
3680     shiftCount = clz64(zSig0);
3681     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3682     zExp -= shiftCount;
3683     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
3684                                 zSig0, zSig1, status);
3685 
3686 }
3687 
3688 /*----------------------------------------------------------------------------
3689 | Returns the least-significant 64 fraction bits of the quadruple-precision
3690 | floating-point value `a'.
3691 *----------------------------------------------------------------------------*/
3692 
3693 static inline uint64_t extractFloat128Frac1( float128 a )
3694 {
3695 
3696     return a.low;
3697 
3698 }
3699 
3700 /*----------------------------------------------------------------------------
3701 | Returns the most-significant 48 fraction bits of the quadruple-precision
3702 | floating-point value `a'.
3703 *----------------------------------------------------------------------------*/
3704 
3705 static inline uint64_t extractFloat128Frac0( float128 a )
3706 {
3707 
3708     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
3709 
3710 }
3711 
3712 /*----------------------------------------------------------------------------
3713 | Returns the exponent bits of the quadruple-precision floating-point value
3714 | `a'.
3715 *----------------------------------------------------------------------------*/
3716 
3717 static inline int32_t extractFloat128Exp( float128 a )
3718 {
3719 
3720     return ( a.high>>48 ) & 0x7FFF;
3721 
3722 }
3723 
3724 /*----------------------------------------------------------------------------
3725 | Returns the sign bit of the quadruple-precision floating-point value `a'.
3726 *----------------------------------------------------------------------------*/
3727 
3728 static inline flag extractFloat128Sign( float128 a )
3729 {
3730 
3731     return a.high>>63;
3732 
3733 }
3734 
/*----------------------------------------------------------------------------
| Normalizes the subnormal quadruple-precision significand formed by the
| concatenation of `aSig0' and `aSig1'.  The significand is shifted so that
| its leading one lands in bit 48 of the upper word.  The upper 49 bits of
| the result go to `*zSig0Ptr', the lower 64 bits go to `*zSig1Ptr', and the
| exponent that matches the shift goes to `*zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t shift;

    if (aSig0 != 0) {
        /* Leading one is in the upper word: a plain 128-bit left shift. */
        shift = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shift;
        return;
    }

    /* Upper word is empty: everything comes from the lower word. */
    shift = clz64(aSig1) - 15;
    if (shift >= 0) {
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    } else {
        /* The lower word straddles both output words. */
        *zSig0Ptr = aSig1 >> (-shift);
        *zSig1Ptr = aSig1 << (shift & 63);
    }
    *zExpPtr = -shift - 63;
}
3775 
3776 /*----------------------------------------------------------------------------
3777 | Packs the sign `zSign', the exponent `zExp', and the significand formed
3778 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
3779 | floating-point value, returning the result.  After being shifted into the
3780 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
3781 | added together to form the most significant 64 bits of the result.  This
3782 | means that any integer portion of `zSig0' will be added into the exponent.
3783 | Since a properly normalized significand will have an integer portion equal
3784 | to 1, the `zExp' input should be 1 less than the desired result exponent
3785 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
3786 | significand.
3787 *----------------------------------------------------------------------------*/
3788 
3789 static inline float128
3790  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
3791 {
3792     float128 z;
3793 
3794     z.low = zSig1;
3795     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
3796     return z;
3797 
3798 }
3799 
3800 /*----------------------------------------------------------------------------
3801 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3802 | and extended significand formed by the concatenation of `zSig0', `zSig1',
3803 | and `zSig2', and returns the proper quadruple-precision floating-point value
3804 | corresponding to the abstract input.  Ordinarily, the abstract value is
3805 | simply rounded and packed into the quadruple-precision format, with the
3806 | inexact exception raised if the abstract input cannot be represented
3807 | exactly.  However, if the abstract value is too large, the overflow and
3808 | inexact exceptions are raised and an infinity or maximal finite value is
3809 | returned.  If the abstract value is too small, the input value is rounded to
3810 | a subnormal number, and the underflow and inexact exceptions are raised if
3811 | the abstract input cannot be represented exactly as a subnormal quadruple-
3812 | precision floating-point number.
3813 |     The input significand must be normalized or smaller.  If the input
3814 | significand is not normalized, `zExp' must be 0; in that case, the result
3815 | returned is a subnormal number, and it must not require rounding.  In the
3816 | usual case that the input significand is normalized, `zExp' must be 1 less
3817 | than the ``true'' floating-point exponent.  The handling of underflow and
3818 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3819 *----------------------------------------------------------------------------*/
3820 
3821 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
3822                                      uint64_t zSig0, uint64_t zSig1,
3823                                      uint64_t zSig2, float_status *status)
3824 {
         /*
          * zSig0:zSig1 is the significand that is kept; zSig2 holds the bits
          * being rounded away.  `increment' records whether the kept
          * significand is bumped by one unit in its last place.
          */
3825     int8_t roundingMode;
3826     flag roundNearestEven, increment, isTiny;
3827 
3828     roundingMode = status->float_rounding_mode;
3829     roundNearestEven = ( roundingMode == float_round_nearest_even );
         /*
          * Nearest modes increment when the top bit of zSig2 is set; directed
          * modes increment when zSig2 is nonzero and the sign matches the
          * direction; round-to-odd increments when zSig2 is nonzero and bit 0
          * of zSig1 is still clear.
          */
3830     switch (roundingMode) {
3831     case float_round_nearest_even:
3832     case float_round_ties_away:
3833         increment = ((int64_t)zSig2 < 0);
3834         break;
3835     case float_round_to_zero:
3836         increment = 0;
3837         break;
3838     case float_round_up:
3839         increment = !zSign && zSig2;
3840         break;
3841     case float_round_down:
3842         increment = zSign && zSig2;
3843         break;
3844     case float_round_to_odd:
3845         increment = !(zSig1 & 0x1) && zSig2;
3846         break;
3847     default:
3848         abort();
3849     }
         /*
          * Unsigned compare: a negative zExp wraps to a large value, so this
          * test selects both zExp >= 0x7FFD and zExp < 0.
          */
3850     if ( 0x7FFD <= (uint32_t) zExp ) {
             /*
              * Overflow: exponent too large, or exactly 0x7FFD with the
              * significand at 0x0001FFFFFFFFFFFF:FFFFFFFFFFFFFFFF and an
              * increment pending.
              */
3851         if (    ( 0x7FFD < zExp )
3852              || (    ( zExp == 0x7FFD )
3853                   && eq128(
3854                          LIT64( 0x0001FFFFFFFFFFFF ),
3855                          LIT64( 0xFFFFFFFFFFFFFFFF ),
3856                          zSig0,
3857                          zSig1
3858                      )
3859                   && increment
3860                 )
3861            ) {
3862             float_raise(float_flag_overflow | float_flag_inexact, status);
                 /*
                  * Modes that round toward zero for this sign, and
                  * round-to-odd, return exponent 0x7FFE with an all-ones
                  * fraction; every other mode returns infinity (exponent
                  * 0x7FFF, zero fraction).
                  */
3863             if (    ( roundingMode == float_round_to_zero )
3864                  || ( zSign && ( roundingMode == float_round_up ) )
3865                  || ( ! zSign && ( roundingMode == float_round_down ) )
3866                  || (roundingMode == float_round_to_odd)
3867                ) {
3868                 return
3869                     packFloat128(
3870                         zSign,
3871                         0x7FFE,
3872                         LIT64( 0x0000FFFFFFFFFFFF ),
3873                         LIT64( 0xFFFFFFFFFFFFFFFF )
3874                     );
3875             }
3876             return packFloat128( zSign, 0x7FFF, 0, 0 );
3877         }
3878         if ( zExp < 0 ) {
                 /*
                  * With flush-to-zero enabled, return a signed zero and raise
                  * float_flag_output_denormal instead of building a subnormal.
                  */
3879             if (status->flush_to_zero) {
3880                 float_raise(float_flag_output_denormal, status);
3881                 return packFloat128(zSign, 0, 0, 0);
3882             }
                 /*
                  * Tiny when tininess is detected before rounding, when
                  * zExp < -1, when no increment is pending, or when the
                  * significand is below 0x0001FFFFFFFFFFFF:FFFFFFFFFFFFFFFF.
                  */
3883             isTiny =
3884                    (status->float_detect_tininess
3885                     == float_tininess_before_rounding)
3886                 || ( zExp < -1 )
3887                 || ! increment
3888                 || lt128(
3889                        zSig0,
3890                        zSig1,
3891                        LIT64( 0x0001FFFFFFFFFFFF ),
3892                        LIT64( 0xFFFFFFFFFFFFFFFF )
3893                    );
                 /*
                  * Denormalize the three-word significand by -zExp places.
                  * shift128ExtraRightJamming is defined elsewhere.
                  */
3894             shift128ExtraRightJamming(
3895                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
3896             zExp = 0;
                 /* Underflow is raised only if the tiny result is also inexact. */
3897             if (isTiny && zSig2) {
3898                 float_raise(float_flag_underflow, status);
3899             }
                 /* The significand changed, so `increment' is recomputed. */
3900             switch (roundingMode) {
3901             case float_round_nearest_even:
3902             case float_round_ties_away:
3903                 increment = ((int64_t)zSig2 < 0);
3904                 break;
3905             case float_round_to_zero:
3906                 increment = 0;
3907                 break;
3908             case float_round_up:
3909                 increment = !zSign && zSig2;
3910                 break;
3911             case float_round_down:
3912                 increment = zSign && zSig2;
3913                 break;
3914             case float_round_to_odd:
3915                 increment = !(zSig1 & 0x1) && zSig2;
3916                 break;
3917             default:
3918                 abort();
3919             }
3920         }
3921     }
         /* Any nonzero discarded bit makes the result inexact. */
3922     if (zSig2) {
3923         status->float_exception_flags |= float_flag_inexact;
3924     }
3925     if ( increment ) {
             /* Add one to the 128-bit zSig0:zSig1 pair. */
3926         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
             /*
              * zSig2 + zSig2 == 0 means no bit below the top bit of zSig2 is
              * set; in nearest-even mode that is an exact tie, so bit 0 of
              * zSig1 is cleared.
              */
3927         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
3928     }
3929     else {
             /* A zero significand is packed with a zero exponent field. */
3930         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
3931     }
3932     return packFloat128( zSign, zExp, zSig0, zSig1 );
3933 
3934 }
3935 
3936 /*----------------------------------------------------------------------------
3937 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3938 | and significand formed by the concatenation of `zSig0' and `zSig1', and
3939 | returns the proper quadruple-precision floating-point value corresponding
3940 | to the abstract input.  This routine is just like `roundAndPackFloat128'
3941 | except that the input significand has fewer bits and does not have to be
3942 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
3943 | point exponent.
3944 *----------------------------------------------------------------------------*/
3945 
3946 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3947                                               uint64_t zSig0, uint64_t zSig1,
3948                                               float_status *status)
3949 {
3950     int8_t shiftCount;
3951     uint64_t zSig2;
3952 
3953     if ( zSig0 == 0 ) {
3954         zSig0 = zSig1;
3955         zSig1 = 0;
3956         zExp -= 64;
3957     }
3958     shiftCount = clz64(zSig0) - 15;
3959     if ( 0 <= shiftCount ) {
3960         zSig2 = 0;
3961         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3962     }
3963     else {
3964         shift128ExtraRightJamming(
3965             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3966     }
3967     zExp -= shiftCount;
3968     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3969 
3970 }
3971 
3972 
3973 /*----------------------------------------------------------------------------
3974 | Returns the result of converting the 32-bit two's complement integer `a'
3975 | to the extended double-precision floating-point format.  The conversion
3976 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3977 | Arithmetic.
3978 *----------------------------------------------------------------------------*/
3979 
3980 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3981 {
3982     flag zSign;
3983     uint32_t absA;
3984     int8_t shiftCount;
3985     uint64_t zSig;
3986 
3987     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3988     zSign = ( a < 0 );
3989     absA = zSign ? - a : a;
3990     shiftCount = clz32(absA) + 32;
3991     zSig = absA;
3992     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3993 
3994 }
3995 
3996 /*----------------------------------------------------------------------------
3997 | Returns the result of converting the 32-bit two's complement integer `a' to
3998 | the quadruple-precision floating-point format.  The conversion is performed
3999 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4000 *----------------------------------------------------------------------------*/
4001 
4002 float128 int32_to_float128(int32_t a, float_status *status)
4003 {
4004     flag zSign;
4005     uint32_t absA;
4006     int8_t shiftCount;
4007     uint64_t zSig0;
4008 
4009     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4010     zSign = ( a < 0 );
4011     absA = zSign ? - a : a;
4012     shiftCount = clz32(absA) + 17;
4013     zSig0 = absA;
4014     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4015 
4016 }
4017 
4018 /*----------------------------------------------------------------------------
4019 | Returns the result of converting the 64-bit two's complement integer `a'
4020 | to the extended double-precision floating-point format.  The conversion
4021 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4022 | Arithmetic.
4023 *----------------------------------------------------------------------------*/
4024 
4025 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4026 {
4027     flag zSign;
4028     uint64_t absA;
4029     int8_t shiftCount;
4030 
4031     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4032     zSign = ( a < 0 );
4033     absA = zSign ? - a : a;
4034     shiftCount = clz64(absA);
4035     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4036 
4037 }
4038 
4039 /*----------------------------------------------------------------------------
4040 | Returns the result of converting the 64-bit two's complement integer `a' to
4041 | the quadruple-precision floating-point format.  The conversion is performed
4042 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4043 *----------------------------------------------------------------------------*/
4044 
4045 float128 int64_to_float128(int64_t a, float_status *status)
4046 {
4047     flag zSign;
4048     uint64_t absA;
4049     int8_t shiftCount;
4050     int32_t zExp;
4051     uint64_t zSig0, zSig1;
4052 
4053     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4054     zSign = ( a < 0 );
4055     absA = zSign ? - a : a;
4056     shiftCount = clz64(absA) + 49;
4057     zExp = 0x406E - shiftCount;
4058     if ( 64 <= shiftCount ) {
4059         zSig1 = 0;
4060         zSig0 = absA;
4061         shiftCount -= 64;
4062     }
4063     else {
4064         zSig1 = absA;
4065         zSig0 = 0;
4066     }
4067     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4068     return packFloat128( zSign, zExp, zSig0, zSig1 );
4069 
4070 }
4071 
4072 /*----------------------------------------------------------------------------
4073 | Returns the result of converting the 64-bit unsigned integer `a'
4074 | to the quadruple-precision floating-point format.  The conversion is performed
4075 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4076 *----------------------------------------------------------------------------*/
4077 
4078 float128 uint64_to_float128(uint64_t a, float_status *status)
4079 {
4080     if (a == 0) {
4081         return float128_zero;
4082     }
4083     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4084 }
4085 
4086 /*----------------------------------------------------------------------------
4087 | Returns the result of converting the single-precision floating-point value
4088 | `a' to the extended double-precision floating-point format.  The conversion
4089 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4090 | Arithmetic.
4091 *----------------------------------------------------------------------------*/
4092 
4093 floatx80 float32_to_floatx80(float32 a, float_status *status)
4094 {
4095     flag aSign;
4096     int aExp;
4097     uint32_t aSig;
4098 
4099     a = float32_squash_input_denormal(a, status);
4100     aSig = extractFloat32Frac( a );
4101     aExp = extractFloat32Exp( a );
4102     aSign = extractFloat32Sign( a );
4103     if ( aExp == 0xFF ) {
4104         if (aSig) {
4105             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
4106         }
4107         return packFloatx80(aSign,
4108                             floatx80_infinity_high,
4109                             floatx80_infinity_low);
4110     }
4111     if ( aExp == 0 ) {
4112         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4113         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4114     }
4115     aSig |= 0x00800000;
4116     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4117 
4118 }
4119 
4120 /*----------------------------------------------------------------------------
4121 | Returns the result of converting the single-precision floating-point value
4122 | `a' to the quadruple-precision floating-point format.  The conversion is
4123 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4124 | Arithmetic.
4125 *----------------------------------------------------------------------------*/
4126 
4127 float128 float32_to_float128(float32 a, float_status *status)
4128 {
4129     flag aSign;
4130     int aExp;
4131     uint32_t aSig;
4132 
4133     a = float32_squash_input_denormal(a, status);
4134     aSig = extractFloat32Frac( a );
4135     aExp = extractFloat32Exp( a );
4136     aSign = extractFloat32Sign( a );
4137     if ( aExp == 0xFF ) {
4138         if (aSig) {
4139             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4140         }
4141         return packFloat128( aSign, 0x7FFF, 0, 0 );
4142     }
4143     if ( aExp == 0 ) {
4144         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4145         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4146         --aExp;
4147     }
4148     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4149 
4150 }
4151 
/*----------------------------------------------------------------------------
| Returns the remainder of the single-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases handled before the reduction:
|   - either operand NaN            -> propagateFloat32NaN(a, b)
|   - `a' infinite, or `b' zero     -> invalid exception, default NaN
|   - `b' infinite, or `a' zero     -> `a' is returned unchanged
| Both inputs are first passed through float32_squash_input_denormal.
*----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    flag aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    /* Split both operands into fields; the sign of `b' is never needed. */
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    if ( aExp == 0xFF ) {
        /* `a' is NaN or infinity. */
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        /* `a' is infinite and `b' is not a NaN: invalid operation. */
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        /* `b' is infinite and `a' is finite: the remainder is `a' itself. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* Remainder with respect to zero: invalid operation. */
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        /* A zero dividend is returned as-is (sign preserved). */
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the implicit leading 1 of both significands explicit (bit 23). */
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent gap: the whole reduction fits in 32/64-bit integer
         * arithmetic.  Significands are moved up so the leading 1 is bit 31. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* |a| < |b|/2, so `a' already is the remainder. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        /* First quotient bit: can bSig be subtracted once? */
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            /* Full 64-by-32 division, then keep only the top expDiff
             * quotient bits and form the partial remainder from them. */
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            /* No further quotient bits; just bring both values to the
             * scale expected by the correction loop below. */
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large exponent gap: reduce in 64-bit chunks, 62 exponent steps per
         * iteration, using the estimated 128-by-64 division helper. */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            /* Lower the estimate by 2, clamped at 0.
             * NOTE(review): presumably because the estimate may overshoot
             * the true quotient slightly -- confirm against the helper. */
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        /* Final partial step: keep only the top expDiff quotient bits. */
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* Keep subtracting bSig (counting each subtraction in q) until the
     * value goes negative; alternateASig holds the last non-negative value. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    /* Pick whichever candidate is nearer to zero; on an exact tie
     * (sigMean == 0) the parity of q decides. */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* A negative candidate flips the result sign relative to `a'. */
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}
4253 
4254 
4255 
/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'. The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1. -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2. -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
|
| The table below holds the series coefficients 1/n! for n = 1 .. 15 as
| double-precision bit patterns; entry [n - 1] multiplies the n-th power of
| the argument.
*----------------------------------------------------------------------------*/

static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /* 1/1!  */
    const_float64( 0x3fe0000000000000ll ), /* 1/2!  */
    const_float64( 0x3fc5555555555555ll ), /* 1/3!  */
    const_float64( 0x3fa5555555555555ll ), /* 1/4!  */
    const_float64( 0x3f81111111111111ll ), /* 1/5!  */
    const_float64( 0x3f56c16c16c16c17ll ), /* 1/6!  */
    const_float64( 0x3f2a01a01a01a01all ), /* 1/7!  */
    const_float64( 0x3efa01a01a01a01all ), /* 1/8!  */
    const_float64( 0x3ec71de3a556c734ll ), /* 1/9!  */
    const_float64( 0x3e927e4fb7789f5cll ), /* 1/10! */
    const_float64( 0x3e5ae64567f544e4ll ), /* 1/11! */
    const_float64( 0x3e21eed8eff8d898ll ), /* 1/12! */
    const_float64( 0x3de6124613a86d09ll ), /* 1/13! */
    const_float64( 0x3da93974a8c07c9dll ), /* 1/14! */
    const_float64( 0x3d6ae7f3e733b81fll ), /* 1/15! */
};
4292 
4293 float32 float32_exp2(float32 a, float_status *status)
4294 {
4295     flag aSign;
4296     int aExp;
4297     uint32_t aSig;
4298     float64 r, x, xn;
4299     int i;
4300     a = float32_squash_input_denormal(a, status);
4301 
4302     aSig = extractFloat32Frac( a );
4303     aExp = extractFloat32Exp( a );
4304     aSign = extractFloat32Sign( a );
4305 
4306     if ( aExp == 0xFF) {
4307         if (aSig) {
4308             return propagateFloat32NaN(a, float32_zero, status);
4309         }
4310         return (aSign) ? float32_zero : a;
4311     }
4312     if (aExp == 0) {
4313         if (aSig == 0) return float32_one;
4314     }
4315 
4316     float_raise(float_flag_inexact, status);
4317 
4318     /* ******************************* */
4319     /* using float64 for approximation */
4320     /* ******************************* */
4321     x = float32_to_float64(a, status);
4322     x = float64_mul(x, float64_ln2, status);
4323 
4324     xn = x;
4325     r = float64_one;
4326     for (i = 0 ; i < 15 ; i++) {
4327         float64 f;
4328 
4329         f = float64_mul(xn, float32_exp2_coefficients[i], status);
4330         r = float64_add(r, f, status);
4331 
4332         xn = float64_mul(xn, x, status);
4333     }
4334 
4335     return float64_to_float32(r, status);
4336 }
4337 
4338 /*----------------------------------------------------------------------------
4339 | Returns the binary log of the single-precision floating-point value `a'.
4340 | The operation is performed according to the IEC/IEEE Standard for Binary
4341 | Floating-Point Arithmetic.
4342 *----------------------------------------------------------------------------*/
4343 float32 float32_log2(float32 a, float_status *status)
4344 {
4345     flag aSign, zSign;
4346     int aExp;
4347     uint32_t aSig, zSig, i;
4348 
4349     a = float32_squash_input_denormal(a, status);
4350     aSig = extractFloat32Frac( a );
4351     aExp = extractFloat32Exp( a );
4352     aSign = extractFloat32Sign( a );
4353 
4354     if ( aExp == 0 ) {
4355         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
4356         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4357     }
4358     if ( aSign ) {
4359         float_raise(float_flag_invalid, status);
4360         return float32_default_nan(status);
4361     }
4362     if ( aExp == 0xFF ) {
4363         if (aSig) {
4364             return propagateFloat32NaN(a, float32_zero, status);
4365         }
4366         return a;
4367     }
4368 
4369     aExp -= 0x7F;
4370     aSig |= 0x00800000;
4371     zSign = aExp < 0;
4372     zSig = aExp << 23;
4373 
4374     for (i = 1 << 22; i > 0; i >>= 1) {
4375         aSig = ( (uint64_t)aSig * aSig ) >> 23;
4376         if ( aSig & 0x01000000 ) {
4377             aSig >>= 1;
4378             zSig |= i;
4379         }
4380     }
4381 
4382     if ( zSign )
4383         zSig = -zSig;
4384 
4385     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
4386 }
4387 
4388 /*----------------------------------------------------------------------------
4389 | Returns 1 if the single-precision floating-point value `a' is equal to
4390 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4391 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4392 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4393 *----------------------------------------------------------------------------*/
4394 
4395 int float32_eq(float32 a, float32 b, float_status *status)
4396 {
4397     uint32_t av, bv;
4398     a = float32_squash_input_denormal(a, status);
4399     b = float32_squash_input_denormal(b, status);
4400 
4401     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4402          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4403        ) {
4404         float_raise(float_flag_invalid, status);
4405         return 0;
4406     }
4407     av = float32_val(a);
4408     bv = float32_val(b);
4409     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4410 }
4411 
4412 /*----------------------------------------------------------------------------
4413 | Returns 1 if the single-precision floating-point value `a' is less than
4414 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
4415 | exception is raised if either operand is a NaN.  The comparison is performed
4416 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4417 *----------------------------------------------------------------------------*/
4418 
4419 int float32_le(float32 a, float32 b, float_status *status)
4420 {
4421     flag aSign, bSign;
4422     uint32_t av, bv;
4423     a = float32_squash_input_denormal(a, status);
4424     b = float32_squash_input_denormal(b, status);
4425 
4426     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4427          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4428        ) {
4429         float_raise(float_flag_invalid, status);
4430         return 0;
4431     }
4432     aSign = extractFloat32Sign( a );
4433     bSign = extractFloat32Sign( b );
4434     av = float32_val(a);
4435     bv = float32_val(b);
4436     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4437     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4438 
4439 }
4440 
4441 /*----------------------------------------------------------------------------
4442 | Returns 1 if the single-precision floating-point value `a' is less than
4443 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4444 | raised if either operand is a NaN.  The comparison is performed according
4445 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4446 *----------------------------------------------------------------------------*/
4447 
4448 int float32_lt(float32 a, float32 b, float_status *status)
4449 {
4450     flag aSign, bSign;
4451     uint32_t av, bv;
4452     a = float32_squash_input_denormal(a, status);
4453     b = float32_squash_input_denormal(b, status);
4454 
4455     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4456          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4457        ) {
4458         float_raise(float_flag_invalid, status);
4459         return 0;
4460     }
4461     aSign = extractFloat32Sign( a );
4462     bSign = extractFloat32Sign( b );
4463     av = float32_val(a);
4464     bv = float32_val(b);
4465     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4466     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4467 
4468 }
4469 
4470 /*----------------------------------------------------------------------------
4471 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4472 | be compared, and 0 otherwise.  The invalid exception is raised if either
4473 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4474 | Standard for Binary Floating-Point Arithmetic.
4475 *----------------------------------------------------------------------------*/
4476 
4477 int float32_unordered(float32 a, float32 b, float_status *status)
4478 {
4479     a = float32_squash_input_denormal(a, status);
4480     b = float32_squash_input_denormal(b, status);
4481 
4482     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4483          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4484        ) {
4485         float_raise(float_flag_invalid, status);
4486         return 1;
4487     }
4488     return 0;
4489 }
4490 
4491 /*----------------------------------------------------------------------------
4492 | Returns 1 if the single-precision floating-point value `a' is equal to
4493 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4494 | exception.  The comparison is performed according to the IEC/IEEE Standard
4495 | for Binary Floating-Point Arithmetic.
4496 *----------------------------------------------------------------------------*/
4497 
4498 int float32_eq_quiet(float32 a, float32 b, float_status *status)
4499 {
4500     a = float32_squash_input_denormal(a, status);
4501     b = float32_squash_input_denormal(b, status);
4502 
4503     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4504          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4505        ) {
4506         if (float32_is_signaling_nan(a, status)
4507          || float32_is_signaling_nan(b, status)) {
4508             float_raise(float_flag_invalid, status);
4509         }
4510         return 0;
4511     }
4512     return ( float32_val(a) == float32_val(b) ) ||
4513             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
4514 }
4515 
4516 /*----------------------------------------------------------------------------
4517 | Returns 1 if the single-precision floating-point value `a' is less than or
4518 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4519 | cause an exception.  Otherwise, the comparison is performed according to the
4520 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4521 *----------------------------------------------------------------------------*/
4522 
4523 int float32_le_quiet(float32 a, float32 b, float_status *status)
4524 {
4525     flag aSign, bSign;
4526     uint32_t av, bv;
4527     a = float32_squash_input_denormal(a, status);
4528     b = float32_squash_input_denormal(b, status);
4529 
4530     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4531          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4532        ) {
4533         if (float32_is_signaling_nan(a, status)
4534          || float32_is_signaling_nan(b, status)) {
4535             float_raise(float_flag_invalid, status);
4536         }
4537         return 0;
4538     }
4539     aSign = extractFloat32Sign( a );
4540     bSign = extractFloat32Sign( b );
4541     av = float32_val(a);
4542     bv = float32_val(b);
4543     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4544     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4545 
4546 }
4547 
4548 /*----------------------------------------------------------------------------
4549 | Returns 1 if the single-precision floating-point value `a' is less than
4550 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4551 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4552 | Standard for Binary Floating-Point Arithmetic.
4553 *----------------------------------------------------------------------------*/
4554 
4555 int float32_lt_quiet(float32 a, float32 b, float_status *status)
4556 {
4557     flag aSign, bSign;
4558     uint32_t av, bv;
4559     a = float32_squash_input_denormal(a, status);
4560     b = float32_squash_input_denormal(b, status);
4561 
4562     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4563          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4564        ) {
4565         if (float32_is_signaling_nan(a, status)
4566          || float32_is_signaling_nan(b, status)) {
4567             float_raise(float_flag_invalid, status);
4568         }
4569         return 0;
4570     }
4571     aSign = extractFloat32Sign( a );
4572     bSign = extractFloat32Sign( b );
4573     av = float32_val(a);
4574     bv = float32_val(b);
4575     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4576     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4577 
4578 }
4579 
4580 /*----------------------------------------------------------------------------
4581 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4582 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4583 | comparison is performed according to the IEC/IEEE Standard for Binary
4584 | Floating-Point Arithmetic.
4585 *----------------------------------------------------------------------------*/
4586 
4587 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
4588 {
4589     a = float32_squash_input_denormal(a, status);
4590     b = float32_squash_input_denormal(b, status);
4591 
4592     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4593          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4594        ) {
4595         if (float32_is_signaling_nan(a, status)
4596          || float32_is_signaling_nan(b, status)) {
4597             float_raise(float_flag_invalid, status);
4598         }
4599         return 1;
4600     }
4601     return 0;
4602 }
4603 
4604 /*----------------------------------------------------------------------------
4605 | If `a' is denormal and we are in flush-to-zero mode then set the
4606 | input-denormal exception and return zero. Otherwise just return the value.
4607 *----------------------------------------------------------------------------*/
4608 float16 float16_squash_input_denormal(float16 a, float_status *status)
4609 {
4610     if (status->flush_inputs_to_zero) {
4611         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
4612             float_raise(float_flag_input_denormal, status);
4613             return make_float16(float16_val(a) & 0x8000);
4614         }
4615     }
4616     return a;
4617 }
4618 
4619 /*----------------------------------------------------------------------------
4620 | Returns the result of converting the double-precision floating-point value
4621 | `a' to the extended double-precision floating-point format.  The conversion
4622 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4623 | Arithmetic.
4624 *----------------------------------------------------------------------------*/
4625 
4626 floatx80 float64_to_floatx80(float64 a, float_status *status)
4627 {
4628     flag aSign;
4629     int aExp;
4630     uint64_t aSig;
4631 
4632     a = float64_squash_input_denormal(a, status);
4633     aSig = extractFloat64Frac( a );
4634     aExp = extractFloat64Exp( a );
4635     aSign = extractFloat64Sign( a );
4636     if ( aExp == 0x7FF ) {
4637         if (aSig) {
4638             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4639         }
4640         return packFloatx80(aSign,
4641                             floatx80_infinity_high,
4642                             floatx80_infinity_low);
4643     }
4644     if ( aExp == 0 ) {
4645         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4646         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4647     }
4648     return
4649         packFloatx80(
4650             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4651 
4652 }
4653 
4654 /*----------------------------------------------------------------------------
4655 | Returns the result of converting the double-precision floating-point value
4656 | `a' to the quadruple-precision floating-point format.  The conversion is
4657 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4658 | Arithmetic.
4659 *----------------------------------------------------------------------------*/
4660 
4661 float128 float64_to_float128(float64 a, float_status *status)
4662 {
4663     flag aSign;
4664     int aExp;
4665     uint64_t aSig, zSig0, zSig1;
4666 
4667     a = float64_squash_input_denormal(a, status);
4668     aSig = extractFloat64Frac( a );
4669     aExp = extractFloat64Exp( a );
4670     aSign = extractFloat64Sign( a );
4671     if ( aExp == 0x7FF ) {
4672         if (aSig) {
4673             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4674         }
4675         return packFloat128( aSign, 0x7FFF, 0, 0 );
4676     }
4677     if ( aExp == 0 ) {
4678         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4679         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4680         --aExp;
4681     }
4682     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4683     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4684 
4685 }
4686 
4687 
/*----------------------------------------------------------------------------
| Returns the remainder of the double-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases handled before the reduction:
|   - either operand NaN            -> propagateFloat64NaN(a, b)
|   - `a' infinite, or `b' zero     -> invalid exception, default NaN
|   - `b' infinite, or `a' zero     -> `a' is returned unchanged
| Both inputs are first passed through float64_squash_input_denormal.
*----------------------------------------------------------------------------*/

float64 float64_rem(float64 a, float64 b, float_status *status)
{
    flag aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    /* Split both operands into fields; the sign of `b' is never needed. */
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    if ( aExp == 0x7FF ) {
        /* `a' is NaN or infinity. */
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        /* `a' is infinite and `b' is not a NaN: invalid operation. */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        /* `b' is infinite and `a' is finite: the remainder is `a' itself. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* Remainder with respect to zero: invalid operation. */
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        /* A zero dividend is returned as-is (sign preserved). */
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the implicit leading 1 explicit (bit 52) and move it to bit 63. */
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
    if ( expDiff < 0 ) {
        /* |a| < |b|/2, so `a' already is the remainder. */
        if ( expDiff < -1 ) return a;
        aSig >>= 1;
    }
    /* First quotient bit: can bSig be subtracted once? */
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    /* Reduce in chunks of 62 exponent steps while more than 64 remain. */
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        /* Lower the estimate by 2, clamped at 0.
         * NOTE(review): presumably because the estimate may overshoot the
         * true quotient slightly -- confirm against the helper. */
        q = ( 2 < q ) ? q - 2 : 0;
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /* Final partial step: keep only the top expDiff quotient bits and
         * form the partial remainder from them. */
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        /* No further quotient bits; just bring both values to the scale
         * expected by the correction loop below. */
        aSig >>= 2;
        bSig >>= 2;
    }
    /* Keep subtracting bSig (counting each subtraction in q) until the
     * value goes negative; alternateASig holds the last non-negative value. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    /* Pick whichever candidate is nearer to zero; on an exact tie
     * (sigMean == 0) the parity of q decides. */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* A negative candidate flips the result sign relative to `a'. */
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);

}
4775 
4776 /*----------------------------------------------------------------------------
4777 | Returns the binary log of the double-precision floating-point value `a'.
4778 | The operation is performed according to the IEC/IEEE Standard for Binary
4779 | Floating-Point Arithmetic.
4780 *----------------------------------------------------------------------------*/
4781 float64 float64_log2(float64 a, float_status *status)
4782 {
4783     flag aSign, zSign;
4784     int aExp;
4785     uint64_t aSig, aSig0, aSig1, zSig, i;
4786     a = float64_squash_input_denormal(a, status);
4787 
4788     aSig = extractFloat64Frac( a );
4789     aExp = extractFloat64Exp( a );
4790     aSign = extractFloat64Sign( a );
4791 
4792     if ( aExp == 0 ) {
4793         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4794         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4795     }
4796     if ( aSign ) {
4797         float_raise(float_flag_invalid, status);
4798         return float64_default_nan(status);
4799     }
4800     if ( aExp == 0x7FF ) {
4801         if (aSig) {
4802             return propagateFloat64NaN(a, float64_zero, status);
4803         }
4804         return a;
4805     }
4806 
4807     aExp -= 0x3FF;
4808     aSig |= LIT64( 0x0010000000000000 );
4809     zSign = aExp < 0;
4810     zSig = (uint64_t)aExp << 52;
4811     for (i = 1LL << 51; i > 0; i >>= 1) {
4812         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4813         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4814         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4815             aSig >>= 1;
4816             zSig |= i;
4817         }
4818     }
4819 
4820     if ( zSign )
4821         zSig = -zSig;
4822     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4823 }
4824 
4825 /*----------------------------------------------------------------------------
4826 | Returns 1 if the double-precision floating-point value `a' is equal to the
4827 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4828 | if either operand is a NaN.  Otherwise, the comparison is performed
4829 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4830 *----------------------------------------------------------------------------*/
4831 
4832 int float64_eq(float64 a, float64 b, float_status *status)
4833 {
4834     uint64_t av, bv;
4835     a = float64_squash_input_denormal(a, status);
4836     b = float64_squash_input_denormal(b, status);
4837 
4838     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4839          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4840        ) {
4841         float_raise(float_flag_invalid, status);
4842         return 0;
4843     }
4844     av = float64_val(a);
4845     bv = float64_val(b);
4846     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4847 
4848 }
4849 
4850 /*----------------------------------------------------------------------------
4851 | Returns 1 if the double-precision floating-point value `a' is less than or
4852 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4853 | exception is raised if either operand is a NaN.  The comparison is performed
4854 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4855 *----------------------------------------------------------------------------*/
4856 
4857 int float64_le(float64 a, float64 b, float_status *status)
4858 {
4859     flag aSign, bSign;
4860     uint64_t av, bv;
4861     a = float64_squash_input_denormal(a, status);
4862     b = float64_squash_input_denormal(b, status);
4863 
4864     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4865          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4866        ) {
4867         float_raise(float_flag_invalid, status);
4868         return 0;
4869     }
4870     aSign = extractFloat64Sign( a );
4871     bSign = extractFloat64Sign( b );
4872     av = float64_val(a);
4873     bv = float64_val(b);
4874     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4875     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4876 
4877 }
4878 
4879 /*----------------------------------------------------------------------------
4880 | Returns 1 if the double-precision floating-point value `a' is less than
4881 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4882 | raised if either operand is a NaN.  The comparison is performed according
4883 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4884 *----------------------------------------------------------------------------*/
4885 
4886 int float64_lt(float64 a, float64 b, float_status *status)
4887 {
4888     flag aSign, bSign;
4889     uint64_t av, bv;
4890 
4891     a = float64_squash_input_denormal(a, status);
4892     b = float64_squash_input_denormal(b, status);
4893     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4894          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4895        ) {
4896         float_raise(float_flag_invalid, status);
4897         return 0;
4898     }
4899     aSign = extractFloat64Sign( a );
4900     bSign = extractFloat64Sign( b );
4901     av = float64_val(a);
4902     bv = float64_val(b);
4903     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4904     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4905 
4906 }
4907 
4908 /*----------------------------------------------------------------------------
4909 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4910 | be compared, and 0 otherwise.  The invalid exception is raised if either
4911 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4912 | Standard for Binary Floating-Point Arithmetic.
4913 *----------------------------------------------------------------------------*/
4914 
4915 int float64_unordered(float64 a, float64 b, float_status *status)
4916 {
4917     a = float64_squash_input_denormal(a, status);
4918     b = float64_squash_input_denormal(b, status);
4919 
4920     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4921          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4922        ) {
4923         float_raise(float_flag_invalid, status);
4924         return 1;
4925     }
4926     return 0;
4927 }
4928 
4929 /*----------------------------------------------------------------------------
4930 | Returns 1 if the double-precision floating-point value `a' is equal to the
4931 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4932 | exception.The comparison is performed according to the IEC/IEEE Standard
4933 | for Binary Floating-Point Arithmetic.
4934 *----------------------------------------------------------------------------*/
4935 
4936 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4937 {
4938     uint64_t av, bv;
4939     a = float64_squash_input_denormal(a, status);
4940     b = float64_squash_input_denormal(b, status);
4941 
4942     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4943          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4944        ) {
4945         if (float64_is_signaling_nan(a, status)
4946          || float64_is_signaling_nan(b, status)) {
4947             float_raise(float_flag_invalid, status);
4948         }
4949         return 0;
4950     }
4951     av = float64_val(a);
4952     bv = float64_val(b);
4953     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4954 
4955 }
4956 
4957 /*----------------------------------------------------------------------------
4958 | Returns 1 if the double-precision floating-point value `a' is less than or
4959 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4960 | cause an exception.  Otherwise, the comparison is performed according to the
4961 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4962 *----------------------------------------------------------------------------*/
4963 
4964 int float64_le_quiet(float64 a, float64 b, float_status *status)
4965 {
4966     flag aSign, bSign;
4967     uint64_t av, bv;
4968     a = float64_squash_input_denormal(a, status);
4969     b = float64_squash_input_denormal(b, status);
4970 
4971     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4972          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4973        ) {
4974         if (float64_is_signaling_nan(a, status)
4975          || float64_is_signaling_nan(b, status)) {
4976             float_raise(float_flag_invalid, status);
4977         }
4978         return 0;
4979     }
4980     aSign = extractFloat64Sign( a );
4981     bSign = extractFloat64Sign( b );
4982     av = float64_val(a);
4983     bv = float64_val(b);
4984     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4985     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4986 
4987 }
4988 
4989 /*----------------------------------------------------------------------------
4990 | Returns 1 if the double-precision floating-point value `a' is less than
4991 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4992 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4993 | Standard for Binary Floating-Point Arithmetic.
4994 *----------------------------------------------------------------------------*/
4995 
4996 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4997 {
4998     flag aSign, bSign;
4999     uint64_t av, bv;
5000     a = float64_squash_input_denormal(a, status);
5001     b = float64_squash_input_denormal(b, status);
5002 
5003     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5004          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5005        ) {
5006         if (float64_is_signaling_nan(a, status)
5007          || float64_is_signaling_nan(b, status)) {
5008             float_raise(float_flag_invalid, status);
5009         }
5010         return 0;
5011     }
5012     aSign = extractFloat64Sign( a );
5013     bSign = extractFloat64Sign( b );
5014     av = float64_val(a);
5015     bv = float64_val(b);
5016     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5017     return ( av != bv ) && ( aSign ^ ( av < bv ) );
5018 
5019 }
5020 
5021 /*----------------------------------------------------------------------------
5022 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5023 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
5024 | comparison is performed according to the IEC/IEEE Standard for Binary
5025 | Floating-Point Arithmetic.
5026 *----------------------------------------------------------------------------*/
5027 
5028 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
5029 {
5030     a = float64_squash_input_denormal(a, status);
5031     b = float64_squash_input_denormal(b, status);
5032 
5033     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5034          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5035        ) {
5036         if (float64_is_signaling_nan(a, status)
5037          || float64_is_signaling_nan(b, status)) {
5038             float_raise(float_flag_invalid, status);
5039         }
5040         return 1;
5041     }
5042     return 0;
5043 }
5044 
5045 /*----------------------------------------------------------------------------
5046 | Returns the result of converting the extended double-precision floating-
5047 | point value `a' to the 32-bit two's complement integer format.  The
5048 | conversion is performed according to the IEC/IEEE Standard for Binary
5049 | Floating-Point Arithmetic---which means in particular that the conversion
5050 | is rounded according to the current rounding mode.  If `a' is a NaN, the
5051 | largest positive integer is returned.  Otherwise, if the conversion
5052 | overflows, the largest integer with the same sign as `a' is returned.
5053 *----------------------------------------------------------------------------*/
5054 
5055 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5056 {
5057     flag aSign;
5058     int32_t aExp, shiftCount;
5059     uint64_t aSig;
5060 
5061     if (floatx80_invalid_encoding(a)) {
5062         float_raise(float_flag_invalid, status);
5063         return 1 << 31;
5064     }
5065     aSig = extractFloatx80Frac( a );
5066     aExp = extractFloatx80Exp( a );
5067     aSign = extractFloatx80Sign( a );
5068     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5069     shiftCount = 0x4037 - aExp;
5070     if ( shiftCount <= 0 ) shiftCount = 1;
5071     shift64RightJamming( aSig, shiftCount, &aSig );
5072     return roundAndPackInt32(aSign, aSig, status);
5073 
5074 }
5075 
5076 /*----------------------------------------------------------------------------
5077 | Returns the result of converting the extended double-precision floating-
5078 | point value `a' to the 32-bit two's complement integer format.  The
5079 | conversion is performed according to the IEC/IEEE Standard for Binary
5080 | Floating-Point Arithmetic, except that the conversion is always rounded
5081 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5082 | Otherwise, if the conversion overflows, the largest integer with the same
5083 | sign as `a' is returned.
5084 *----------------------------------------------------------------------------*/
5085 
5086 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5087 {
5088     flag aSign;
5089     int32_t aExp, shiftCount;
5090     uint64_t aSig, savedASig;
5091     int32_t z;
5092 
5093     if (floatx80_invalid_encoding(a)) {
5094         float_raise(float_flag_invalid, status);
5095         return 1 << 31;
5096     }
5097     aSig = extractFloatx80Frac( a );
5098     aExp = extractFloatx80Exp( a );
5099     aSign = extractFloatx80Sign( a );
5100     if ( 0x401E < aExp ) {
5101         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5102         goto invalid;
5103     }
5104     else if ( aExp < 0x3FFF ) {
5105         if (aExp || aSig) {
5106             status->float_exception_flags |= float_flag_inexact;
5107         }
5108         return 0;
5109     }
5110     shiftCount = 0x403E - aExp;
5111     savedASig = aSig;
5112     aSig >>= shiftCount;
5113     z = aSig;
5114     if ( aSign ) z = - z;
5115     if ( ( z < 0 ) ^ aSign ) {
5116  invalid:
5117         float_raise(float_flag_invalid, status);
5118         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5119     }
5120     if ( ( aSig<<shiftCount ) != savedASig ) {
5121         status->float_exception_flags |= float_flag_inexact;
5122     }
5123     return z;
5124 
5125 }
5126 
5127 /*----------------------------------------------------------------------------
5128 | Returns the result of converting the extended double-precision floating-
5129 | point value `a' to the 64-bit two's complement integer format.  The
5130 | conversion is performed according to the IEC/IEEE Standard for Binary
5131 | Floating-Point Arithmetic---which means in particular that the conversion
5132 | is rounded according to the current rounding mode.  If `a' is a NaN,
5133 | the largest positive integer is returned.  Otherwise, if the conversion
5134 | overflows, the largest integer with the same sign as `a' is returned.
5135 *----------------------------------------------------------------------------*/
5136 
5137 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5138 {
5139     flag aSign;
5140     int32_t aExp, shiftCount;
5141     uint64_t aSig, aSigExtra;
5142 
5143     if (floatx80_invalid_encoding(a)) {
5144         float_raise(float_flag_invalid, status);
5145         return 1ULL << 63;
5146     }
5147     aSig = extractFloatx80Frac( a );
5148     aExp = extractFloatx80Exp( a );
5149     aSign = extractFloatx80Sign( a );
5150     shiftCount = 0x403E - aExp;
5151     if ( shiftCount <= 0 ) {
5152         if ( shiftCount ) {
5153             float_raise(float_flag_invalid, status);
5154             if (!aSign || floatx80_is_any_nan(a)) {
5155                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5156             }
5157             return (int64_t) LIT64( 0x8000000000000000 );
5158         }
5159         aSigExtra = 0;
5160     }
5161     else {
5162         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5163     }
5164     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5165 
5166 }
5167 
5168 /*----------------------------------------------------------------------------
5169 | Returns the result of converting the extended double-precision floating-
5170 | point value `a' to the 64-bit two's complement integer format.  The
5171 | conversion is performed according to the IEC/IEEE Standard for Binary
5172 | Floating-Point Arithmetic, except that the conversion is always rounded
5173 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5174 | Otherwise, if the conversion overflows, the largest integer with the same
5175 | sign as `a' is returned.
5176 *----------------------------------------------------------------------------*/
5177 
5178 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5179 {
5180     flag aSign;
5181     int32_t aExp, shiftCount;
5182     uint64_t aSig;
5183     int64_t z;
5184 
5185     if (floatx80_invalid_encoding(a)) {
5186         float_raise(float_flag_invalid, status);
5187         return 1ULL << 63;
5188     }
5189     aSig = extractFloatx80Frac( a );
5190     aExp = extractFloatx80Exp( a );
5191     aSign = extractFloatx80Sign( a );
5192     shiftCount = aExp - 0x403E;
5193     if ( 0 <= shiftCount ) {
5194         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
5195         if ( ( a.high != 0xC03E ) || aSig ) {
5196             float_raise(float_flag_invalid, status);
5197             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5198                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5199             }
5200         }
5201         return (int64_t) LIT64( 0x8000000000000000 );
5202     }
5203     else if ( aExp < 0x3FFF ) {
5204         if (aExp | aSig) {
5205             status->float_exception_flags |= float_flag_inexact;
5206         }
5207         return 0;
5208     }
5209     z = aSig>>( - shiftCount );
5210     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5211         status->float_exception_flags |= float_flag_inexact;
5212     }
5213     if ( aSign ) z = - z;
5214     return z;
5215 
5216 }
5217 
5218 /*----------------------------------------------------------------------------
5219 | Returns the result of converting the extended double-precision floating-
5220 | point value `a' to the single-precision floating-point format.  The
5221 | conversion is performed according to the IEC/IEEE Standard for Binary
5222 | Floating-Point Arithmetic.
5223 *----------------------------------------------------------------------------*/
5224 
5225 float32 floatx80_to_float32(floatx80 a, float_status *status)
5226 {
5227     flag aSign;
5228     int32_t aExp;
5229     uint64_t aSig;
5230 
5231     if (floatx80_invalid_encoding(a)) {
5232         float_raise(float_flag_invalid, status);
5233         return float32_default_nan(status);
5234     }
5235     aSig = extractFloatx80Frac( a );
5236     aExp = extractFloatx80Exp( a );
5237     aSign = extractFloatx80Sign( a );
5238     if ( aExp == 0x7FFF ) {
5239         if ( (uint64_t) ( aSig<<1 ) ) {
5240             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5241         }
5242         return packFloat32( aSign, 0xFF, 0 );
5243     }
5244     shift64RightJamming( aSig, 33, &aSig );
5245     if ( aExp || aSig ) aExp -= 0x3F81;
5246     return roundAndPackFloat32(aSign, aExp, aSig, status);
5247 
5248 }
5249 
5250 /*----------------------------------------------------------------------------
5251 | Returns the result of converting the extended double-precision floating-
5252 | point value `a' to the double-precision floating-point format.  The
5253 | conversion is performed according to the IEC/IEEE Standard for Binary
5254 | Floating-Point Arithmetic.
5255 *----------------------------------------------------------------------------*/
5256 
5257 float64 floatx80_to_float64(floatx80 a, float_status *status)
5258 {
5259     flag aSign;
5260     int32_t aExp;
5261     uint64_t aSig, zSig;
5262 
5263     if (floatx80_invalid_encoding(a)) {
5264         float_raise(float_flag_invalid, status);
5265         return float64_default_nan(status);
5266     }
5267     aSig = extractFloatx80Frac( a );
5268     aExp = extractFloatx80Exp( a );
5269     aSign = extractFloatx80Sign( a );
5270     if ( aExp == 0x7FFF ) {
5271         if ( (uint64_t) ( aSig<<1 ) ) {
5272             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5273         }
5274         return packFloat64( aSign, 0x7FF, 0 );
5275     }
5276     shift64RightJamming( aSig, 1, &zSig );
5277     if ( aExp || aSig ) aExp -= 0x3C01;
5278     return roundAndPackFloat64(aSign, aExp, zSig, status);
5279 
5280 }
5281 
5282 /*----------------------------------------------------------------------------
5283 | Returns the result of converting the extended double-precision floating-
5284 | point value `a' to the quadruple-precision floating-point format.  The
5285 | conversion is performed according to the IEC/IEEE Standard for Binary
5286 | Floating-Point Arithmetic.
5287 *----------------------------------------------------------------------------*/
5288 
5289 float128 floatx80_to_float128(floatx80 a, float_status *status)
5290 {
5291     flag aSign;
5292     int aExp;
5293     uint64_t aSig, zSig0, zSig1;
5294 
5295     if (floatx80_invalid_encoding(a)) {
5296         float_raise(float_flag_invalid, status);
5297         return float128_default_nan(status);
5298     }
5299     aSig = extractFloatx80Frac( a );
5300     aExp = extractFloatx80Exp( a );
5301     aSign = extractFloatx80Sign( a );
5302     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5303         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5304     }
5305     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5306     return packFloat128( aSign, aExp, zSig0, zSig1 );
5307 
5308 }
5309 
5310 /*----------------------------------------------------------------------------
5311 | Rounds the extended double-precision floating-point value `a'
5312 | to the precision provided by floatx80_rounding_precision and returns the
5313 | result as an extended double-precision floating-point value.
5314 | The operation is performed according to the IEC/IEEE Standard for Binary
5315 | Floating-Point Arithmetic.
5316 *----------------------------------------------------------------------------*/
5317 
5318 floatx80 floatx80_round(floatx80 a, float_status *status)
5319 {
5320     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5321                                 extractFloatx80Sign(a),
5322                                 extractFloatx80Exp(a),
5323                                 extractFloatx80Frac(a), 0, status);
5324 }
5325 
5326 /*----------------------------------------------------------------------------
5327 | Rounds the extended double-precision floating-point value `a' to an integer,
5328 | and returns the result as an extended quadruple-precision floating-point
5329 | value.  The operation is performed according to the IEC/IEEE Standard for
5330 | Binary Floating-Point Arithmetic.
5331 *----------------------------------------------------------------------------*/
5332 
5333 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5334 {
5335     flag aSign;
5336     int32_t aExp;
5337     uint64_t lastBitMask, roundBitsMask;
5338     floatx80 z;
5339 
5340     if (floatx80_invalid_encoding(a)) {
5341         float_raise(float_flag_invalid, status);
5342         return floatx80_default_nan(status);
5343     }
5344     aExp = extractFloatx80Exp( a );
5345     if ( 0x403E <= aExp ) {
5346         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5347             return propagateFloatx80NaN(a, a, status);
5348         }
5349         return a;
5350     }
5351     if ( aExp < 0x3FFF ) {
5352         if (    ( aExp == 0 )
5353              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5354             return a;
5355         }
5356         status->float_exception_flags |= float_flag_inexact;
5357         aSign = extractFloatx80Sign( a );
5358         switch (status->float_rounding_mode) {
5359          case float_round_nearest_even:
5360             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5361                ) {
5362                 return
5363                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5364             }
5365             break;
5366         case float_round_ties_away:
5367             if (aExp == 0x3FFE) {
5368                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5369             }
5370             break;
5371          case float_round_down:
5372             return
5373                   aSign ?
5374                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5375                 : packFloatx80( 0, 0, 0 );
5376          case float_round_up:
5377             return
5378                   aSign ? packFloatx80( 1, 0, 0 )
5379                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5380         }
5381         return packFloatx80( aSign, 0, 0 );
5382     }
5383     lastBitMask = 1;
5384     lastBitMask <<= 0x403E - aExp;
5385     roundBitsMask = lastBitMask - 1;
5386     z = a;
5387     switch (status->float_rounding_mode) {
5388     case float_round_nearest_even:
5389         z.low += lastBitMask>>1;
5390         if ((z.low & roundBitsMask) == 0) {
5391             z.low &= ~lastBitMask;
5392         }
5393         break;
5394     case float_round_ties_away:
5395         z.low += lastBitMask >> 1;
5396         break;
5397     case float_round_to_zero:
5398         break;
5399     case float_round_up:
5400         if (!extractFloatx80Sign(z)) {
5401             z.low += roundBitsMask;
5402         }
5403         break;
5404     case float_round_down:
5405         if (extractFloatx80Sign(z)) {
5406             z.low += roundBitsMask;
5407         }
5408         break;
5409     default:
5410         abort();
5411     }
5412     z.low &= ~ roundBitsMask;
5413     if ( z.low == 0 ) {
5414         ++z.high;
5415         z.low = LIT64( 0x8000000000000000 );
5416     }
5417     if (z.low != a.low) {
5418         status->float_exception_flags |= float_flag_inexact;
5419     }
5420     return z;
5421 
5422 }
5423 
5424 /*----------------------------------------------------------------------------
5425 | Returns the result of adding the absolute values of the extended double-
5426 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5427 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5428 | The addition is performed according to the IEC/IEEE Standard for Binary
5429 | Floating-Point Arithmetic.
5430 *----------------------------------------------------------------------------*/
5431 
5432 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5433                                 float_status *status)
5434 {
5435     int32_t aExp, bExp, zExp;
5436     uint64_t aSig, bSig, zSig0, zSig1;
5437     int32_t expDiff;
5438 
5439     aSig = extractFloatx80Frac( a );
5440     aExp = extractFloatx80Exp( a );
5441     bSig = extractFloatx80Frac( b );
5442     bExp = extractFloatx80Exp( b );
5443     expDiff = aExp - bExp;
5444     if ( 0 < expDiff ) {
5445         if ( aExp == 0x7FFF ) {
5446             if ((uint64_t)(aSig << 1)) {
5447                 return propagateFloatx80NaN(a, b, status);
5448             }
5449             return a;
5450         }
5451         if ( bExp == 0 ) --expDiff;
5452         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5453         zExp = aExp;
5454     }
5455     else if ( expDiff < 0 ) {
5456         if ( bExp == 0x7FFF ) {
5457             if ((uint64_t)(bSig << 1)) {
5458                 return propagateFloatx80NaN(a, b, status);
5459             }
5460             return packFloatx80(zSign,
5461                                 floatx80_infinity_high,
5462                                 floatx80_infinity_low);
5463         }
5464         if ( aExp == 0 ) ++expDiff;
5465         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5466         zExp = bExp;
5467     }
5468     else {
5469         if ( aExp == 0x7FFF ) {
5470             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5471                 return propagateFloatx80NaN(a, b, status);
5472             }
5473             return a;
5474         }
5475         zSig1 = 0;
5476         zSig0 = aSig + bSig;
5477         if ( aExp == 0 ) {
5478             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5479             goto roundAndPack;
5480         }
5481         zExp = aExp;
5482         goto shiftRight1;
5483     }
5484     zSig0 = aSig + bSig;
5485     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5486  shiftRight1:
5487     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5488     zSig0 |= LIT64( 0x8000000000000000 );
5489     ++zExp;
5490  roundAndPack:
5491     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5492                                 zSign, zExp, zSig0, zSig1, status);
5493 }
5494 
5495 /*----------------------------------------------------------------------------
5496 | Returns the result of subtracting the absolute values of the extended
5497 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5498 | difference is negated before being returned.  `zSign' is ignored if the
5499 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5500 | Standard for Binary Floating-Point Arithmetic.
5501 *----------------------------------------------------------------------------*/
5502 
5503 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5504                                 float_status *status)
5505 {
5506     int32_t aExp, bExp, zExp;
5507     uint64_t aSig, bSig, zSig0, zSig1;
5508     int32_t expDiff;
5509 
5510     aSig = extractFloatx80Frac( a );
5511     aExp = extractFloatx80Exp( a );
5512     bSig = extractFloatx80Frac( b );
5513     bExp = extractFloatx80Exp( b );
5514     expDiff = aExp - bExp;
5515     if ( 0 < expDiff ) goto aExpBigger;
5516     if ( expDiff < 0 ) goto bExpBigger;
5517     if ( aExp == 0x7FFF ) {
5518         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5519             return propagateFloatx80NaN(a, b, status);
5520         }
5521         float_raise(float_flag_invalid, status);
5522         return floatx80_default_nan(status);
5523     }
5524     if ( aExp == 0 ) {
5525         aExp = 1;
5526         bExp = 1;
5527     }
5528     zSig1 = 0;
5529     if ( bSig < aSig ) goto aBigger;
5530     if ( aSig < bSig ) goto bBigger;
5531     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5532  bExpBigger:
5533     if ( bExp == 0x7FFF ) {
5534         if ((uint64_t)(bSig << 1)) {
5535             return propagateFloatx80NaN(a, b, status);
5536         }
5537         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5538                             floatx80_infinity_low);
5539     }
5540     if ( aExp == 0 ) ++expDiff;
5541     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5542  bBigger:
5543     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5544     zExp = bExp;
5545     zSign ^= 1;
5546     goto normalizeRoundAndPack;
5547  aExpBigger:
5548     if ( aExp == 0x7FFF ) {
5549         if ((uint64_t)(aSig << 1)) {
5550             return propagateFloatx80NaN(a, b, status);
5551         }
5552         return a;
5553     }
5554     if ( bExp == 0 ) --expDiff;
5555     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5556  aBigger:
5557     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5558     zExp = aExp;
5559  normalizeRoundAndPack:
5560     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5561                                          zSign, zExp, zSig0, zSig1, status);
5562 }
5563 
5564 /*----------------------------------------------------------------------------
5565 | Returns the result of adding the extended double-precision floating-point
5566 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5567 | Standard for Binary Floating-Point Arithmetic.
5568 *----------------------------------------------------------------------------*/
5569 
5570 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5571 {
5572     flag aSign, bSign;
5573 
5574     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5575         float_raise(float_flag_invalid, status);
5576         return floatx80_default_nan(status);
5577     }
5578     aSign = extractFloatx80Sign( a );
5579     bSign = extractFloatx80Sign( b );
5580     if ( aSign == bSign ) {
5581         return addFloatx80Sigs(a, b, aSign, status);
5582     }
5583     else {
5584         return subFloatx80Sigs(a, b, aSign, status);
5585     }
5586 
5587 }
5588 
5589 /*----------------------------------------------------------------------------
5590 | Returns the result of subtracting the extended double-precision floating-
5591 | point values `a' and `b'.  The operation is performed according to the
5592 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5593 *----------------------------------------------------------------------------*/
5594 
5595 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5596 {
5597     flag aSign, bSign;
5598 
5599     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5600         float_raise(float_flag_invalid, status);
5601         return floatx80_default_nan(status);
5602     }
5603     aSign = extractFloatx80Sign( a );
5604     bSign = extractFloatx80Sign( b );
5605     if ( aSign == bSign ) {
5606         return subFloatx80Sigs(a, b, aSign, status);
5607     }
5608     else {
5609         return addFloatx80Sigs(a, b, aSign, status);
5610     }
5611 
5612 }
5613 
5614 /*----------------------------------------------------------------------------
5615 | Returns the result of multiplying the extended double-precision floating-
5616 | point values `a' and `b'.  The operation is performed according to the
5617 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5618 *----------------------------------------------------------------------------*/
5619 
5620 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5621 {
5622     flag aSign, bSign, zSign;
5623     int32_t aExp, bExp, zExp;
5624     uint64_t aSig, bSig, zSig0, zSig1;
5625 
5626     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5627         float_raise(float_flag_invalid, status);
5628         return floatx80_default_nan(status);
5629     }
5630     aSig = extractFloatx80Frac( a );
5631     aExp = extractFloatx80Exp( a );
5632     aSign = extractFloatx80Sign( a );
5633     bSig = extractFloatx80Frac( b );
5634     bExp = extractFloatx80Exp( b );
5635     bSign = extractFloatx80Sign( b );
5636     zSign = aSign ^ bSign;
5637     if ( aExp == 0x7FFF ) {
5638         if (    (uint64_t) ( aSig<<1 )
5639              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5640             return propagateFloatx80NaN(a, b, status);
5641         }
5642         if ( ( bExp | bSig ) == 0 ) goto invalid;
5643         return packFloatx80(zSign, floatx80_infinity_high,
5644                                    floatx80_infinity_low);
5645     }
5646     if ( bExp == 0x7FFF ) {
5647         if ((uint64_t)(bSig << 1)) {
5648             return propagateFloatx80NaN(a, b, status);
5649         }
5650         if ( ( aExp | aSig ) == 0 ) {
5651  invalid:
5652             float_raise(float_flag_invalid, status);
5653             return floatx80_default_nan(status);
5654         }
5655         return packFloatx80(zSign, floatx80_infinity_high,
5656                                    floatx80_infinity_low);
5657     }
5658     if ( aExp == 0 ) {
5659         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5660         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5661     }
5662     if ( bExp == 0 ) {
5663         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5664         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5665     }
5666     zExp = aExp + bExp - 0x3FFE;
5667     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5668     if ( 0 < (int64_t) zSig0 ) {
5669         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5670         --zExp;
5671     }
5672     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5673                                 zSign, zExp, zSig0, zSig1, status);
5674 }
5675 
5676 /*----------------------------------------------------------------------------
5677 | Returns the result of dividing the extended double-precision floating-point
5678 | value `a' by the corresponding value `b'.  The operation is performed
5679 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5680 *----------------------------------------------------------------------------*/
5681 
5682 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5683 {
5684     flag aSign, bSign, zSign;
5685     int32_t aExp, bExp, zExp;
5686     uint64_t aSig, bSig, zSig0, zSig1;
5687     uint64_t rem0, rem1, rem2, term0, term1, term2;
5688 
5689     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5690         float_raise(float_flag_invalid, status);
5691         return floatx80_default_nan(status);
5692     }
5693     aSig = extractFloatx80Frac( a );
5694     aExp = extractFloatx80Exp( a );
5695     aSign = extractFloatx80Sign( a );
5696     bSig = extractFloatx80Frac( b );
5697     bExp = extractFloatx80Exp( b );
5698     bSign = extractFloatx80Sign( b );
5699     zSign = aSign ^ bSign;
5700     if ( aExp == 0x7FFF ) {
5701         if ((uint64_t)(aSig << 1)) {
5702             return propagateFloatx80NaN(a, b, status);
5703         }
5704         if ( bExp == 0x7FFF ) {
5705             if ((uint64_t)(bSig << 1)) {
5706                 return propagateFloatx80NaN(a, b, status);
5707             }
5708             goto invalid;
5709         }
5710         return packFloatx80(zSign, floatx80_infinity_high,
5711                                    floatx80_infinity_low);
5712     }
5713     if ( bExp == 0x7FFF ) {
5714         if ((uint64_t)(bSig << 1)) {
5715             return propagateFloatx80NaN(a, b, status);
5716         }
5717         return packFloatx80( zSign, 0, 0 );
5718     }
5719     if ( bExp == 0 ) {
5720         if ( bSig == 0 ) {
5721             if ( ( aExp | aSig ) == 0 ) {
5722  invalid:
5723                 float_raise(float_flag_invalid, status);
5724                 return floatx80_default_nan(status);
5725             }
5726             float_raise(float_flag_divbyzero, status);
5727             return packFloatx80(zSign, floatx80_infinity_high,
5728                                        floatx80_infinity_low);
5729         }
5730         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5731     }
5732     if ( aExp == 0 ) {
5733         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5734         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5735     }
5736     zExp = aExp - bExp + 0x3FFE;
5737     rem1 = 0;
5738     if ( bSig <= aSig ) {
5739         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5740         ++zExp;
5741     }
5742     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5743     mul64To128( bSig, zSig0, &term0, &term1 );
5744     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5745     while ( (int64_t) rem0 < 0 ) {
5746         --zSig0;
5747         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5748     }
5749     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5750     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5751         mul64To128( bSig, zSig1, &term1, &term2 );
5752         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5753         while ( (int64_t) rem1 < 0 ) {
5754             --zSig1;
5755             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5756         }
5757         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5758     }
5759     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5760                                 zSign, zExp, zSig0, zSig1, status);
5761 }
5762 
5763 /*----------------------------------------------------------------------------
5764 | Returns the remainder of the extended double-precision floating-point value
5765 | `a' with respect to the corresponding value `b'.  The operation is performed
5766 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5767 *----------------------------------------------------------------------------*/
5768 
5769 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5770 {
5771     flag aSign, zSign;
5772     int32_t aExp, bExp, expDiff;
5773     uint64_t aSig0, aSig1, bSig;
5774     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5775 
5776     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5777         float_raise(float_flag_invalid, status);
5778         return floatx80_default_nan(status);
5779     }
5780     aSig0 = extractFloatx80Frac( a );
5781     aExp = extractFloatx80Exp( a );
5782     aSign = extractFloatx80Sign( a );
5783     bSig = extractFloatx80Frac( b );
5784     bExp = extractFloatx80Exp( b );
5785     if ( aExp == 0x7FFF ) {
5786         if (    (uint64_t) ( aSig0<<1 )
5787              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5788             return propagateFloatx80NaN(a, b, status);
5789         }
5790         goto invalid;
5791     }
5792     if ( bExp == 0x7FFF ) {
5793         if ((uint64_t)(bSig << 1)) {
5794             return propagateFloatx80NaN(a, b, status);
5795         }
5796         return a;
5797     }
5798     if ( bExp == 0 ) {
5799         if ( bSig == 0 ) {
5800  invalid:
5801             float_raise(float_flag_invalid, status);
5802             return floatx80_default_nan(status);
5803         }
5804         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5805     }
5806     if ( aExp == 0 ) {
5807         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5808         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5809     }
5810     bSig |= LIT64( 0x8000000000000000 );
5811     zSign = aSign;
5812     expDiff = aExp - bExp;
5813     aSig1 = 0;
5814     if ( expDiff < 0 ) {
5815         if ( expDiff < -1 ) return a;
5816         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5817         expDiff = 0;
5818     }
5819     q = ( bSig <= aSig0 );
5820     if ( q ) aSig0 -= bSig;
5821     expDiff -= 64;
5822     while ( 0 < expDiff ) {
5823         q = estimateDiv128To64( aSig0, aSig1, bSig );
5824         q = ( 2 < q ) ? q - 2 : 0;
5825         mul64To128( bSig, q, &term0, &term1 );
5826         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5827         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5828         expDiff -= 62;
5829     }
5830     expDiff += 64;
5831     if ( 0 < expDiff ) {
5832         q = estimateDiv128To64( aSig0, aSig1, bSig );
5833         q = ( 2 < q ) ? q - 2 : 0;
5834         q >>= 64 - expDiff;
5835         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5836         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5837         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5838         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5839             ++q;
5840             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5841         }
5842     }
5843     else {
5844         term1 = 0;
5845         term0 = bSig;
5846     }
5847     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5848     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5849          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5850               && ( q & 1 ) )
5851        ) {
5852         aSig0 = alternateASig0;
5853         aSig1 = alternateASig1;
5854         zSign = ! zSign;
5855     }
5856     return
5857         normalizeRoundAndPackFloatx80(
5858             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5859 
5860 }
5861 
5862 /*----------------------------------------------------------------------------
5863 | Returns the square root of the extended double-precision floating-point
5864 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5865 | for Binary Floating-Point Arithmetic.
5866 *----------------------------------------------------------------------------*/
5867 
5868 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5869 {
5870     flag aSign;
5871     int32_t aExp, zExp;
5872     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5873     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5874 
5875     if (floatx80_invalid_encoding(a)) {
5876         float_raise(float_flag_invalid, status);
5877         return floatx80_default_nan(status);
5878     }
5879     aSig0 = extractFloatx80Frac( a );
5880     aExp = extractFloatx80Exp( a );
5881     aSign = extractFloatx80Sign( a );
5882     if ( aExp == 0x7FFF ) {
5883         if ((uint64_t)(aSig0 << 1)) {
5884             return propagateFloatx80NaN(a, a, status);
5885         }
5886         if ( ! aSign ) return a;
5887         goto invalid;
5888     }
5889     if ( aSign ) {
5890         if ( ( aExp | aSig0 ) == 0 ) return a;
5891  invalid:
5892         float_raise(float_flag_invalid, status);
5893         return floatx80_default_nan(status);
5894     }
5895     if ( aExp == 0 ) {
5896         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5897         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5898     }
5899     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5900     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5901     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5902     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5903     doubleZSig0 = zSig0<<1;
5904     mul64To128( zSig0, zSig0, &term0, &term1 );
5905     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5906     while ( (int64_t) rem0 < 0 ) {
5907         --zSig0;
5908         doubleZSig0 -= 2;
5909         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5910     }
5911     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5912     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5913         if ( zSig1 == 0 ) zSig1 = 1;
5914         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5915         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5916         mul64To128( zSig1, zSig1, &term2, &term3 );
5917         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5918         while ( (int64_t) rem1 < 0 ) {
5919             --zSig1;
5920             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5921             term3 |= 1;
5922             term2 |= doubleZSig0;
5923             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5924         }
5925         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5926     }
5927     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5928     zSig0 |= doubleZSig0;
5929     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5930                                 0, zExp, zSig0, zSig1, status);
5931 }
5932 
5933 /*----------------------------------------------------------------------------
5934 | Returns 1 if the extended double-precision floating-point value `a' is equal
5935 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5936 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5937 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5938 *----------------------------------------------------------------------------*/
5939 
5940 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5941 {
5942 
5943     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5944         || (extractFloatx80Exp(a) == 0x7FFF
5945             && (uint64_t) (extractFloatx80Frac(a) << 1))
5946         || (extractFloatx80Exp(b) == 0x7FFF
5947             && (uint64_t) (extractFloatx80Frac(b) << 1))
5948        ) {
5949         float_raise(float_flag_invalid, status);
5950         return 0;
5951     }
5952     return
5953            ( a.low == b.low )
5954         && (    ( a.high == b.high )
5955              || (    ( a.low == 0 )
5956                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5957            );
5958 
5959 }
5960 
5961 /*----------------------------------------------------------------------------
5962 | Returns 1 if the extended double-precision floating-point value `a' is
5963 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5964 | invalid exception is raised if either operand is a NaN.  The comparison is
5965 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5966 | Arithmetic.
5967 *----------------------------------------------------------------------------*/
5968 
5969 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5970 {
5971     flag aSign, bSign;
5972 
5973     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5974         || (extractFloatx80Exp(a) == 0x7FFF
5975             && (uint64_t) (extractFloatx80Frac(a) << 1))
5976         || (extractFloatx80Exp(b) == 0x7FFF
5977             && (uint64_t) (extractFloatx80Frac(b) << 1))
5978        ) {
5979         float_raise(float_flag_invalid, status);
5980         return 0;
5981     }
5982     aSign = extractFloatx80Sign( a );
5983     bSign = extractFloatx80Sign( b );
5984     if ( aSign != bSign ) {
5985         return
5986                aSign
5987             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5988                  == 0 );
5989     }
5990     return
5991           aSign ? le128( b.high, b.low, a.high, a.low )
5992         : le128( a.high, a.low, b.high, b.low );
5993 
5994 }
5995 
5996 /*----------------------------------------------------------------------------
5997 | Returns 1 if the extended double-precision floating-point value `a' is
5998 | less than the corresponding value `b', and 0 otherwise.  The invalid
5999 | exception is raised if either operand is a NaN.  The comparison is performed
6000 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6001 *----------------------------------------------------------------------------*/
6002 
6003 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
6004 {
6005     flag aSign, bSign;
6006 
6007     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6008         || (extractFloatx80Exp(a) == 0x7FFF
6009             && (uint64_t) (extractFloatx80Frac(a) << 1))
6010         || (extractFloatx80Exp(b) == 0x7FFF
6011             && (uint64_t) (extractFloatx80Frac(b) << 1))
6012        ) {
6013         float_raise(float_flag_invalid, status);
6014         return 0;
6015     }
6016     aSign = extractFloatx80Sign( a );
6017     bSign = extractFloatx80Sign( b );
6018     if ( aSign != bSign ) {
6019         return
6020                aSign
6021             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6022                  != 0 );
6023     }
6024     return
6025           aSign ? lt128( b.high, b.low, a.high, a.low )
6026         : lt128( a.high, a.low, b.high, b.low );
6027 
6028 }
6029 
6030 /*----------------------------------------------------------------------------
6031 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6032 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
6033 | either operand is a NaN.   The comparison is performed according to the
6034 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6035 *----------------------------------------------------------------------------*/
6036 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
6037 {
6038     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6039         || (extractFloatx80Exp(a) == 0x7FFF
6040             && (uint64_t) (extractFloatx80Frac(a) << 1))
6041         || (extractFloatx80Exp(b) == 0x7FFF
6042             && (uint64_t) (extractFloatx80Frac(b) << 1))
6043        ) {
6044         float_raise(float_flag_invalid, status);
6045         return 1;
6046     }
6047     return 0;
6048 }
6049 
6050 /*----------------------------------------------------------------------------
6051 | Returns 1 if the extended double-precision floating-point value `a' is
6052 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6053 | cause an exception.  The comparison is performed according to the IEC/IEEE
6054 | Standard for Binary Floating-Point Arithmetic.
6055 *----------------------------------------------------------------------------*/
6056 
6057 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
6058 {
6059 
6060     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6061         float_raise(float_flag_invalid, status);
6062         return 0;
6063     }
6064     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6065               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6066          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6067               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6068        ) {
6069         if (floatx80_is_signaling_nan(a, status)
6070          || floatx80_is_signaling_nan(b, status)) {
6071             float_raise(float_flag_invalid, status);
6072         }
6073         return 0;
6074     }
6075     return
6076            ( a.low == b.low )
6077         && (    ( a.high == b.high )
6078              || (    ( a.low == 0 )
6079                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6080            );
6081 
6082 }
6083 
6084 /*----------------------------------------------------------------------------
6085 | Returns 1 if the extended double-precision floating-point value `a' is less
6086 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
6087 | do not cause an exception.  Otherwise, the comparison is performed according
6088 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6089 *----------------------------------------------------------------------------*/
6090 
6091 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
6092 {
6093     flag aSign, bSign;
6094 
6095     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6096         float_raise(float_flag_invalid, status);
6097         return 0;
6098     }
6099     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6100               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6101          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6102               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6103        ) {
6104         if (floatx80_is_signaling_nan(a, status)
6105          || floatx80_is_signaling_nan(b, status)) {
6106             float_raise(float_flag_invalid, status);
6107         }
6108         return 0;
6109     }
6110     aSign = extractFloatx80Sign( a );
6111     bSign = extractFloatx80Sign( b );
6112     if ( aSign != bSign ) {
6113         return
6114                aSign
6115             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6116                  == 0 );
6117     }
6118     return
6119           aSign ? le128( b.high, b.low, a.high, a.low )
6120         : le128( a.high, a.low, b.high, b.low );
6121 
6122 }
6123 
6124 /*----------------------------------------------------------------------------
6125 | Returns 1 if the extended double-precision floating-point value `a' is less
6126 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
6127 | an exception.  Otherwise, the comparison is performed according to the
6128 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6129 *----------------------------------------------------------------------------*/
6130 
6131 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
6132 {
6133     flag aSign, bSign;
6134 
6135     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6136         float_raise(float_flag_invalid, status);
6137         return 0;
6138     }
6139     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6140               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6141          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6142               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6143        ) {
6144         if (floatx80_is_signaling_nan(a, status)
6145          || floatx80_is_signaling_nan(b, status)) {
6146             float_raise(float_flag_invalid, status);
6147         }
6148         return 0;
6149     }
6150     aSign = extractFloatx80Sign( a );
6151     bSign = extractFloatx80Sign( b );
6152     if ( aSign != bSign ) {
6153         return
6154                aSign
6155             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6156                  != 0 );
6157     }
6158     return
6159           aSign ? lt128( b.high, b.low, a.high, a.low )
6160         : lt128( a.high, a.low, b.high, b.low );
6161 
6162 }
6163 
6164 /*----------------------------------------------------------------------------
6165 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6166 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
6167 | The comparison is performed according to the IEC/IEEE Standard for Binary
6168 | Floating-Point Arithmetic.
6169 *----------------------------------------------------------------------------*/
6170 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
6171 {
6172     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6173         float_raise(float_flag_invalid, status);
6174         return 1;
6175     }
6176     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6177               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6178          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6179               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6180        ) {
6181         if (floatx80_is_signaling_nan(a, status)
6182          || floatx80_is_signaling_nan(b, status)) {
6183             float_raise(float_flag_invalid, status);
6184         }
6185         return 1;
6186     }
6187     return 0;
6188 }
6189 
6190 /*----------------------------------------------------------------------------
6191 | Returns the result of converting the quadruple-precision floating-point
6192 | value `a' to the 32-bit two's complement integer format.  The conversion
6193 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6194 | Arithmetic---which means in particular that the conversion is rounded
6195 | according to the current rounding mode.  If `a' is a NaN, the largest
6196 | positive integer is returned.  Otherwise, if the conversion overflows, the
6197 | largest integer with the same sign as `a' is returned.
6198 *----------------------------------------------------------------------------*/
6199 
6200 int32_t float128_to_int32(float128 a, float_status *status)
6201 {
6202     flag aSign;
6203     int32_t aExp, shiftCount;
6204     uint64_t aSig0, aSig1;
6205 
6206     aSig1 = extractFloat128Frac1( a );
6207     aSig0 = extractFloat128Frac0( a );
6208     aExp = extractFloat128Exp( a );
6209     aSign = extractFloat128Sign( a );
6210     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6211     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6212     aSig0 |= ( aSig1 != 0 );
6213     shiftCount = 0x4028 - aExp;
6214     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6215     return roundAndPackInt32(aSign, aSig0, status);
6216 
6217 }
6218 
6219 /*----------------------------------------------------------------------------
6220 | Returns the result of converting the quadruple-precision floating-point
6221 | value `a' to the 32-bit two's complement integer format.  The conversion
6222 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6223 | Arithmetic, except that the conversion is always rounded toward zero.  If
6224 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6225 | conversion overflows, the largest integer with the same sign as `a' is
6226 | returned.
6227 *----------------------------------------------------------------------------*/
6228 
6229 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6230 {
6231     flag aSign;
6232     int32_t aExp, shiftCount;
6233     uint64_t aSig0, aSig1, savedASig;
6234     int32_t z;
6235 
6236     aSig1 = extractFloat128Frac1( a );
6237     aSig0 = extractFloat128Frac0( a );
6238     aExp = extractFloat128Exp( a );
6239     aSign = extractFloat128Sign( a );
6240     aSig0 |= ( aSig1 != 0 );
6241     if ( 0x401E < aExp ) {
6242         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6243         goto invalid;
6244     }
6245     else if ( aExp < 0x3FFF ) {
6246         if (aExp || aSig0) {
6247             status->float_exception_flags |= float_flag_inexact;
6248         }
6249         return 0;
6250     }
6251     aSig0 |= LIT64( 0x0001000000000000 );
6252     shiftCount = 0x402F - aExp;
6253     savedASig = aSig0;
6254     aSig0 >>= shiftCount;
6255     z = aSig0;
6256     if ( aSign ) z = - z;
6257     if ( ( z < 0 ) ^ aSign ) {
6258  invalid:
6259         float_raise(float_flag_invalid, status);
6260         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
6261     }
6262     if ( ( aSig0<<shiftCount ) != savedASig ) {
6263         status->float_exception_flags |= float_flag_inexact;
6264     }
6265     return z;
6266 
6267 }
6268 
6269 /*----------------------------------------------------------------------------
6270 | Returns the result of converting the quadruple-precision floating-point
6271 | value `a' to the 64-bit two's complement integer format.  The conversion
6272 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6273 | Arithmetic---which means in particular that the conversion is rounded
6274 | according to the current rounding mode.  If `a' is a NaN, the largest
6275 | positive integer is returned.  Otherwise, if the conversion overflows, the
6276 | largest integer with the same sign as `a' is returned.
6277 *----------------------------------------------------------------------------*/
6278 
6279 int64_t float128_to_int64(float128 a, float_status *status)
6280 {
6281     flag aSign;
6282     int32_t aExp, shiftCount;
6283     uint64_t aSig0, aSig1;
6284 
6285     aSig1 = extractFloat128Frac1( a );
6286     aSig0 = extractFloat128Frac0( a );
6287     aExp = extractFloat128Exp( a );
6288     aSign = extractFloat128Sign( a );
6289     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6290     shiftCount = 0x402F - aExp;
6291     if ( shiftCount <= 0 ) {
6292         if ( 0x403E < aExp ) {
6293             float_raise(float_flag_invalid, status);
6294             if (    ! aSign
6295                  || (    ( aExp == 0x7FFF )
6296                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
6297                     )
6298                ) {
6299                 return LIT64( 0x7FFFFFFFFFFFFFFF );
6300             }
6301             return (int64_t) LIT64( 0x8000000000000000 );
6302         }
6303         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6304     }
6305     else {
6306         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6307     }
6308     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6309 
6310 }
6311 
6312 /*----------------------------------------------------------------------------
6313 | Returns the result of converting the quadruple-precision floating-point
6314 | value `a' to the 64-bit two's complement integer format.  The conversion
6315 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6316 | Arithmetic, except that the conversion is always rounded toward zero.
6317 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6318 | the conversion overflows, the largest integer with the same sign as `a' is
6319 | returned.
6320 *----------------------------------------------------------------------------*/
6321 
6322 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6323 {
6324     flag aSign;
6325     int32_t aExp, shiftCount;
6326     uint64_t aSig0, aSig1;
6327     int64_t z;
6328 
6329     aSig1 = extractFloat128Frac1( a );
6330     aSig0 = extractFloat128Frac0( a );
6331     aExp = extractFloat128Exp( a );
6332     aSign = extractFloat128Sign( a );
6333     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6334     shiftCount = aExp - 0x402F;
6335     if ( 0 < shiftCount ) {
6336         if ( 0x403E <= aExp ) {
6337             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
6338             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
6339                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
6340                 if (aSig1) {
6341                     status->float_exception_flags |= float_flag_inexact;
6342                 }
6343             }
6344             else {
6345                 float_raise(float_flag_invalid, status);
6346                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6347                     return LIT64( 0x7FFFFFFFFFFFFFFF );
6348                 }
6349             }
6350             return (int64_t) LIT64( 0x8000000000000000 );
6351         }
6352         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6353         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6354             status->float_exception_flags |= float_flag_inexact;
6355         }
6356     }
6357     else {
6358         if ( aExp < 0x3FFF ) {
6359             if ( aExp | aSig0 | aSig1 ) {
6360                 status->float_exception_flags |= float_flag_inexact;
6361             }
6362             return 0;
6363         }
6364         z = aSig0>>( - shiftCount );
6365         if (    aSig1
6366              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6367             status->float_exception_flags |= float_flag_inexact;
6368         }
6369     }
6370     if ( aSign ) z = - z;
6371     return z;
6372 
6373 }
6374 
6375 /*----------------------------------------------------------------------------
6376 | Returns the result of converting the quadruple-precision floating-point value
6377 | `a' to the 64-bit unsigned integer format.  The conversion is
6378 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6379 | Arithmetic---which means in particular that the conversion is rounded
6380 | according to the current rounding mode.  If `a' is a NaN, the largest
6381 | positive integer is returned.  If the conversion overflows, the
6382 | largest unsigned integer is returned.  If 'a' is negative, the value is
6383 | rounded and zero is returned; negative values that do not round to zero
6384 | will raise the inexact exception.
6385 *----------------------------------------------------------------------------*/
6386 
6387 uint64_t float128_to_uint64(float128 a, float_status *status)
6388 {
6389     flag aSign;
6390     int aExp;
6391     int shiftCount;
6392     uint64_t aSig0, aSig1;
6393 
6394     aSig0 = extractFloat128Frac0(a);
6395     aSig1 = extractFloat128Frac1(a);
6396     aExp = extractFloat128Exp(a);
6397     aSign = extractFloat128Sign(a);
6398     if (aSign && (aExp > 0x3FFE)) {
6399         float_raise(float_flag_invalid, status);
6400         if (float128_is_any_nan(a)) {
6401             return LIT64(0xFFFFFFFFFFFFFFFF);
6402         } else {
6403             return 0;
6404         }
6405     }
6406     if (aExp) {
6407         aSig0 |= LIT64(0x0001000000000000);
6408     }
6409     shiftCount = 0x402F - aExp;
6410     if (shiftCount <= 0) {
6411         if (0x403E < aExp) {
6412             float_raise(float_flag_invalid, status);
6413             return LIT64(0xFFFFFFFFFFFFFFFF);
6414         }
6415         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6416     } else {
6417         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6418     }
6419     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6420 }
6421 
6422 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6423 {
6424     uint64_t v;
6425     signed char current_rounding_mode = status->float_rounding_mode;
6426 
6427     set_float_rounding_mode(float_round_to_zero, status);
6428     v = float128_to_uint64(a, status);
6429     set_float_rounding_mode(current_rounding_mode, status);
6430 
6431     return v;
6432 }
6433 
6434 /*----------------------------------------------------------------------------
6435 | Returns the result of converting the quadruple-precision floating-point
6436 | value `a' to the 32-bit unsigned integer format.  The conversion
6437 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6438 | Arithmetic except that the conversion is always rounded toward zero.
6439 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6440 | if the conversion overflows, the largest unsigned integer is returned.
6441 | If 'a' is negative, the value is rounded and zero is returned; negative
6442 | values that do not round to zero will raise the inexact exception.
6443 *----------------------------------------------------------------------------*/
6444 
6445 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6446 {
6447     uint64_t v;
6448     uint32_t res;
6449     int old_exc_flags = get_float_exception_flags(status);
6450 
6451     v = float128_to_uint64_round_to_zero(a, status);
6452     if (v > 0xffffffff) {
6453         res = 0xffffffff;
6454     } else {
6455         return v;
6456     }
6457     set_float_exception_flags(old_exc_flags, status);
6458     float_raise(float_flag_invalid, status);
6459     return res;
6460 }
6461 
6462 /*----------------------------------------------------------------------------
6463 | Returns the result of converting the quadruple-precision floating-point
6464 | value `a' to the single-precision floating-point format.  The conversion
6465 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6466 | Arithmetic.
6467 *----------------------------------------------------------------------------*/
6468 
6469 float32 float128_to_float32(float128 a, float_status *status)
6470 {
6471     flag aSign;
6472     int32_t aExp;
6473     uint64_t aSig0, aSig1;
6474     uint32_t zSig;
6475 
6476     aSig1 = extractFloat128Frac1( a );
6477     aSig0 = extractFloat128Frac0( a );
6478     aExp = extractFloat128Exp( a );
6479     aSign = extractFloat128Sign( a );
6480     if ( aExp == 0x7FFF ) {
6481         if ( aSig0 | aSig1 ) {
6482             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6483         }
6484         return packFloat32( aSign, 0xFF, 0 );
6485     }
6486     aSig0 |= ( aSig1 != 0 );
6487     shift64RightJamming( aSig0, 18, &aSig0 );
6488     zSig = aSig0;
6489     if ( aExp || zSig ) {
6490         zSig |= 0x40000000;
6491         aExp -= 0x3F81;
6492     }
6493     return roundAndPackFloat32(aSign, aExp, zSig, status);
6494 
6495 }
6496 
6497 /*----------------------------------------------------------------------------
6498 | Returns the result of converting the quadruple-precision floating-point
6499 | value `a' to the double-precision floating-point format.  The conversion
6500 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6501 | Arithmetic.
6502 *----------------------------------------------------------------------------*/
6503 
6504 float64 float128_to_float64(float128 a, float_status *status)
6505 {
6506     flag aSign;
6507     int32_t aExp;
6508     uint64_t aSig0, aSig1;
6509 
6510     aSig1 = extractFloat128Frac1( a );
6511     aSig0 = extractFloat128Frac0( a );
6512     aExp = extractFloat128Exp( a );
6513     aSign = extractFloat128Sign( a );
6514     if ( aExp == 0x7FFF ) {
6515         if ( aSig0 | aSig1 ) {
6516             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6517         }
6518         return packFloat64( aSign, 0x7FF, 0 );
6519     }
6520     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6521     aSig0 |= ( aSig1 != 0 );
6522     if ( aExp || aSig0 ) {
6523         aSig0 |= LIT64( 0x4000000000000000 );
6524         aExp -= 0x3C01;
6525     }
6526     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6527 
6528 }
6529 
6530 /*----------------------------------------------------------------------------
6531 | Returns the result of converting the quadruple-precision floating-point
6532 | value `a' to the extended double-precision floating-point format.  The
6533 | conversion is performed according to the IEC/IEEE Standard for Binary
6534 | Floating-Point Arithmetic.
6535 *----------------------------------------------------------------------------*/
6536 
6537 floatx80 float128_to_floatx80(float128 a, float_status *status)
6538 {
6539     flag aSign;
6540     int32_t aExp;
6541     uint64_t aSig0, aSig1;
6542 
6543     aSig1 = extractFloat128Frac1( a );
6544     aSig0 = extractFloat128Frac0( a );
6545     aExp = extractFloat128Exp( a );
6546     aSign = extractFloat128Sign( a );
6547     if ( aExp == 0x7FFF ) {
6548         if ( aSig0 | aSig1 ) {
6549             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6550         }
6551         return packFloatx80(aSign, floatx80_infinity_high,
6552                                    floatx80_infinity_low);
6553     }
6554     if ( aExp == 0 ) {
6555         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6556         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6557     }
6558     else {
6559         aSig0 |= LIT64( 0x0001000000000000 );
6560     }
6561     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6562     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6563 
6564 }
6565 
6566 /*----------------------------------------------------------------------------
6567 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6568 | returns the result as a quadruple-precision floating-point value.  The
6569 | operation is performed according to the IEC/IEEE Standard for Binary
6570 | Floating-Point Arithmetic.
6571 *----------------------------------------------------------------------------*/
6572 
6573 float128 float128_round_to_int(float128 a, float_status *status)
6574 {
6575     flag aSign;
6576     int32_t aExp;
6577     uint64_t lastBitMask, roundBitsMask;
6578     float128 z;
6579 
6580     aExp = extractFloat128Exp( a );
6581     if ( 0x402F <= aExp ) {
6582         if ( 0x406F <= aExp ) {
6583             if (    ( aExp == 0x7FFF )
6584                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6585                ) {
6586                 return propagateFloat128NaN(a, a, status);
6587             }
6588             return a;
6589         }
6590         lastBitMask = 1;
6591         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6592         roundBitsMask = lastBitMask - 1;
6593         z = a;
6594         switch (status->float_rounding_mode) {
6595         case float_round_nearest_even:
6596             if ( lastBitMask ) {
6597                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6598                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6599             }
6600             else {
6601                 if ( (int64_t) z.low < 0 ) {
6602                     ++z.high;
6603                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6604                 }
6605             }
6606             break;
6607         case float_round_ties_away:
6608             if (lastBitMask) {
6609                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6610             } else {
6611                 if ((int64_t) z.low < 0) {
6612                     ++z.high;
6613                 }
6614             }
6615             break;
6616         case float_round_to_zero:
6617             break;
6618         case float_round_up:
6619             if (!extractFloat128Sign(z)) {
6620                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6621             }
6622             break;
6623         case float_round_down:
6624             if (extractFloat128Sign(z)) {
6625                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6626             }
6627             break;
6628         default:
6629             abort();
6630         }
6631         z.low &= ~ roundBitsMask;
6632     }
6633     else {
6634         if ( aExp < 0x3FFF ) {
6635             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6636             status->float_exception_flags |= float_flag_inexact;
6637             aSign = extractFloat128Sign( a );
6638             switch (status->float_rounding_mode) {
6639              case float_round_nearest_even:
6640                 if (    ( aExp == 0x3FFE )
6641                      && (   extractFloat128Frac0( a )
6642                           | extractFloat128Frac1( a ) )
6643                    ) {
6644                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6645                 }
6646                 break;
6647             case float_round_ties_away:
6648                 if (aExp == 0x3FFE) {
6649                     return packFloat128(aSign, 0x3FFF, 0, 0);
6650                 }
6651                 break;
6652              case float_round_down:
6653                 return
6654                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6655                     : packFloat128( 0, 0, 0, 0 );
6656              case float_round_up:
6657                 return
6658                       aSign ? packFloat128( 1, 0, 0, 0 )
6659                     : packFloat128( 0, 0x3FFF, 0, 0 );
6660             }
6661             return packFloat128( aSign, 0, 0, 0 );
6662         }
6663         lastBitMask = 1;
6664         lastBitMask <<= 0x402F - aExp;
6665         roundBitsMask = lastBitMask - 1;
6666         z.low = 0;
6667         z.high = a.high;
6668         switch (status->float_rounding_mode) {
6669         case float_round_nearest_even:
6670             z.high += lastBitMask>>1;
6671             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6672                 z.high &= ~ lastBitMask;
6673             }
6674             break;
6675         case float_round_ties_away:
6676             z.high += lastBitMask>>1;
6677             break;
6678         case float_round_to_zero:
6679             break;
6680         case float_round_up:
6681             if (!extractFloat128Sign(z)) {
6682                 z.high |= ( a.low != 0 );
6683                 z.high += roundBitsMask;
6684             }
6685             break;
6686         case float_round_down:
6687             if (extractFloat128Sign(z)) {
6688                 z.high |= (a.low != 0);
6689                 z.high += roundBitsMask;
6690             }
6691             break;
6692         default:
6693             abort();
6694         }
6695         z.high &= ~ roundBitsMask;
6696     }
6697     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6698         status->float_exception_flags |= float_flag_inexact;
6699     }
6700     return z;
6701 
6702 }
6703 
6704 /*----------------------------------------------------------------------------
6705 | Returns the result of adding the absolute values of the quadruple-precision
6706 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6707 | before being returned.  `zSign' is ignored if the result is a NaN.
6708 | The addition is performed according to the IEC/IEEE Standard for Binary
6709 | Floating-Point Arithmetic.
6710 *----------------------------------------------------------------------------*/
6711 
6712 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6713                                 float_status *status)
6714 {
6715     int32_t aExp, bExp, zExp;
6716     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6717     int32_t expDiff;
6718 
6719     aSig1 = extractFloat128Frac1( a );
6720     aSig0 = extractFloat128Frac0( a );
6721     aExp = extractFloat128Exp( a );
6722     bSig1 = extractFloat128Frac1( b );
6723     bSig0 = extractFloat128Frac0( b );
6724     bExp = extractFloat128Exp( b );
6725     expDiff = aExp - bExp;
6726     if ( 0 < expDiff ) {
6727         if ( aExp == 0x7FFF ) {
6728             if (aSig0 | aSig1) {
6729                 return propagateFloat128NaN(a, b, status);
6730             }
6731             return a;
6732         }
6733         if ( bExp == 0 ) {
6734             --expDiff;
6735         }
6736         else {
6737             bSig0 |= LIT64( 0x0001000000000000 );
6738         }
6739         shift128ExtraRightJamming(
6740             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6741         zExp = aExp;
6742     }
6743     else if ( expDiff < 0 ) {
6744         if ( bExp == 0x7FFF ) {
6745             if (bSig0 | bSig1) {
6746                 return propagateFloat128NaN(a, b, status);
6747             }
6748             return packFloat128( zSign, 0x7FFF, 0, 0 );
6749         }
6750         if ( aExp == 0 ) {
6751             ++expDiff;
6752         }
6753         else {
6754             aSig0 |= LIT64( 0x0001000000000000 );
6755         }
6756         shift128ExtraRightJamming(
6757             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6758         zExp = bExp;
6759     }
6760     else {
6761         if ( aExp == 0x7FFF ) {
6762             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6763                 return propagateFloat128NaN(a, b, status);
6764             }
6765             return a;
6766         }
6767         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6768         if ( aExp == 0 ) {
6769             if (status->flush_to_zero) {
6770                 if (zSig0 | zSig1) {
6771                     float_raise(float_flag_output_denormal, status);
6772                 }
6773                 return packFloat128(zSign, 0, 0, 0);
6774             }
6775             return packFloat128( zSign, 0, zSig0, zSig1 );
6776         }
6777         zSig2 = 0;
6778         zSig0 |= LIT64( 0x0002000000000000 );
6779         zExp = aExp;
6780         goto shiftRight1;
6781     }
6782     aSig0 |= LIT64( 0x0001000000000000 );
6783     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6784     --zExp;
6785     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6786     ++zExp;
6787  shiftRight1:
6788     shift128ExtraRightJamming(
6789         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6790  roundAndPack:
6791     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6792 
6793 }
6794 
6795 /*----------------------------------------------------------------------------
6796 | Returns the result of subtracting the absolute values of the quadruple-
6797 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6798 | difference is negated before being returned.  `zSign' is ignored if the
6799 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6800 | Standard for Binary Floating-Point Arithmetic.
6801 *----------------------------------------------------------------------------*/
6802 
6803 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6804                                 float_status *status)
6805 {
6806     int32_t aExp, bExp, zExp;
6807     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6808     int32_t expDiff;
6809 
6810     aSig1 = extractFloat128Frac1( a );
6811     aSig0 = extractFloat128Frac0( a );
6812     aExp = extractFloat128Exp( a );
6813     bSig1 = extractFloat128Frac1( b );
6814     bSig0 = extractFloat128Frac0( b );
6815     bExp = extractFloat128Exp( b );
6816     expDiff = aExp - bExp;
6817     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6818     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6819     if ( 0 < expDiff ) goto aExpBigger;
6820     if ( expDiff < 0 ) goto bExpBigger;
6821     if ( aExp == 0x7FFF ) {
6822         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6823             return propagateFloat128NaN(a, b, status);
6824         }
6825         float_raise(float_flag_invalid, status);
6826         return float128_default_nan(status);
6827     }
6828     if ( aExp == 0 ) {
6829         aExp = 1;
6830         bExp = 1;
6831     }
6832     if ( bSig0 < aSig0 ) goto aBigger;
6833     if ( aSig0 < bSig0 ) goto bBigger;
6834     if ( bSig1 < aSig1 ) goto aBigger;
6835     if ( aSig1 < bSig1 ) goto bBigger;
6836     return packFloat128(status->float_rounding_mode == float_round_down,
6837                         0, 0, 0);
6838  bExpBigger:
6839     if ( bExp == 0x7FFF ) {
6840         if (bSig0 | bSig1) {
6841             return propagateFloat128NaN(a, b, status);
6842         }
6843         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6844     }
6845     if ( aExp == 0 ) {
6846         ++expDiff;
6847     }
6848     else {
6849         aSig0 |= LIT64( 0x4000000000000000 );
6850     }
6851     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6852     bSig0 |= LIT64( 0x4000000000000000 );
6853  bBigger:
6854     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6855     zExp = bExp;
6856     zSign ^= 1;
6857     goto normalizeRoundAndPack;
6858  aExpBigger:
6859     if ( aExp == 0x7FFF ) {
6860         if (aSig0 | aSig1) {
6861             return propagateFloat128NaN(a, b, status);
6862         }
6863         return a;
6864     }
6865     if ( bExp == 0 ) {
6866         --expDiff;
6867     }
6868     else {
6869         bSig0 |= LIT64( 0x4000000000000000 );
6870     }
6871     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6872     aSig0 |= LIT64( 0x4000000000000000 );
6873  aBigger:
6874     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6875     zExp = aExp;
6876  normalizeRoundAndPack:
6877     --zExp;
6878     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6879                                          status);
6880 
6881 }
6882 
6883 /*----------------------------------------------------------------------------
6884 | Returns the result of adding the quadruple-precision floating-point values
6885 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6886 | for Binary Floating-Point Arithmetic.
6887 *----------------------------------------------------------------------------*/
6888 
6889 float128 float128_add(float128 a, float128 b, float_status *status)
6890 {
6891     flag aSign, bSign;
6892 
6893     aSign = extractFloat128Sign( a );
6894     bSign = extractFloat128Sign( b );
6895     if ( aSign == bSign ) {
6896         return addFloat128Sigs(a, b, aSign, status);
6897     }
6898     else {
6899         return subFloat128Sigs(a, b, aSign, status);
6900     }
6901 
6902 }
6903 
6904 /*----------------------------------------------------------------------------
6905 | Returns the result of subtracting the quadruple-precision floating-point
6906 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6907 | Standard for Binary Floating-Point Arithmetic.
6908 *----------------------------------------------------------------------------*/
6909 
6910 float128 float128_sub(float128 a, float128 b, float_status *status)
6911 {
6912     flag aSign, bSign;
6913 
6914     aSign = extractFloat128Sign( a );
6915     bSign = extractFloat128Sign( b );
6916     if ( aSign == bSign ) {
6917         return subFloat128Sigs(a, b, aSign, status);
6918     }
6919     else {
6920         return addFloat128Sigs(a, b, aSign, status);
6921     }
6922 
6923 }
6924 
6925 /*----------------------------------------------------------------------------
6926 | Returns the result of multiplying the quadruple-precision floating-point
6927 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6928 | Standard for Binary Floating-Point Arithmetic.
6929 *----------------------------------------------------------------------------*/
6930 
6931 float128 float128_mul(float128 a, float128 b, float_status *status)
6932 {
6933     flag aSign, bSign, zSign;
6934     int32_t aExp, bExp, zExp;
6935     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6936 
6937     aSig1 = extractFloat128Frac1( a );
6938     aSig0 = extractFloat128Frac0( a );
6939     aExp = extractFloat128Exp( a );
6940     aSign = extractFloat128Sign( a );
6941     bSig1 = extractFloat128Frac1( b );
6942     bSig0 = extractFloat128Frac0( b );
6943     bExp = extractFloat128Exp( b );
6944     bSign = extractFloat128Sign( b );
6945     zSign = aSign ^ bSign;
6946     if ( aExp == 0x7FFF ) {
6947         if (    ( aSig0 | aSig1 )
6948              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6949             return propagateFloat128NaN(a, b, status);
6950         }
6951         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6952         return packFloat128( zSign, 0x7FFF, 0, 0 );
6953     }
6954     if ( bExp == 0x7FFF ) {
6955         if (bSig0 | bSig1) {
6956             return propagateFloat128NaN(a, b, status);
6957         }
6958         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6959  invalid:
6960             float_raise(float_flag_invalid, status);
6961             return float128_default_nan(status);
6962         }
6963         return packFloat128( zSign, 0x7FFF, 0, 0 );
6964     }
6965     if ( aExp == 0 ) {
6966         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6967         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6968     }
6969     if ( bExp == 0 ) {
6970         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6971         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6972     }
6973     zExp = aExp + bExp - 0x4000;
6974     aSig0 |= LIT64( 0x0001000000000000 );
6975     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6976     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6977     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6978     zSig2 |= ( zSig3 != 0 );
6979     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6980         shift128ExtraRightJamming(
6981             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6982         ++zExp;
6983     }
6984     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6985 
6986 }
6987 
6988 /*----------------------------------------------------------------------------
6989 | Returns the result of dividing the quadruple-precision floating-point value
6990 | `a' by the corresponding value `b'.  The operation is performed according to
6991 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6992 *----------------------------------------------------------------------------*/
6993 
6994 float128 float128_div(float128 a, float128 b, float_status *status)
6995 {
6996     flag aSign, bSign, zSign;
6997     int32_t aExp, bExp, zExp;
6998     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6999     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7000 
7001     aSig1 = extractFloat128Frac1( a );
7002     aSig0 = extractFloat128Frac0( a );
7003     aExp = extractFloat128Exp( a );
7004     aSign = extractFloat128Sign( a );
7005     bSig1 = extractFloat128Frac1( b );
7006     bSig0 = extractFloat128Frac0( b );
7007     bExp = extractFloat128Exp( b );
7008     bSign = extractFloat128Sign( b );
7009     zSign = aSign ^ bSign;
7010     if ( aExp == 0x7FFF ) {
7011         if (aSig0 | aSig1) {
7012             return propagateFloat128NaN(a, b, status);
7013         }
7014         if ( bExp == 0x7FFF ) {
7015             if (bSig0 | bSig1) {
7016                 return propagateFloat128NaN(a, b, status);
7017             }
7018             goto invalid;
7019         }
7020         return packFloat128( zSign, 0x7FFF, 0, 0 );
7021     }
7022     if ( bExp == 0x7FFF ) {
7023         if (bSig0 | bSig1) {
7024             return propagateFloat128NaN(a, b, status);
7025         }
7026         return packFloat128( zSign, 0, 0, 0 );
7027     }
7028     if ( bExp == 0 ) {
7029         if ( ( bSig0 | bSig1 ) == 0 ) {
7030             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7031  invalid:
7032                 float_raise(float_flag_invalid, status);
7033                 return float128_default_nan(status);
7034             }
7035             float_raise(float_flag_divbyzero, status);
7036             return packFloat128( zSign, 0x7FFF, 0, 0 );
7037         }
7038         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7039     }
7040     if ( aExp == 0 ) {
7041         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7042         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7043     }
7044     zExp = aExp - bExp + 0x3FFD;
7045     shortShift128Left(
7046         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
7047     shortShift128Left(
7048         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
7049     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
7050         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
7051         ++zExp;
7052     }
7053     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
7054     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
7055     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
7056     while ( (int64_t) rem0 < 0 ) {
7057         --zSig0;
7058         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
7059     }
7060     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
7061     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
7062         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
7063         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
7064         while ( (int64_t) rem1 < 0 ) {
7065             --zSig1;
7066             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
7067         }
7068         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7069     }
7070     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
7071     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7072 
7073 }
7074 
7075 /*----------------------------------------------------------------------------
7076 | Returns the remainder of the quadruple-precision floating-point value `a'
7077 | with respect to the corresponding value `b'.  The operation is performed
7078 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7079 *----------------------------------------------------------------------------*/
7080 
7081 float128 float128_rem(float128 a, float128 b, float_status *status)
7082 {
7083     flag aSign, zSign;
7084     int32_t aExp, bExp, expDiff;
7085     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
7086     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
7087     int64_t sigMean0;
7088 
7089     aSig1 = extractFloat128Frac1( a );
7090     aSig0 = extractFloat128Frac0( a );
7091     aExp = extractFloat128Exp( a );
7092     aSign = extractFloat128Sign( a );
7093     bSig1 = extractFloat128Frac1( b );
7094     bSig0 = extractFloat128Frac0( b );
7095     bExp = extractFloat128Exp( b );
7096     if ( aExp == 0x7FFF ) {
7097         if (    ( aSig0 | aSig1 )
7098              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7099             return propagateFloat128NaN(a, b, status);
7100         }
7101         goto invalid;
7102     }
7103     if ( bExp == 0x7FFF ) {
7104         if (bSig0 | bSig1) {
7105             return propagateFloat128NaN(a, b, status);
7106         }
7107         return a;
7108     }
7109     if ( bExp == 0 ) {
7110         if ( ( bSig0 | bSig1 ) == 0 ) {
7111  invalid:
7112             float_raise(float_flag_invalid, status);
7113             return float128_default_nan(status);
7114         }
7115         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7116     }
7117     if ( aExp == 0 ) {
7118         if ( ( aSig0 | aSig1 ) == 0 ) return a;
7119         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7120     }
7121     expDiff = aExp - bExp;
7122     if ( expDiff < -1 ) return a;
7123     shortShift128Left(
7124         aSig0 | LIT64( 0x0001000000000000 ),
7125         aSig1,
7126         15 - ( expDiff < 0 ),
7127         &aSig0,
7128         &aSig1
7129     );
7130     shortShift128Left(
7131         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
7132     q = le128( bSig0, bSig1, aSig0, aSig1 );
7133     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7134     expDiff -= 64;
7135     while ( 0 < expDiff ) {
7136         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7137         q = ( 4 < q ) ? q - 4 : 0;
7138         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7139         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
7140         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
7141         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
7142         expDiff -= 61;
7143     }
7144     if ( -64 < expDiff ) {
7145         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7146         q = ( 4 < q ) ? q - 4 : 0;
7147         q >>= - expDiff;
7148         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7149         expDiff += 52;
7150         if ( expDiff < 0 ) {
7151             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7152         }
7153         else {
7154             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
7155         }
7156         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7157         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
7158     }
7159     else {
7160         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
7161         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7162     }
7163     do {
7164         alternateASig0 = aSig0;
7165         alternateASig1 = aSig1;
7166         ++q;
7167         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7168     } while ( 0 <= (int64_t) aSig0 );
7169     add128(
7170         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
7171     if (    ( sigMean0 < 0 )
7172          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
7173         aSig0 = alternateASig0;
7174         aSig1 = alternateASig1;
7175     }
7176     zSign = ( (int64_t) aSig0 < 0 );
7177     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
7178     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
7179                                          status);
7180 }
7181 
7182 /*----------------------------------------------------------------------------
7183 | Returns the square root of the quadruple-precision floating-point value `a'.
7184 | The operation is performed according to the IEC/IEEE Standard for Binary
7185 | Floating-Point Arithmetic.
7186 *----------------------------------------------------------------------------*/
7187 
7188 float128 float128_sqrt(float128 a, float_status *status)
7189 {
7190     flag aSign;
7191     int32_t aExp, zExp;
7192     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
7193     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7194 
7195     aSig1 = extractFloat128Frac1( a );
7196     aSig0 = extractFloat128Frac0( a );
7197     aExp = extractFloat128Exp( a );
7198     aSign = extractFloat128Sign( a );
7199     if ( aExp == 0x7FFF ) {
7200         if (aSig0 | aSig1) {
7201             return propagateFloat128NaN(a, a, status);
7202         }
7203         if ( ! aSign ) return a;
7204         goto invalid;
7205     }
7206     if ( aSign ) {
7207         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
7208  invalid:
7209         float_raise(float_flag_invalid, status);
7210         return float128_default_nan(status);
7211     }
7212     if ( aExp == 0 ) {
7213         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7214         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7215     }
7216     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7217     aSig0 |= LIT64( 0x0001000000000000 );
7218     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7219     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7220     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7221     doubleZSig0 = zSig0<<1;
7222     mul64To128( zSig0, zSig0, &term0, &term1 );
7223     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7224     while ( (int64_t) rem0 < 0 ) {
7225         --zSig0;
7226         doubleZSig0 -= 2;
7227         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7228     }
7229     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7230     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7231         if ( zSig1 == 0 ) zSig1 = 1;
7232         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7233         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7234         mul64To128( zSig1, zSig1, &term2, &term3 );
7235         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7236         while ( (int64_t) rem1 < 0 ) {
7237             --zSig1;
7238             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7239             term3 |= 1;
7240             term2 |= doubleZSig0;
7241             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7242         }
7243         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7244     }
7245     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7246     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7247 
7248 }
7249 
7250 /*----------------------------------------------------------------------------
7251 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7252 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7253 | raised if either operand is a NaN.  Otherwise, the comparison is performed
7254 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7255 *----------------------------------------------------------------------------*/
7256 
7257 int float128_eq(float128 a, float128 b, float_status *status)
7258 {
7259 
7260     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7261               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7262          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7263               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7264        ) {
7265         float_raise(float_flag_invalid, status);
7266         return 0;
7267     }
7268     return
7269            ( a.low == b.low )
7270         && (    ( a.high == b.high )
7271              || (    ( a.low == 0 )
7272                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7273            );
7274 
7275 }
7276 
7277 /*----------------------------------------------------------------------------
7278 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7279 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
7280 | exception is raised if either operand is a NaN.  The comparison is performed
7281 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7282 *----------------------------------------------------------------------------*/
7283 
7284 int float128_le(float128 a, float128 b, float_status *status)
7285 {
7286     flag aSign, bSign;
7287 
7288     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7289               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7290          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7291               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7292        ) {
7293         float_raise(float_flag_invalid, status);
7294         return 0;
7295     }
7296     aSign = extractFloat128Sign( a );
7297     bSign = extractFloat128Sign( b );
7298     if ( aSign != bSign ) {
7299         return
7300                aSign
7301             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7302                  == 0 );
7303     }
7304     return
7305           aSign ? le128( b.high, b.low, a.high, a.low )
7306         : le128( a.high, a.low, b.high, b.low );
7307 
7308 }
7309 
7310 /*----------------------------------------------------------------------------
7311 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7312 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7313 | raised if either operand is a NaN.  The comparison is performed according
7314 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7315 *----------------------------------------------------------------------------*/
7316 
7317 int float128_lt(float128 a, float128 b, float_status *status)
7318 {
7319     flag aSign, bSign;
7320 
7321     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7322               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7323          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7324               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7325        ) {
7326         float_raise(float_flag_invalid, status);
7327         return 0;
7328     }
7329     aSign = extractFloat128Sign( a );
7330     bSign = extractFloat128Sign( b );
7331     if ( aSign != bSign ) {
7332         return
7333                aSign
7334             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7335                  != 0 );
7336     }
7337     return
7338           aSign ? lt128( b.high, b.low, a.high, a.low )
7339         : lt128( a.high, a.low, b.high, b.low );
7340 
7341 }
7342 
7343 /*----------------------------------------------------------------------------
7344 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7345 | be compared, and 0 otherwise.  The invalid exception is raised if either
7346 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7347 | Standard for Binary Floating-Point Arithmetic.
7348 *----------------------------------------------------------------------------*/
7349 
7350 int float128_unordered(float128 a, float128 b, float_status *status)
7351 {
7352     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7353               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7354          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7355               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7356        ) {
7357         float_raise(float_flag_invalid, status);
7358         return 1;
7359     }
7360     return 0;
7361 }
7362 
7363 /*----------------------------------------------------------------------------
7364 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7365 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7366 | exception.  The comparison is performed according to the IEC/IEEE Standard
7367 | for Binary Floating-Point Arithmetic.
7368 *----------------------------------------------------------------------------*/
7369 
7370 int float128_eq_quiet(float128 a, float128 b, float_status *status)
7371 {
7372 
7373     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7374               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7375          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7376               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7377        ) {
7378         if (float128_is_signaling_nan(a, status)
7379          || float128_is_signaling_nan(b, status)) {
7380             float_raise(float_flag_invalid, status);
7381         }
7382         return 0;
7383     }
7384     return
7385            ( a.low == b.low )
7386         && (    ( a.high == b.high )
7387              || (    ( a.low == 0 )
7388                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7389            );
7390 
7391 }
7392 
7393 /*----------------------------------------------------------------------------
7394 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7395 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
7396 | cause an exception.  Otherwise, the comparison is performed according to the
7397 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7398 *----------------------------------------------------------------------------*/
7399 
7400 int float128_le_quiet(float128 a, float128 b, float_status *status)
7401 {
7402     flag aSign, bSign;
7403 
7404     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7405               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7406          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7407               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7408        ) {
7409         if (float128_is_signaling_nan(a, status)
7410          || float128_is_signaling_nan(b, status)) {
7411             float_raise(float_flag_invalid, status);
7412         }
7413         return 0;
7414     }
7415     aSign = extractFloat128Sign( a );
7416     bSign = extractFloat128Sign( b );
7417     if ( aSign != bSign ) {
7418         return
7419                aSign
7420             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7421                  == 0 );
7422     }
7423     return
7424           aSign ? le128( b.high, b.low, a.high, a.low )
7425         : le128( a.high, a.low, b.high, b.low );
7426 
7427 }
7428 
7429 /*----------------------------------------------------------------------------
7430 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7431 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7432 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7433 | Standard for Binary Floating-Point Arithmetic.
7434 *----------------------------------------------------------------------------*/
7435 
7436 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7437 {
7438     flag aSign, bSign;
7439 
7440     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7441               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7442          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7443               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7444        ) {
7445         if (float128_is_signaling_nan(a, status)
7446          || float128_is_signaling_nan(b, status)) {
7447             float_raise(float_flag_invalid, status);
7448         }
7449         return 0;
7450     }
7451     aSign = extractFloat128Sign( a );
7452     bSign = extractFloat128Sign( b );
7453     if ( aSign != bSign ) {
7454         return
7455                aSign
7456             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7457                  != 0 );
7458     }
7459     return
7460           aSign ? lt128( b.high, b.low, a.high, a.low )
7461         : lt128( a.high, a.low, b.high, b.low );
7462 
7463 }
7464 
7465 /*----------------------------------------------------------------------------
7466 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7467 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7468 | comparison is performed according to the IEC/IEEE Standard for Binary
7469 | Floating-Point Arithmetic.
7470 *----------------------------------------------------------------------------*/
7471 
7472 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7473 {
7474     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7475               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7476          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7477               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7478        ) {
7479         if (float128_is_signaling_nan(a, status)
7480          || float128_is_signaling_nan(b, status)) {
7481             float_raise(float_flag_invalid, status);
7482         }
7483         return 1;
7484     }
7485     return 0;
7486 }
7487 
7488 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7489                                             int is_quiet, float_status *status)
7490 {
7491     flag aSign, bSign;
7492 
7493     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7494         float_raise(float_flag_invalid, status);
7495         return float_relation_unordered;
7496     }
7497     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7498           ( extractFloatx80Frac( a )<<1 ) ) ||
7499         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7500           ( extractFloatx80Frac( b )<<1 ) )) {
7501         if (!is_quiet ||
7502             floatx80_is_signaling_nan(a, status) ||
7503             floatx80_is_signaling_nan(b, status)) {
7504             float_raise(float_flag_invalid, status);
7505         }
7506         return float_relation_unordered;
7507     }
7508     aSign = extractFloatx80Sign( a );
7509     bSign = extractFloatx80Sign( b );
7510     if ( aSign != bSign ) {
7511 
7512         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7513              ( ( a.low | b.low ) == 0 ) ) {
7514             /* zero case */
7515             return float_relation_equal;
7516         } else {
7517             return 1 - (2 * aSign);
7518         }
7519     } else {
7520         if (a.low == b.low && a.high == b.high) {
7521             return float_relation_equal;
7522         } else {
7523             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7524         }
7525     }
7526 }
7527 
7528 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7529 {
7530     return floatx80_compare_internal(a, b, 0, status);
7531 }
7532 
7533 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7534 {
7535     return floatx80_compare_internal(a, b, 1, status);
7536 }
7537 
7538 static inline int float128_compare_internal(float128 a, float128 b,
7539                                             int is_quiet, float_status *status)
7540 {
7541     flag aSign, bSign;
7542 
7543     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7544           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7545         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7546           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7547         if (!is_quiet ||
7548             float128_is_signaling_nan(a, status) ||
7549             float128_is_signaling_nan(b, status)) {
7550             float_raise(float_flag_invalid, status);
7551         }
7552         return float_relation_unordered;
7553     }
7554     aSign = extractFloat128Sign( a );
7555     bSign = extractFloat128Sign( b );
7556     if ( aSign != bSign ) {
7557         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7558             /* zero case */
7559             return float_relation_equal;
7560         } else {
7561             return 1 - (2 * aSign);
7562         }
7563     } else {
7564         if (a.low == b.low && a.high == b.high) {
7565             return float_relation_equal;
7566         } else {
7567             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7568         }
7569     }
7570 }
7571 
7572 int float128_compare(float128 a, float128 b, float_status *status)
7573 {
7574     return float128_compare_internal(a, b, 0, status);
7575 }
7576 
7577 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7578 {
7579     return float128_compare_internal(a, b, 1, status);
7580 }
7581 
7582 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7583 {
7584     flag aSign;
7585     int32_t aExp;
7586     uint64_t aSig;
7587 
7588     if (floatx80_invalid_encoding(a)) {
7589         float_raise(float_flag_invalid, status);
7590         return floatx80_default_nan(status);
7591     }
7592     aSig = extractFloatx80Frac( a );
7593     aExp = extractFloatx80Exp( a );
7594     aSign = extractFloatx80Sign( a );
7595 
7596     if ( aExp == 0x7FFF ) {
7597         if ( aSig<<1 ) {
7598             return propagateFloatx80NaN(a, a, status);
7599         }
7600         return a;
7601     }
7602 
7603     if (aExp == 0) {
7604         if (aSig == 0) {
7605             return a;
7606         }
7607         aExp++;
7608     }
7609 
7610     if (n > 0x10000) {
7611         n = 0x10000;
7612     } else if (n < -0x10000) {
7613         n = -0x10000;
7614     }
7615 
7616     aExp += n;
7617     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7618                                          aSign, aExp, aSig, 0, status);
7619 }
7620 
7621 float128 float128_scalbn(float128 a, int n, float_status *status)
7622 {
7623     flag aSign;
7624     int32_t aExp;
7625     uint64_t aSig0, aSig1;
7626 
7627     aSig1 = extractFloat128Frac1( a );
7628     aSig0 = extractFloat128Frac0( a );
7629     aExp = extractFloat128Exp( a );
7630     aSign = extractFloat128Sign( a );
7631     if ( aExp == 0x7FFF ) {
7632         if ( aSig0 | aSig1 ) {
7633             return propagateFloat128NaN(a, a, status);
7634         }
7635         return a;
7636     }
7637     if (aExp != 0) {
7638         aSig0 |= LIT64( 0x0001000000000000 );
7639     } else if (aSig0 == 0 && aSig1 == 0) {
7640         return a;
7641     } else {
7642         aExp++;
7643     }
7644 
7645     if (n > 0x10000) {
7646         n = 0x10000;
7647     } else if (n < -0x10000) {
7648         n = -0x10000;
7649     }
7650 
7651     aExp += n - 1;
7652     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7653                                          , status);
7654 
7655 }
7656