xref: /qemu/fpu/softfloat.c (revision 4a6295613f533a6841de5968c50e1ca36748807e)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89 
90 /* We only need stdlib for abort() */
91 
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations.  (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98 
99 /*
100  * Hardfloat
101  *
102  * Fast emulation of guest FP instructions is challenging for two reasons.
103  * First, FP instruction semantics are similar but not identical, particularly
104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
105  * exception flags is not trivial: reading the host's flags register with a
106  * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107  * and trapping on every FP exception is not fast nor pleasant to work with.
108  *
109  * We address these challenges by leveraging the host FPU for a subset of the
110  * operations. To do this we expand on the idea presented in this paper:
111  *
112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114  *
115  * The idea is thus to leverage the host FPU to (1) compute FP operations
116  * and (2) identify whether FP exceptions occurred while avoiding
117  * expensive exception flag register accesses.
118  *
119  * An important optimization shown in the paper is that given that exception
120  * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121  * This is particularly useful for the inexact flag, which is very frequently
122  * raised in floating-point workloads.
123  *
124  * We optimize the code further by deferring to soft-fp whenever FP exception
125  * detection might get hairy. Two examples: (1) when at least one operand is
126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127  * and the result is < the minimum normal.
128  */
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
130     static inline void name(soft_t *a, float_status *s)                 \
131     {                                                                   \
132         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
133             *a = soft_t ## _set_sign(soft_t ## _zero,                   \
134                                      soft_t ## _is_neg(*a));            \
135             s->float_exception_flags |= float_flag_input_denormal;      \
136         }                                                               \
137     }
138 
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
141 #undef GEN_INPUT_FLUSH__NOCHECK
142 
143 #define GEN_INPUT_FLUSH1(name, soft_t)                  \
144     static inline void name(soft_t *a, float_status *s) \
145     {                                                   \
146         if (likely(!s->flush_inputs_to_zero)) {         \
147             return;                                     \
148         }                                               \
149         soft_t ## _input_flush__nocheck(a, s);          \
150     }
151 
152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
154 #undef GEN_INPUT_FLUSH1
155 
156 #define GEN_INPUT_FLUSH2(name, soft_t)                                  \
157     static inline void name(soft_t *a, soft_t *b, float_status *s)      \
158     {                                                                   \
159         if (likely(!s->flush_inputs_to_zero)) {                         \
160             return;                                                     \
161         }                                                               \
162         soft_t ## _input_flush__nocheck(a, s);                          \
163         soft_t ## _input_flush__nocheck(b, s);                          \
164     }
165 
166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
168 #undef GEN_INPUT_FLUSH2
169 
170 #define GEN_INPUT_FLUSH3(name, soft_t)                                  \
171     static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
172     {                                                                   \
173         if (likely(!s->flush_inputs_to_zero)) {                         \
174             return;                                                     \
175         }                                                               \
176         soft_t ## _input_flush__nocheck(a, s);                          \
177         soft_t ## _input_flush__nocheck(b, s);                          \
178         soft_t ## _input_flush__nocheck(c, s);                          \
179     }
180 
181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
183 #undef GEN_INPUT_FLUSH3
184 
/*
 * Choose whether the generated hardfloat functions classify their operands
 * with fpclassify (1) or with the float32/64_* primitives (0). Each
 * combination of number of inputs and float size gets its own value.
 *
 * The float32 variants use the softfloat primitives on every host; the
 * float64 variants use fpclassify only on x86_64 hosts.
 */
#define QEMU_HARDFLOAT_1F32_USE_FP 0
#define QEMU_HARDFLOAT_2F32_USE_FP 0
#define QEMU_HARDFLOAT_3F32_USE_FP 0

#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif
205 
/*
 * QEMU_HARDFLOAT_USE_ISINF selects isinf() (1) or float{32,64}_is_infinity
 * (0) for the infinity test applied to hardfloat results.
 * On x86_64/aarch64, isinf() can yield a ~6% speedup over the softfloat
 * primitive. On power64 however, isinf() reduces fp-bench performance by up
 * to 50%, so every other host keeps the softfloat primitive.
 */
#if !defined(__x86_64__) && !defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF   0
#else
# define QEMU_HARDFLOAT_USE_ISINF   1
#endif
217 
/*
 * Hardfloat relies on the inexact flag being already set, so it cannot be
 * used for targets that clear the FP flags before most FP operations.
 * It is also disabled (with a build warning) under -ffast-math, because
 * hardfloat requires an exact IEEE implementation on the host.
 *
 * QEMU_NO_HARDFLOAT is 1 when hardfloat is disabled and 0 otherwise;
 * QEMU_SOFTFLOAT_ATTR additionally carries noinline only when hardfloat is
 * enabled.
 */
#if defined(__FAST_MATH__)
# warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
#endif

#if !defined(TARGET_PPC) && !defined(__FAST_MATH__)
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#else
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#endif
234 
235 static inline bool can_use_fpu(const float_status *s)
236 {
237     if (QEMU_NO_HARDFLOAT) {
238         return false;
239     }
240     return likely(s->float_exception_flags & float_flag_inexact &&
241                   s->float_rounding_mode == float_round_nearest_even);
242 }
243 
244 /*
245  * Hardfloat generation functions. Each operation can have two flavors:
246  * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247  * most condition checks, or native ones (e.g. fpclassify).
248  *
249  * The flavor is chosen by the callers. Instead of using macros, we rely on the
250  * compiler to propagate constants and inline everything into the callers.
251  *
252  * We only generate functions for operations with two inputs, since only
253  * these are common enough to justify consolidating them into common code.
254  */
255 
256 typedef union {
257     float32 s;
258     float h;
259 } union_float32;
260 
261 typedef union {
262     float64 s;
263     double h;
264 } union_float64;
265 
266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);
268 
269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
271 typedef float   (*hard_f32_op2_fn)(float a, float b);
272 typedef double  (*hard_f64_op2_fn)(double a, double b);
273 
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
278         /*
279          * Not using a temp variable for consecutive fpclassify calls ends up
280          * generating faster code.
281          */
282         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284     }
285     return float32_is_zero_or_normal(a.s) &&
286            float32_is_zero_or_normal(b.s);
287 }
288 
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291     if (QEMU_HARDFLOAT_2F64_USE_FP) {
292         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294     }
295     return float64_is_zero_or_normal(a.s) &&
296            float64_is_zero_or_normal(b.s);
297 }
298 
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303     if (QEMU_HARDFLOAT_3F32_USE_FP) {
304         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307     }
308     return float32_is_zero_or_normal(a.s) &&
309            float32_is_zero_or_normal(b.s) &&
310            float32_is_zero_or_normal(c.s);
311 }
312 
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316     if (QEMU_HARDFLOAT_3F64_USE_FP) {
317         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320     }
321     return float64_is_zero_or_normal(a.s) &&
322            float64_is_zero_or_normal(b.s) &&
323            float64_is_zero_or_normal(c.s);
324 }
325 
326 static inline bool f32_is_inf(union_float32 a)
327 {
328     if (QEMU_HARDFLOAT_USE_ISINF) {
329         return isinf(a.h);
330     }
331     return float32_is_infinity(a.s);
332 }
333 
334 static inline bool f64_is_inf(union_float64 a)
335 {
336     if (QEMU_HARDFLOAT_USE_ISINF) {
337         return isinf(a.h);
338     }
339     return float64_is_infinity(a.s);
340 }
341 
342 /* Note: @fast_test and @post can be NULL */
343 static inline float32
344 float32_gen2(float32 xa, float32 xb, float_status *s,
345              hard_f32_op2_fn hard, soft_f32_op2_fn soft,
346              f32_check_fn pre, f32_check_fn post,
347              f32_check_fn fast_test, soft_f32_op2_fn fast_op)
348 {
349     union_float32 ua, ub, ur;
350 
351     ua.s = xa;
352     ub.s = xb;
353 
354     if (unlikely(!can_use_fpu(s))) {
355         goto soft;
356     }
357 
358     float32_input_flush2(&ua.s, &ub.s, s);
359     if (unlikely(!pre(ua, ub))) {
360         goto soft;
361     }
362     if (fast_test && fast_test(ua, ub)) {
363         return fast_op(ua.s, ub.s, s);
364     }
365 
366     ur.h = hard(ua.h, ub.h);
367     if (unlikely(f32_is_inf(ur))) {
368         s->float_exception_flags |= float_flag_overflow;
369     } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
370         if (post == NULL || post(ua, ub)) {
371             goto soft;
372         }
373     }
374     return ur.s;
375 
376  soft:
377     return soft(ua.s, ub.s, s);
378 }
379 
380 static inline float64
381 float64_gen2(float64 xa, float64 xb, float_status *s,
382              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
383              f64_check_fn pre, f64_check_fn post,
384              f64_check_fn fast_test, soft_f64_op2_fn fast_op)
385 {
386     union_float64 ua, ub, ur;
387 
388     ua.s = xa;
389     ub.s = xb;
390 
391     if (unlikely(!can_use_fpu(s))) {
392         goto soft;
393     }
394 
395     float64_input_flush2(&ua.s, &ub.s, s);
396     if (unlikely(!pre(ua, ub))) {
397         goto soft;
398     }
399     if (fast_test && fast_test(ua, ub)) {
400         return fast_op(ua.s, ub.s, s);
401     }
402 
403     ur.h = hard(ua.h, ub.h);
404     if (unlikely(f64_is_inf(ur))) {
405         s->float_exception_flags |= float_flag_overflow;
406     } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
407         if (post == NULL || post(ua, ub)) {
408             goto soft;
409         }
410     }
411     return ur.s;
412 
413  soft:
414     return soft(ua.s, ub.s, s);
415 }
416 
417 /*----------------------------------------------------------------------------
418 | Returns the fraction bits of the half-precision floating-point value `a'.
419 *----------------------------------------------------------------------------*/
420 
421 static inline uint32_t extractFloat16Frac(float16 a)
422 {
423     return float16_val(a) & 0x3ff;
424 }
425 
426 /*----------------------------------------------------------------------------
427 | Returns the exponent bits of the half-precision floating-point value `a'.
428 *----------------------------------------------------------------------------*/
429 
430 static inline int extractFloat16Exp(float16 a)
431 {
432     return (float16_val(a) >> 10) & 0x1f;
433 }
434 
435 /*----------------------------------------------------------------------------
436 | Returns the fraction bits of the single-precision floating-point value `a'.
437 *----------------------------------------------------------------------------*/
438 
439 static inline uint32_t extractFloat32Frac(float32 a)
440 {
441     return float32_val(a) & 0x007FFFFF;
442 }
443 
444 /*----------------------------------------------------------------------------
445 | Returns the exponent bits of the single-precision floating-point value `a'.
446 *----------------------------------------------------------------------------*/
447 
448 static inline int extractFloat32Exp(float32 a)
449 {
450     return (float32_val(a) >> 23) & 0xFF;
451 }
452 
453 /*----------------------------------------------------------------------------
454 | Returns the sign bit of the single-precision floating-point value `a'.
455 *----------------------------------------------------------------------------*/
456 
457 static inline flag extractFloat32Sign(float32 a)
458 {
459     return float32_val(a) >> 31;
460 }
461 
462 /*----------------------------------------------------------------------------
463 | Returns the fraction bits of the double-precision floating-point value `a'.
464 *----------------------------------------------------------------------------*/
465 
466 static inline uint64_t extractFloat64Frac(float64 a)
467 {
468     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
469 }
470 
471 /*----------------------------------------------------------------------------
472 | Returns the exponent bits of the double-precision floating-point value `a'.
473 *----------------------------------------------------------------------------*/
474 
475 static inline int extractFloat64Exp(float64 a)
476 {
477     return (float64_val(a) >> 52) & 0x7FF;
478 }
479 
480 /*----------------------------------------------------------------------------
481 | Returns the sign bit of the double-precision floating-point value `a'.
482 *----------------------------------------------------------------------------*/
483 
484 static inline flag extractFloat64Sign(float64 a)
485 {
486     return float64_val(a) >> 63;
487 }
488 
/*
 * Classification of a floating point number. The NaN classes are placed
 * last, starting at float_class_qnan, so that cls >= float_class_qnan
 * matches any NaN. The enum is packed to keep it as small as possible.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified = 0,
    float_class_zero         = 1,
    float_class_normal       = 2,
    float_class_inf          = 3,
    float_class_qnan         = 4,  /* all NaNs from here */
    float_class_snan         = 5,
} FloatClass;
502 
503 /* Simple helpers for checking if, or what kind of, NaN we have */
504 static inline __attribute__((unused)) bool is_nan(FloatClass c)
505 {
506     return unlikely(c >= float_class_qnan);
507 }
508 
509 static inline __attribute__((unused)) bool is_snan(FloatClass c)
510 {
511     return c == float_class_snan;
512 }
513 
514 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
515 {
516     return c == float_class_qnan;
517 }
518 
519 /*
520  * Structure holding all of the decomposed parts of a float. The
521  * exponent is unbiased and the fraction is normalized. All
522  * calculations are done with a 64 bit fraction and then rounded as
523  * appropriate for the final format.
524  *
525  * Thanks to the packed FloatClass a decent compiler should be able to
526  * fit the whole structure into registers and avoid using the stack
527  * for parameter passing.
528  */
529 
530 typedef struct {
531     uint64_t frac;
532     int32_t  exp;
533     FloatClass cls;
534     bool sign;
535 } FloatParts;
536 
/*
 * Position of the binary point in a decomposed 64 bit fraction, the bit
 * just above it (implicit leading one), and the next bit up, which
 * signals that a fraction addition has overflowed.
 */
#define DECOMPOSED_BINARY_POINT    62
#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
#define DECOMPOSED_OVERFLOW_BIT    (1ull << (DECOMPOSED_BINARY_POINT + 1))
540 
/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction:
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    /* Field geometry. */
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    /* Rounding helpers derived from frac_size. */
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    /* Optional modifiers. */
    bool arm_althp;
} FloatFmt;
566 
/*
 * Expand to the designated initializers of a FloatFmt for a format with an
 * E-bit exponent field and an F-bit fraction field:
 *   exp_bias        half of the all-ones exponent value (rounded down)
 *   exp_max         the all-ones exponent value
 *   frac_shift      distance from the fraction field to the decomposed
 *                   binary point
 *   frac_lsb        the fraction's least significant bit once shifted
 *   frac_lsbm1      the bit just below frac_lsb
 *   round_mask      all bits below frac_lsb
 *   roundeven_mask  round_mask plus frac_lsb itself
 *
 * Both arguments are parenthesised on every use so that expressions such
 * as FLOAT_PARAMS(8, 20 + 3) expand with the intended precedence.
 * arm_althp is not set here; add it explicitly after the expansion.
 */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size       = (E),                                           \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                         \
    .exp_max        = (1 << (E)) - 1,                                \
    .frac_size      = (F),                                           \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                 \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),       \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1), \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1, \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1
578 
579 static const FloatFmt float16_params = {
580     FLOAT_PARAMS(5, 10)
581 };
582 
583 static const FloatFmt float16_params_ahp = {
584     FLOAT_PARAMS(5, 10),
585     .arm_althp = true
586 };
587 
588 static const FloatFmt float32_params = {
589     FLOAT_PARAMS(8, 23)
590 };
591 
592 static const FloatFmt float64_params = {
593     FLOAT_PARAMS(11, 52)
594 };
595 
596 /* Unpack a float to parts, but do not canonicalize.  */
597 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
598 {
599     const int sign_pos = fmt.frac_size + fmt.exp_size;
600 
601     return (FloatParts) {
602         .cls = float_class_unclassified,
603         .sign = extract64(raw, sign_pos, 1),
604         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
605         .frac = extract64(raw, 0, fmt.frac_size),
606     };
607 }
608 
609 static inline FloatParts float16_unpack_raw(float16 f)
610 {
611     return unpack_raw(float16_params, f);
612 }
613 
614 static inline FloatParts float32_unpack_raw(float32 f)
615 {
616     return unpack_raw(float32_params, f);
617 }
618 
619 static inline FloatParts float64_unpack_raw(float64 f)
620 {
621     return unpack_raw(float64_params, f);
622 }
623 
624 /* Pack a float from parts, but do not canonicalize.  */
625 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
626 {
627     const int sign_pos = fmt.frac_size + fmt.exp_size;
628     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
629     return deposit64(ret, sign_pos, 1, p.sign);
630 }
631 
632 static inline float16 float16_pack_raw(FloatParts p)
633 {
634     return make_float16(pack_raw(float16_params, p));
635 }
636 
637 static inline float32 float32_pack_raw(FloatParts p)
638 {
639     return make_float32(pack_raw(float32_params, p));
640 }
641 
642 static inline float64 float64_pack_raw(FloatParts p)
643 {
644     return make_float64(pack_raw(float64_params, p));
645 }
646 
647 /*----------------------------------------------------------------------------
648 | Functions and definitions to determine:  (1) whether tininess for underflow
649 | is detected before or after rounding by default, (2) what (if anything)
650 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
651 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
652 | are propagated from function inputs to output.  These details are target-
653 | specific.
654 *----------------------------------------------------------------------------*/
655 #include "softfloat-specialize.h"
656 
657 /* Canonicalize EXP and FRAC, setting CLS.  */
658 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
659                                   float_status *status)
660 {
661     if (part.exp == parm->exp_max && !parm->arm_althp) {
662         if (part.frac == 0) {
663             part.cls = float_class_inf;
664         } else {
665             part.frac <<= parm->frac_shift;
666             part.cls = (parts_is_snan_frac(part.frac, status)
667                         ? float_class_snan : float_class_qnan);
668         }
669     } else if (part.exp == 0) {
670         if (likely(part.frac == 0)) {
671             part.cls = float_class_zero;
672         } else if (status->flush_inputs_to_zero) {
673             float_raise(float_flag_input_denormal, status);
674             part.cls = float_class_zero;
675             part.frac = 0;
676         } else {
677             int shift = clz64(part.frac) - 1;
678             part.cls = float_class_normal;
679             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
680             part.frac <<= shift;
681         }
682     } else {
683         part.cls = float_class_normal;
684         part.exp -= parm->exp_bias;
685         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
686     }
687     return part;
688 }
689 
690 /* Round and uncanonicalize a floating-point number by parts. There
691  * are FRAC_SHIFT bits that may require rounding at the bottom of the
692  * fraction; these bits will be removed. The exponent will be biased
693  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
694  */
695 
696 static FloatParts round_canonical(FloatParts p, float_status *s,
697                                   const FloatFmt *parm)
698 {
699     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
700     const uint64_t round_mask = parm->round_mask;
701     const uint64_t roundeven_mask = parm->roundeven_mask;
702     const int exp_max = parm->exp_max;
703     const int frac_shift = parm->frac_shift;
704     uint64_t frac, inc;
705     int exp, flags = 0;
706     bool overflow_norm;
707 
708     frac = p.frac;
709     exp = p.exp;
710 
711     switch (p.cls) {
712     case float_class_normal:
713         switch (s->float_rounding_mode) {
714         case float_round_nearest_even:
715             overflow_norm = false;
716             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
717             break;
718         case float_round_ties_away:
719             overflow_norm = false;
720             inc = frac_lsbm1;
721             break;
722         case float_round_to_zero:
723             overflow_norm = true;
724             inc = 0;
725             break;
726         case float_round_up:
727             inc = p.sign ? 0 : round_mask;
728             overflow_norm = p.sign;
729             break;
730         case float_round_down:
731             inc = p.sign ? round_mask : 0;
732             overflow_norm = !p.sign;
733             break;
734         default:
735             g_assert_not_reached();
736         }
737 
738         exp += parm->exp_bias;
739         if (likely(exp > 0)) {
740             if (frac & round_mask) {
741                 flags |= float_flag_inexact;
742                 frac += inc;
743                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
744                     frac >>= 1;
745                     exp++;
746                 }
747             }
748             frac >>= frac_shift;
749 
750             if (parm->arm_althp) {
751                 /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
752                 if (unlikely(exp > exp_max)) {
753                     /* Overflow.  Return the maximum normal.  */
754                     flags = float_flag_invalid;
755                     exp = exp_max;
756                     frac = -1;
757                 }
758             } else if (unlikely(exp >= exp_max)) {
759                 flags |= float_flag_overflow | float_flag_inexact;
760                 if (overflow_norm) {
761                     exp = exp_max - 1;
762                     frac = -1;
763                 } else {
764                     p.cls = float_class_inf;
765                     goto do_inf;
766                 }
767             }
768         } else if (s->flush_to_zero) {
769             flags |= float_flag_output_denormal;
770             p.cls = float_class_zero;
771             goto do_zero;
772         } else {
773             bool is_tiny = (s->float_detect_tininess
774                             == float_tininess_before_rounding)
775                         || (exp < 0)
776                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
777 
778             shift64RightJamming(frac, 1 - exp, &frac);
779             if (frac & round_mask) {
780                 /* Need to recompute round-to-even.  */
781                 if (s->float_rounding_mode == float_round_nearest_even) {
782                     inc = ((frac & roundeven_mask) != frac_lsbm1
783                            ? frac_lsbm1 : 0);
784                 }
785                 flags |= float_flag_inexact;
786                 frac += inc;
787             }
788 
789             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
790             frac >>= frac_shift;
791 
792             if (is_tiny && (flags & float_flag_inexact)) {
793                 flags |= float_flag_underflow;
794             }
795             if (exp == 0 && frac == 0) {
796                 p.cls = float_class_zero;
797             }
798         }
799         break;
800 
801     case float_class_zero:
802     do_zero:
803         exp = 0;
804         frac = 0;
805         break;
806 
807     case float_class_inf:
808     do_inf:
809         assert(!parm->arm_althp);
810         exp = exp_max;
811         frac = 0;
812         break;
813 
814     case float_class_qnan:
815     case float_class_snan:
816         assert(!parm->arm_althp);
817         exp = exp_max;
818         frac >>= parm->frac_shift;
819         break;
820 
821     default:
822         g_assert_not_reached();
823     }
824 
825     float_raise(flags, s);
826     p.exp = exp;
827     p.frac = frac;
828     return p;
829 }
830 
831 /* Explicit FloatFmt version */
832 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
833                                             const FloatFmt *params)
834 {
835     return sf_canonicalize(float16_unpack_raw(f), params, s);
836 }
837 
838 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
839 {
840     return float16a_unpack_canonical(f, s, &float16_params);
841 }
842 
843 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
844                                              const FloatFmt *params)
845 {
846     return float16_pack_raw(round_canonical(p, s, params));
847 }
848 
849 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
850 {
851     return float16a_round_pack_canonical(p, s, &float16_params);
852 }
853 
854 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
855 {
856     return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
857 }
858 
859 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
860 {
861     return float32_pack_raw(round_canonical(p, s, &float32_params));
862 }
863 
864 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
865 {
866     return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
867 }
868 
869 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
870 {
871     return float64_pack_raw(round_canonical(p, s, &float64_params));
872 }
873 
874 static FloatParts return_nan(FloatParts a, float_status *s)
875 {
876     switch (a.cls) {
877     case float_class_snan:
878         s->float_exception_flags |= float_flag_invalid;
879         a = parts_silence_nan(a, s);
880         /* fall through */
881     case float_class_qnan:
882         if (s->default_nan_mode) {
883             return parts_default_nan(s);
884         }
885         break;
886 
887     default:
888         g_assert_not_reached();
889     }
890     return a;
891 }
892 
893 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
894 {
895     if (is_snan(a.cls) || is_snan(b.cls)) {
896         s->float_exception_flags |= float_flag_invalid;
897     }
898 
899     if (s->default_nan_mode) {
900         return parts_default_nan(s);
901     } else {
902         if (pickNaN(a.cls, b.cls,
903                     a.frac > b.frac ||
904                     (a.frac == b.frac && a.sign < b.sign))) {
905             a = b;
906         }
907         if (is_snan(a.cls)) {
908             return parts_silence_nan(a, s);
909         }
910     }
911     return a;
912 }
913 
914 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
915                                   bool inf_zero, float_status *s)
916 {
917     int which;
918 
919     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
920         s->float_exception_flags |= float_flag_invalid;
921     }
922 
923     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
924 
925     if (s->default_nan_mode) {
926         /* Note that this check is after pickNaNMulAdd so that function
927          * has an opportunity to set the Invalid flag.
928          */
929         which = 3;
930     }
931 
932     switch (which) {
933     case 0:
934         break;
935     case 1:
936         a = b;
937         break;
938     case 2:
939         a = c;
940         break;
941     case 3:
942         return parts_default_nan(s);
943     default:
944         g_assert_not_reached();
945     }
946 
947     if (is_snan(a.cls)) {
948         return parts_silence_nan(a, s);
949     }
950     return a;
951 }
952 
953 /*
954  * Returns the result of adding or subtracting the values of the
955  * floating-point values `a' and `b'. The operation is performed
956  * according to the IEC/IEEE Standard for Binary Floating-Point
957  * Arithmetic.
958  */
959 
960 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
961                                 float_status *s)
962 {
963     bool a_sign = a.sign;
964     bool b_sign = b.sign ^ subtract;
965 
966     if (a_sign != b_sign) {
967         /* Subtraction */
968 
969         if (a.cls == float_class_normal && b.cls == float_class_normal) {
970             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
971                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
972                 a.frac = a.frac - b.frac;
973             } else {
974                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
975                 a.frac = b.frac - a.frac;
976                 a.exp = b.exp;
977                 a_sign ^= 1;
978             }
979 
980             if (a.frac == 0) {
981                 a.cls = float_class_zero;
982                 a.sign = s->float_rounding_mode == float_round_down;
983             } else {
984                 int shift = clz64(a.frac) - 1;
985                 a.frac = a.frac << shift;
986                 a.exp = a.exp - shift;
987                 a.sign = a_sign;
988             }
989             return a;
990         }
991         if (is_nan(a.cls) || is_nan(b.cls)) {
992             return pick_nan(a, b, s);
993         }
994         if (a.cls == float_class_inf) {
995             if (b.cls == float_class_inf) {
996                 float_raise(float_flag_invalid, s);
997                 return parts_default_nan(s);
998             }
999             return a;
1000         }
1001         if (a.cls == float_class_zero && b.cls == float_class_zero) {
1002             a.sign = s->float_rounding_mode == float_round_down;
1003             return a;
1004         }
1005         if (a.cls == float_class_zero || b.cls == float_class_inf) {
1006             b.sign = a_sign ^ 1;
1007             return b;
1008         }
1009         if (b.cls == float_class_zero) {
1010             return a;
1011         }
1012     } else {
1013         /* Addition */
1014         if (a.cls == float_class_normal && b.cls == float_class_normal) {
1015             if (a.exp > b.exp) {
1016                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1017             } else if (a.exp < b.exp) {
1018                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1019                 a.exp = b.exp;
1020             }
1021             a.frac += b.frac;
1022             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1023                 shift64RightJamming(a.frac, 1, &a.frac);
1024                 a.exp += 1;
1025             }
1026             return a;
1027         }
1028         if (is_nan(a.cls) || is_nan(b.cls)) {
1029             return pick_nan(a, b, s);
1030         }
1031         if (a.cls == float_class_inf || b.cls == float_class_zero) {
1032             return a;
1033         }
1034         if (b.cls == float_class_inf || a.cls == float_class_zero) {
1035             b.sign = b_sign;
1036             return b;
1037         }
1038     }
1039     g_assert_not_reached();
1040 }
1041 
1042 /*
1043  * Returns the result of adding or subtracting the floating-point
1044  * values `a' and `b'. The operation is performed according to the
1045  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1046  */
1047 
1048 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1049 {
1050     FloatParts pa = float16_unpack_canonical(a, status);
1051     FloatParts pb = float16_unpack_canonical(b, status);
1052     FloatParts pr = addsub_floats(pa, pb, false, status);
1053 
1054     return float16_round_pack_canonical(pr, status);
1055 }
1056 
1057 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1058 {
1059     FloatParts pa = float16_unpack_canonical(a, status);
1060     FloatParts pb = float16_unpack_canonical(b, status);
1061     FloatParts pr = addsub_floats(pa, pb, true, status);
1062 
1063     return float16_round_pack_canonical(pr, status);
1064 }
1065 
1066 static float32 QEMU_SOFTFLOAT_ATTR
1067 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1068 {
1069     FloatParts pa = float32_unpack_canonical(a, status);
1070     FloatParts pb = float32_unpack_canonical(b, status);
1071     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1072 
1073     return float32_round_pack_canonical(pr, status);
1074 }
1075 
1076 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1077 {
1078     return soft_f32_addsub(a, b, false, status);
1079 }
1080 
1081 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1082 {
1083     return soft_f32_addsub(a, b, true, status);
1084 }
1085 
1086 static float64 QEMU_SOFTFLOAT_ATTR
1087 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1088 {
1089     FloatParts pa = float64_unpack_canonical(a, status);
1090     FloatParts pb = float64_unpack_canonical(b, status);
1091     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1092 
1093     return float64_round_pack_canonical(pr, status);
1094 }
1095 
1096 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1097 {
1098     return soft_f64_addsub(a, b, false, status);
1099 }
1100 
1101 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1102 {
1103     return soft_f64_addsub(a, b, true, status);
1104 }
1105 
/* Host single-precision addition: returns a + b. */
static float hard_f32_add(float a, float b)
{
    const float sum = a + b;

    return sum;
}
1110 
/* Host single-precision subtraction: returns a - b. */
static float hard_f32_sub(float a, float b)
{
    const float diff = a - b;

    return diff;
}
1115 
/* Host double-precision addition: returns a + b. */
static double hard_f64_add(double a, double b)
{
    const double sum = a + b;

    return sum;
}
1120 
/* Host double-precision subtraction: returns a - b. */
static double hard_f64_sub(double a, double b)
{
    const double diff = a - b;

    return diff;
}
1125 
1126 static bool f32_addsub_post(union_float32 a, union_float32 b)
1127 {
1128     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1129         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1130     }
1131     return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1132 }
1133 
1134 static bool f64_addsub_post(union_float64 a, union_float64 b)
1135 {
1136     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1137         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1138     } else {
1139         return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1140     }
1141 }
1142 
1143 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1144                               hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1145 {
1146     return float32_gen2(a, b, s, hard, soft,
1147                         f32_is_zon2, f32_addsub_post, NULL, NULL);
1148 }
1149 
1150 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1151                               hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1152 {
1153     return float64_gen2(a, b, s, hard, soft,
1154                         f64_is_zon2, f64_addsub_post, NULL, NULL);
1155 }
1156 
1157 float32 QEMU_FLATTEN
1158 float32_add(float32 a, float32 b, float_status *s)
1159 {
1160     return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1161 }
1162 
1163 float32 QEMU_FLATTEN
1164 float32_sub(float32 a, float32 b, float_status *s)
1165 {
1166     return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1167 }
1168 
1169 float64 QEMU_FLATTEN
1170 float64_add(float64 a, float64 b, float_status *s)
1171 {
1172     return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1173 }
1174 
1175 float64 QEMU_FLATTEN
1176 float64_sub(float64 a, float64 b, float_status *s)
1177 {
1178     return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1179 }
1180 
1181 /*
1182  * Returns the result of multiplying the floating-point values `a' and
1183  * `b'. The operation is performed according to the IEC/IEEE Standard
1184  * for Binary Floating-Point Arithmetic.
1185  */
1186 
1187 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
1188 {
1189     bool sign = a.sign ^ b.sign;
1190 
1191     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1192         uint64_t hi, lo;
1193         int exp = a.exp + b.exp;
1194 
1195         mul64To128(a.frac, b.frac, &hi, &lo);
1196         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1197         if (lo & DECOMPOSED_OVERFLOW_BIT) {
1198             shift64RightJamming(lo, 1, &lo);
1199             exp += 1;
1200         }
1201 
1202         /* Re-use a */
1203         a.exp = exp;
1204         a.sign = sign;
1205         a.frac = lo;
1206         return a;
1207     }
1208     /* handle all the NaN cases */
1209     if (is_nan(a.cls) || is_nan(b.cls)) {
1210         return pick_nan(a, b, s);
1211     }
1212     /* Inf * Zero == NaN */
1213     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1214         (a.cls == float_class_zero && b.cls == float_class_inf)) {
1215         s->float_exception_flags |= float_flag_invalid;
1216         return parts_default_nan(s);
1217     }
1218     /* Multiply by 0 or Inf */
1219     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1220         a.sign = sign;
1221         return a;
1222     }
1223     if (b.cls == float_class_inf || b.cls == float_class_zero) {
1224         b.sign = sign;
1225         return b;
1226     }
1227     g_assert_not_reached();
1228 }
1229 
1230 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1231 {
1232     FloatParts pa = float16_unpack_canonical(a, status);
1233     FloatParts pb = float16_unpack_canonical(b, status);
1234     FloatParts pr = mul_floats(pa, pb, status);
1235 
1236     return float16_round_pack_canonical(pr, status);
1237 }
1238 
1239 static float32 QEMU_SOFTFLOAT_ATTR
1240 soft_f32_mul(float32 a, float32 b, float_status *status)
1241 {
1242     FloatParts pa = float32_unpack_canonical(a, status);
1243     FloatParts pb = float32_unpack_canonical(b, status);
1244     FloatParts pr = mul_floats(pa, pb, status);
1245 
1246     return float32_round_pack_canonical(pr, status);
1247 }
1248 
1249 static float64 QEMU_SOFTFLOAT_ATTR
1250 soft_f64_mul(float64 a, float64 b, float_status *status)
1251 {
1252     FloatParts pa = float64_unpack_canonical(a, status);
1253     FloatParts pb = float64_unpack_canonical(b, status);
1254     FloatParts pr = mul_floats(pa, pb, status);
1255 
1256     return float64_round_pack_canonical(pr, status);
1257 }
1258 
/* Host single-precision multiplication: returns a * b. */
static float hard_f32_mul(float a, float b)
{
    const float product = a * b;

    return product;
}
1263 
/* Host double-precision multiplication: returns a * b. */
static double hard_f64_mul(double a, double b)
{
    const double product = a * b;

    return product;
}
1268 
1269 static bool f32_mul_fast_test(union_float32 a, union_float32 b)
1270 {
1271     return float32_is_zero(a.s) || float32_is_zero(b.s);
1272 }
1273 
1274 static bool f64_mul_fast_test(union_float64 a, union_float64 b)
1275 {
1276     return float64_is_zero(a.s) || float64_is_zero(b.s);
1277 }
1278 
1279 static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s)
1280 {
1281     bool signbit = float32_is_neg(a) ^ float32_is_neg(b);
1282 
1283     return float32_set_sign(float32_zero, signbit);
1284 }
1285 
1286 static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s)
1287 {
1288     bool signbit = float64_is_neg(a) ^ float64_is_neg(b);
1289 
1290     return float64_set_sign(float64_zero, signbit);
1291 }
1292 
1293 float32 QEMU_FLATTEN
1294 float32_mul(float32 a, float32 b, float_status *s)
1295 {
1296     return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1297                         f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op);
1298 }
1299 
1300 float64 QEMU_FLATTEN
1301 float64_mul(float64 a, float64 b, float_status *s)
1302 {
1303     return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1304                         f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op);
1305 }
1306 
1307 /*
1308  * Returns the result of multiplying the floating-point values `a' and
1309  * `b' then adding 'c', with no intermediate rounding step after the
1310  * multiplication. The operation is performed according to the
1311  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1312  * The flags argument allows the caller to select negation of the
1313  * addend, the intermediate product, or the final result. (The
1314  * difference between this and having the caller do a separate
1315  * negation is that negating externally will flip the sign bit on
1316  * NaNs.)
1317  */
1318 
1319 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
1320                                 int flags, float_status *s)
1321 {
1322     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
1323                     ((1 << float_class_inf) | (1 << float_class_zero));
1324     bool p_sign;
1325     bool sign_flip = flags & float_muladd_negate_result;
1326     FloatClass p_class;
1327     uint64_t hi, lo;
1328     int p_exp;
1329 
1330     /* It is implementation-defined whether the cases of (0,inf,qnan)
1331      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1332      * they return if they do), so we have to hand this information
1333      * off to the target-specific pick-a-NaN routine.
1334      */
1335     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
1336         return pick_nan_muladd(a, b, c, inf_zero, s);
1337     }
1338 
1339     if (inf_zero) {
1340         s->float_exception_flags |= float_flag_invalid;
1341         return parts_default_nan(s);
1342     }
1343 
1344     if (flags & float_muladd_negate_c) {
1345         c.sign ^= 1;
1346     }
1347 
1348     p_sign = a.sign ^ b.sign;
1349 
1350     if (flags & float_muladd_negate_product) {
1351         p_sign ^= 1;
1352     }
1353 
1354     if (a.cls == float_class_inf || b.cls == float_class_inf) {
1355         p_class = float_class_inf;
1356     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
1357         p_class = float_class_zero;
1358     } else {
1359         p_class = float_class_normal;
1360     }
1361 
1362     if (c.cls == float_class_inf) {
1363         if (p_class == float_class_inf && p_sign != c.sign) {
1364             s->float_exception_flags |= float_flag_invalid;
1365             return parts_default_nan(s);
1366         } else {
1367             a.cls = float_class_inf;
1368             a.sign = c.sign ^ sign_flip;
1369             return a;
1370         }
1371     }
1372 
1373     if (p_class == float_class_inf) {
1374         a.cls = float_class_inf;
1375         a.sign = p_sign ^ sign_flip;
1376         return a;
1377     }
1378 
1379     if (p_class == float_class_zero) {
1380         if (c.cls == float_class_zero) {
1381             if (p_sign != c.sign) {
1382                 p_sign = s->float_rounding_mode == float_round_down;
1383             }
1384             c.sign = p_sign;
1385         } else if (flags & float_muladd_halve_result) {
1386             c.exp -= 1;
1387         }
1388         c.sign ^= sign_flip;
1389         return c;
1390     }
1391 
1392     /* a & b should be normals now... */
1393     assert(a.cls == float_class_normal &&
1394            b.cls == float_class_normal);
1395 
1396     p_exp = a.exp + b.exp;
1397 
1398     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
1399      * result.
1400      */
1401     mul64To128(a.frac, b.frac, &hi, &lo);
1402     /* binary point now at bit 124 */
1403 
1404     /* check for overflow */
1405     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
1406         shift128RightJamming(hi, lo, 1, &hi, &lo);
1407         p_exp += 1;
1408     }
1409 
1410     /* + add/sub */
1411     if (c.cls == float_class_zero) {
1412         /* move binary point back to 62 */
1413         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1414     } else {
1415         int exp_diff = p_exp - c.exp;
1416         if (p_sign == c.sign) {
1417             /* Addition */
1418             if (exp_diff <= 0) {
1419                 shift128RightJamming(hi, lo,
1420                                      DECOMPOSED_BINARY_POINT - exp_diff,
1421                                      &hi, &lo);
1422                 lo += c.frac;
1423                 p_exp = c.exp;
1424             } else {
1425                 uint64_t c_hi, c_lo;
1426                 /* shift c to the same binary point as the product (124) */
1427                 c_hi = c.frac >> 2;
1428                 c_lo = 0;
1429                 shift128RightJamming(c_hi, c_lo,
1430                                      exp_diff,
1431                                      &c_hi, &c_lo);
1432                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
1433                 /* move binary point back to 62 */
1434                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1435             }
1436 
1437             if (lo & DECOMPOSED_OVERFLOW_BIT) {
1438                 shift64RightJamming(lo, 1, &lo);
1439                 p_exp += 1;
1440             }
1441 
1442         } else {
1443             /* Subtraction */
1444             uint64_t c_hi, c_lo;
1445             /* make C binary point match product at bit 124 */
1446             c_hi = c.frac >> 2;
1447             c_lo = 0;
1448 
1449             if (exp_diff <= 0) {
1450                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1451                 if (exp_diff == 0
1452                     &&
1453                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1454                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1455                 } else {
1456                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1457                     p_sign ^= 1;
1458                     p_exp = c.exp;
1459                 }
1460             } else {
1461                 shift128RightJamming(c_hi, c_lo,
1462                                      exp_diff,
1463                                      &c_hi, &c_lo);
1464                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1465             }
1466 
1467             if (hi == 0 && lo == 0) {
1468                 a.cls = float_class_zero;
1469                 a.sign = s->float_rounding_mode == float_round_down;
1470                 a.sign ^= sign_flip;
1471                 return a;
1472             } else {
1473                 int shift;
1474                 if (hi != 0) {
1475                     shift = clz64(hi);
1476                 } else {
1477                     shift = clz64(lo) + 64;
1478                 }
1479                 /* Normalizing to a binary point of 124 is the
1480                    correct adjust for the exponent.  However since we're
1481                    shifting, we might as well put the binary point back
1482                    at 62 where we really want it.  Therefore shift as
1483                    if we're leaving 1 bit at the top of the word, but
1484                    adjust the exponent as if we're leaving 3 bits.  */
1485                 shift -= 1;
1486                 if (shift >= 64) {
1487                     lo = lo << (shift - 64);
1488                 } else {
1489                     hi = (hi << shift) | (lo >> (64 - shift));
1490                     lo = hi | ((lo << shift) != 0);
1491                 }
1492                 p_exp -= shift - 2;
1493             }
1494         }
1495     }
1496 
1497     if (flags & float_muladd_halve_result) {
1498         p_exp -= 1;
1499     }
1500 
1501     /* finally prepare our result */
1502     a.cls = float_class_normal;
1503     a.sign = p_sign ^ sign_flip;
1504     a.exp = p_exp;
1505     a.frac = lo;
1506 
1507     return a;
1508 }
1509 
1510 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1511                                                 int flags, float_status *status)
1512 {
1513     FloatParts pa = float16_unpack_canonical(a, status);
1514     FloatParts pb = float16_unpack_canonical(b, status);
1515     FloatParts pc = float16_unpack_canonical(c, status);
1516     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1517 
1518     return float16_round_pack_canonical(pr, status);
1519 }
1520 
1521 float32 QEMU_FLATTEN float32_muladd(float32 a, float32 b, float32 c,
1522                                                 int flags, float_status *status)
1523 {
1524     FloatParts pa = float32_unpack_canonical(a, status);
1525     FloatParts pb = float32_unpack_canonical(b, status);
1526     FloatParts pc = float32_unpack_canonical(c, status);
1527     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1528 
1529     return float32_round_pack_canonical(pr, status);
1530 }
1531 
1532 float64 QEMU_FLATTEN float64_muladd(float64 a, float64 b, float64 c,
1533                                                 int flags, float_status *status)
1534 {
1535     FloatParts pa = float64_unpack_canonical(a, status);
1536     FloatParts pb = float64_unpack_canonical(b, status);
1537     FloatParts pc = float64_unpack_canonical(c, status);
1538     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1539 
1540     return float64_round_pack_canonical(pr, status);
1541 }
1542 
1543 /*
1544  * Returns the result of dividing the floating-point value `a' by the
1545  * corresponding value `b'. The operation is performed according to
1546  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1547  */
1548 
1549 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1550 {
1551     bool sign = a.sign ^ b.sign;
1552 
1553     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1554         uint64_t n0, n1, q, r;
1555         int exp = a.exp - b.exp;
1556 
1557         /*
1558          * We want a 2*N / N-bit division to produce exactly an N-bit
1559          * result, so that we do not lose any precision and so that we
1560          * do not have to renormalize afterward.  If A.frac < B.frac,
1561          * then division would produce an (N-1)-bit result; shift A left
1562          * by one to produce the an N-bit result, and decrement the
1563          * exponent to match.
1564          *
1565          * The udiv_qrnnd algorithm that we're using requires normalization,
1566          * i.e. the msb of the denominator must be set.  Since we know that
1567          * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1568          * by one (more), and the remainder must be shifted right by one.
1569          */
1570         if (a.frac < b.frac) {
1571             exp -= 1;
1572             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
1573         } else {
1574             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1575         }
1576         q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1577 
1578         /*
1579          * Set lsb if there is a remainder, to set inexact.
1580          * As mentioned above, to find the actual value of the remainder we
1581          * would need to shift right, but (1) we are only concerned about
1582          * non-zero-ness, and (2) the remainder will always be even because
1583          * both inputs to the division primitive are even.
1584          */
1585         a.frac = q | (r != 0);
1586         a.sign = sign;
1587         a.exp = exp;
1588         return a;
1589     }
1590     /* handle all the NaN cases */
1591     if (is_nan(a.cls) || is_nan(b.cls)) {
1592         return pick_nan(a, b, s);
1593     }
1594     /* 0/0 or Inf/Inf */
1595     if (a.cls == b.cls
1596         &&
1597         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1598         s->float_exception_flags |= float_flag_invalid;
1599         return parts_default_nan(s);
1600     }
1601     /* Inf / x or 0 / x */
1602     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1603         a.sign = sign;
1604         return a;
1605     }
1606     /* Div 0 => Inf */
1607     if (b.cls == float_class_zero) {
1608         s->float_exception_flags |= float_flag_divbyzero;
1609         a.cls = float_class_inf;
1610         a.sign = sign;
1611         return a;
1612     }
1613     /* Div by Inf */
1614     if (b.cls == float_class_inf) {
1615         a.cls = float_class_zero;
1616         a.sign = sign;
1617         return a;
1618     }
1619     g_assert_not_reached();
1620 }
1621 
1622 float16 float16_div(float16 a, float16 b, float_status *status)
1623 {
1624     FloatParts pa = float16_unpack_canonical(a, status);
1625     FloatParts pb = float16_unpack_canonical(b, status);
1626     FloatParts pr = div_floats(pa, pb, status);
1627 
1628     return float16_round_pack_canonical(pr, status);
1629 }
1630 
1631 static float32 QEMU_SOFTFLOAT_ATTR
1632 soft_f32_div(float32 a, float32 b, float_status *status)
1633 {
1634     FloatParts pa = float32_unpack_canonical(a, status);
1635     FloatParts pb = float32_unpack_canonical(b, status);
1636     FloatParts pr = div_floats(pa, pb, status);
1637 
1638     return float32_round_pack_canonical(pr, status);
1639 }
1640 
1641 static float64 QEMU_SOFTFLOAT_ATTR
1642 soft_f64_div(float64 a, float64 b, float_status *status)
1643 {
1644     FloatParts pa = float64_unpack_canonical(a, status);
1645     FloatParts pb = float64_unpack_canonical(b, status);
1646     FloatParts pr = div_floats(pa, pb, status);
1647 
1648     return float64_round_pack_canonical(pr, status);
1649 }
1650 
/* Host single-precision division: returns a / b. */
static float hard_f32_div(float a, float b)
{
    const float quotient = a / b;

    return quotient;
}
1655 
/* Host double-precision division: returns a / b. */
static double hard_f64_div(double a, double b)
{
    const double quotient = a / b;

    return quotient;
}
1660 
1661 static bool f32_div_pre(union_float32 a, union_float32 b)
1662 {
1663     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1664         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1665                fpclassify(b.h) == FP_NORMAL;
1666     }
1667     return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1668 }
1669 
1670 static bool f64_div_pre(union_float64 a, union_float64 b)
1671 {
1672     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1673         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1674                fpclassify(b.h) == FP_NORMAL;
1675     }
1676     return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1677 }
1678 
1679 static bool f32_div_post(union_float32 a, union_float32 b)
1680 {
1681     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1682         return fpclassify(a.h) != FP_ZERO;
1683     }
1684     return !float32_is_zero(a.s);
1685 }
1686 
1687 static bool f64_div_post(union_float64 a, union_float64 b)
1688 {
1689     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1690         return fpclassify(a.h) != FP_ZERO;
1691     }
1692     return !float64_is_zero(a.s);
1693 }
1694 
1695 float32 QEMU_FLATTEN
1696 float32_div(float32 a, float32 b, float_status *s)
1697 {
1698     return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1699                         f32_div_pre, f32_div_post, NULL, NULL);
1700 }
1701 
1702 float64 QEMU_FLATTEN
1703 float64_div(float64 a, float64 b, float_status *s)
1704 {
1705     return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1706                         f64_div_pre, f64_div_post, NULL, NULL);
1707 }
1708 
1709 /*
1710  * Float to Float conversions
1711  *
1712  * Returns the result of converting one float format to another. The
1713  * conversion is performed according to the IEC/IEEE Standard for
1714  * Binary Floating-Point Arithmetic.
1715  *
1716  * The float_to_float helper only needs to take care of raising
1717  * invalid exceptions and handling the conversion on NaNs.
1718  */
1719 
1720 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1721                                  float_status *s)
1722 {
1723     if (dstf->arm_althp) {
1724         switch (a.cls) {
1725         case float_class_qnan:
1726         case float_class_snan:
1727             /* There is no NaN in the destination format.  Raise Invalid
1728              * and return a zero with the sign of the input NaN.
1729              */
1730             s->float_exception_flags |= float_flag_invalid;
1731             a.cls = float_class_zero;
1732             a.frac = 0;
1733             a.exp = 0;
1734             break;
1735 
1736         case float_class_inf:
1737             /* There is no Inf in the destination format.  Raise Invalid
1738              * and return the maximum normal with the correct sign.
1739              */
1740             s->float_exception_flags |= float_flag_invalid;
1741             a.cls = float_class_normal;
1742             a.exp = dstf->exp_max;
1743             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1744             break;
1745 
1746         default:
1747             break;
1748         }
1749     } else if (is_nan(a.cls)) {
1750         if (is_snan(a.cls)) {
1751             s->float_exception_flags |= float_flag_invalid;
1752             a = parts_silence_nan(a, s);
1753         }
1754         if (s->default_nan_mode) {
1755             return parts_default_nan(s);
1756         }
1757     }
1758     return a;
1759 }
1760 
1761 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1762 {
1763     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1764     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1765     FloatParts pr = float_to_float(p, &float32_params, s);
1766     return float32_round_pack_canonical(pr, s);
1767 }
1768 
1769 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1770 {
1771     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1772     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1773     FloatParts pr = float_to_float(p, &float64_params, s);
1774     return float64_round_pack_canonical(pr, s);
1775 }
1776 
1777 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1778 {
1779     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1780     FloatParts p = float32_unpack_canonical(a, s);
1781     FloatParts pr = float_to_float(p, fmt16, s);
1782     return float16a_round_pack_canonical(pr, s, fmt16);
1783 }
1784 
1785 float64 float32_to_float64(float32 a, float_status *s)
1786 {
1787     FloatParts p = float32_unpack_canonical(a, s);
1788     FloatParts pr = float_to_float(p, &float64_params, s);
1789     return float64_round_pack_canonical(pr, s);
1790 }
1791 
1792 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1793 {
1794     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1795     FloatParts p = float64_unpack_canonical(a, s);
1796     FloatParts pr = float_to_float(p, fmt16, s);
1797     return float16a_round_pack_canonical(pr, s, fmt16);
1798 }
1799 
1800 float32 float64_to_float32(float64 a, float_status *s)
1801 {
1802     FloatParts p = float64_unpack_canonical(a, s);
1803     FloatParts pr = float_to_float(p, &float32_params, s);
1804     return float32_round_pack_canonical(pr, s);
1805 }
1806 
1807 /*
1808  * Rounds the floating-point value `a' to an integer, and returns the
1809  * result as a floating-point value. The operation is performed
1810  * according to the IEC/IEEE Standard for Binary Floating-Point
1811  * Arithmetic.
1812  */
1813 
1814 static FloatParts round_to_int(FloatParts a, int rmode,
1815                                int scale, float_status *s)
1816 {
1817     switch (a.cls) {
1818     case float_class_qnan:
1819     case float_class_snan:
1820         return return_nan(a, s);
1821 
1822     case float_class_zero:
1823     case float_class_inf:
1824         /* already "integral" */
1825         break;
1826 
1827     case float_class_normal:
1828         scale = MIN(MAX(scale, -0x10000), 0x10000);
1829         a.exp += scale;
1830 
1831         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1832             /* already integral */
1833             break;
1834         }
1835         if (a.exp < 0) {
1836             bool one;
1837             /* all fractional */
1838             s->float_exception_flags |= float_flag_inexact;
1839             switch (rmode) {
1840             case float_round_nearest_even:
1841                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1842                 break;
1843             case float_round_ties_away:
1844                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1845                 break;
1846             case float_round_to_zero:
1847                 one = false;
1848                 break;
1849             case float_round_up:
1850                 one = !a.sign;
1851                 break;
1852             case float_round_down:
1853                 one = a.sign;
1854                 break;
1855             default:
1856                 g_assert_not_reached();
1857             }
1858 
1859             if (one) {
1860                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1861                 a.exp = 0;
1862             } else {
1863                 a.cls = float_class_zero;
1864             }
1865         } else {
1866             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1867             uint64_t frac_lsbm1 = frac_lsb >> 1;
1868             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1869             uint64_t rnd_mask = rnd_even_mask >> 1;
1870             uint64_t inc;
1871 
1872             switch (rmode) {
1873             case float_round_nearest_even:
1874                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1875                 break;
1876             case float_round_ties_away:
1877                 inc = frac_lsbm1;
1878                 break;
1879             case float_round_to_zero:
1880                 inc = 0;
1881                 break;
1882             case float_round_up:
1883                 inc = a.sign ? 0 : rnd_mask;
1884                 break;
1885             case float_round_down:
1886                 inc = a.sign ? rnd_mask : 0;
1887                 break;
1888             default:
1889                 g_assert_not_reached();
1890             }
1891 
1892             if (a.frac & rnd_mask) {
1893                 s->float_exception_flags |= float_flag_inexact;
1894                 a.frac += inc;
1895                 a.frac &= ~rnd_mask;
1896                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1897                     a.frac >>= 1;
1898                     a.exp++;
1899                 }
1900             }
1901         }
1902         break;
1903     default:
1904         g_assert_not_reached();
1905     }
1906     return a;
1907 }
1908 
1909 float16 float16_round_to_int(float16 a, float_status *s)
1910 {
1911     FloatParts pa = float16_unpack_canonical(a, s);
1912     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1913     return float16_round_pack_canonical(pr, s);
1914 }
1915 
1916 float32 float32_round_to_int(float32 a, float_status *s)
1917 {
1918     FloatParts pa = float32_unpack_canonical(a, s);
1919     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1920     return float32_round_pack_canonical(pr, s);
1921 }
1922 
1923 float64 float64_round_to_int(float64 a, float_status *s)
1924 {
1925     FloatParts pa = float64_unpack_canonical(a, s);
1926     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1927     return float64_round_pack_canonical(pr, s);
1928 }
1929 
1930 /*
1931  * Returns the result of converting the floating-point value `a' to
1932  * the two's complement integer format. The conversion is performed
1933  * according to the IEC/IEEE Standard for Binary Floating-Point
1934  * Arithmetic---which means in particular that the conversion is
1935  * rounded according to the current rounding mode. If `a' is a NaN,
1936  * the largest positive integer is returned. Otherwise, if the
1937  * conversion overflows, the largest integer with the same sign as `a'
1938  * is returned.
1939 */
1940 
1941 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
1942                                      int64_t min, int64_t max,
1943                                      float_status *s)
1944 {
1945     uint64_t r;
1946     int orig_flags = get_float_exception_flags(s);
1947     FloatParts p = round_to_int(in, rmode, scale, s);
1948 
1949     switch (p.cls) {
1950     case float_class_snan:
1951     case float_class_qnan:
1952         s->float_exception_flags = orig_flags | float_flag_invalid;
1953         return max;
1954     case float_class_inf:
1955         s->float_exception_flags = orig_flags | float_flag_invalid;
1956         return p.sign ? min : max;
1957     case float_class_zero:
1958         return 0;
1959     case float_class_normal:
1960         if (p.exp < DECOMPOSED_BINARY_POINT) {
1961             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1962         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1963             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1964         } else {
1965             r = UINT64_MAX;
1966         }
1967         if (p.sign) {
1968             if (r <= -(uint64_t) min) {
1969                 return -r;
1970             } else {
1971                 s->float_exception_flags = orig_flags | float_flag_invalid;
1972                 return min;
1973             }
1974         } else {
1975             if (r <= max) {
1976                 return r;
1977             } else {
1978                 s->float_exception_flags = orig_flags | float_flag_invalid;
1979                 return max;
1980             }
1981         }
1982     default:
1983         g_assert_not_reached();
1984     }
1985 }
1986 
1987 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
1988                                 float_status *s)
1989 {
1990     return round_to_int_and_pack(float16_unpack_canonical(a, s),
1991                                  rmode, scale, INT16_MIN, INT16_MAX, s);
1992 }
1993 
1994 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
1995                                 float_status *s)
1996 {
1997     return round_to_int_and_pack(float16_unpack_canonical(a, s),
1998                                  rmode, scale, INT32_MIN, INT32_MAX, s);
1999 }
2000 
2001 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
2002                                 float_status *s)
2003 {
2004     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2005                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2006 }
2007 
2008 int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale,
2009                                 float_status *s)
2010 {
2011     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2012                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2013 }
2014 
2015 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale,
2016                                 float_status *s)
2017 {
2018     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2019                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2020 }
2021 
2022 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale,
2023                                 float_status *s)
2024 {
2025     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2026                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2027 }
2028 
2029 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale,
2030                                 float_status *s)
2031 {
2032     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2033                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2034 }
2035 
2036 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale,
2037                                 float_status *s)
2038 {
2039     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2040                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2041 }
2042 
2043 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale,
2044                                 float_status *s)
2045 {
2046     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2047                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2048 }
2049 
2050 int16_t float16_to_int16(float16 a, float_status *s)
2051 {
2052     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2053 }
2054 
2055 int32_t float16_to_int32(float16 a, float_status *s)
2056 {
2057     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2058 }
2059 
2060 int64_t float16_to_int64(float16 a, float_status *s)
2061 {
2062     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2063 }
2064 
2065 int16_t float32_to_int16(float32 a, float_status *s)
2066 {
2067     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2068 }
2069 
2070 int32_t float32_to_int32(float32 a, float_status *s)
2071 {
2072     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2073 }
2074 
2075 int64_t float32_to_int64(float32 a, float_status *s)
2076 {
2077     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2078 }
2079 
2080 int16_t float64_to_int16(float64 a, float_status *s)
2081 {
2082     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2083 }
2084 
2085 int32_t float64_to_int32(float64 a, float_status *s)
2086 {
2087     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2088 }
2089 
2090 int64_t float64_to_int64(float64 a, float_status *s)
2091 {
2092     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2093 }
2094 
2095 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2096 {
2097     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2098 }
2099 
2100 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2101 {
2102     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2103 }
2104 
2105 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2106 {
2107     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2108 }
2109 
2110 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2111 {
2112     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2113 }
2114 
2115 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2116 {
2117     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2118 }
2119 
2120 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2121 {
2122     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2123 }
2124 
2125 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2126 {
2127     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2128 }
2129 
2130 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2131 {
2132     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2133 }
2134 
2135 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2136 {
2137     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2138 }
2139 
2140 /*
2141  *  Returns the result of converting the floating-point value `a' to
2142  *  the unsigned integer format. The conversion is performed according
2143  *  to the IEC/IEEE Standard for Binary Floating-Point
2144  *  Arithmetic---which means in particular that the conversion is
2145  *  rounded according to the current rounding mode. If `a' is a NaN,
2146  *  the largest unsigned integer is returned. Otherwise, if the
2147  *  conversion overflows, the largest unsigned integer is returned. If
2148  *  `a' is negative, the result is rounded and zero is returned;
2149  *  values that do not round to zero will raise the invalid exception
2150  *  flag.
2151  */
2152 
2153 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale,
2154                                        uint64_t max, float_status *s)
2155 {
2156     int orig_flags = get_float_exception_flags(s);
2157     FloatParts p = round_to_int(in, rmode, scale, s);
2158     uint64_t r;
2159 
2160     switch (p.cls) {
2161     case float_class_snan:
2162     case float_class_qnan:
2163         s->float_exception_flags = orig_flags | float_flag_invalid;
2164         return max;
2165     case float_class_inf:
2166         s->float_exception_flags = orig_flags | float_flag_invalid;
2167         return p.sign ? 0 : max;
2168     case float_class_zero:
2169         return 0;
2170     case float_class_normal:
2171         if (p.sign) {
2172             s->float_exception_flags = orig_flags | float_flag_invalid;
2173             return 0;
2174         }
2175 
2176         if (p.exp < DECOMPOSED_BINARY_POINT) {
2177             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2178         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2179             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2180         } else {
2181             s->float_exception_flags = orig_flags | float_flag_invalid;
2182             return max;
2183         }
2184 
2185         /* For uint64 this will never trip, but if p.exp is too large
2186          * to shift a decomposed fraction we shall have exited via the
2187          * 3rd leg above.
2188          */
2189         if (r > max) {
2190             s->float_exception_flags = orig_flags | float_flag_invalid;
2191             return max;
2192         }
2193         return r;
2194     default:
2195         g_assert_not_reached();
2196     }
2197 }
2198 
2199 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale,
2200                                   float_status *s)
2201 {
2202     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2203                                   rmode, scale, UINT16_MAX, s);
2204 }
2205 
2206 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale,
2207                                   float_status *s)
2208 {
2209     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2210                                   rmode, scale, UINT32_MAX, s);
2211 }
2212 
2213 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale,
2214                                   float_status *s)
2215 {
2216     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2217                                   rmode, scale, UINT64_MAX, s);
2218 }
2219 
2220 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale,
2221                                   float_status *s)
2222 {
2223     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2224                                   rmode, scale, UINT16_MAX, s);
2225 }
2226 
2227 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale,
2228                                   float_status *s)
2229 {
2230     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2231                                   rmode, scale, UINT32_MAX, s);
2232 }
2233 
2234 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale,
2235                                   float_status *s)
2236 {
2237     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2238                                   rmode, scale, UINT64_MAX, s);
2239 }
2240 
2241 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale,
2242                                   float_status *s)
2243 {
2244     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2245                                   rmode, scale, UINT16_MAX, s);
2246 }
2247 
2248 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale,
2249                                   float_status *s)
2250 {
2251     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2252                                   rmode, scale, UINT32_MAX, s);
2253 }
2254 
2255 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale,
2256                                   float_status *s)
2257 {
2258     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2259                                   rmode, scale, UINT64_MAX, s);
2260 }
2261 
2262 uint16_t float16_to_uint16(float16 a, float_status *s)
2263 {
2264     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2265 }
2266 
2267 uint32_t float16_to_uint32(float16 a, float_status *s)
2268 {
2269     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2270 }
2271 
2272 uint64_t float16_to_uint64(float16 a, float_status *s)
2273 {
2274     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2275 }
2276 
2277 uint16_t float32_to_uint16(float32 a, float_status *s)
2278 {
2279     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2280 }
2281 
2282 uint32_t float32_to_uint32(float32 a, float_status *s)
2283 {
2284     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2285 }
2286 
2287 uint64_t float32_to_uint64(float32 a, float_status *s)
2288 {
2289     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2290 }
2291 
2292 uint16_t float64_to_uint16(float64 a, float_status *s)
2293 {
2294     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2295 }
2296 
2297 uint32_t float64_to_uint32(float64 a, float_status *s)
2298 {
2299     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2300 }
2301 
2302 uint64_t float64_to_uint64(float64 a, float_status *s)
2303 {
2304     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2305 }
2306 
2307 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2308 {
2309     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2310 }
2311 
2312 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2313 {
2314     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2315 }
2316 
2317 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2318 {
2319     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2320 }
2321 
2322 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2323 {
2324     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2325 }
2326 
2327 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2328 {
2329     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2330 }
2331 
2332 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2333 {
2334     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2335 }
2336 
2337 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2338 {
2339     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2340 }
2341 
2342 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2343 {
2344     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2345 }
2346 
2347 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2348 {
2349     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2350 }
2351 
2352 /*
2353  * Integer to float conversions
2354  *
2355  * Returns the result of converting the two's complement integer `a'
2356  * to the floating-point format. The conversion is performed according
2357  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2358  */
2359 
2360 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
2361 {
2362     FloatParts r = { .sign = false };
2363 
2364     if (a == 0) {
2365         r.cls = float_class_zero;
2366     } else {
2367         uint64_t f = a;
2368         int shift;
2369 
2370         r.cls = float_class_normal;
2371         if (a < 0) {
2372             f = -f;
2373             r.sign = true;
2374         }
2375         shift = clz64(f) - 1;
2376         scale = MIN(MAX(scale, -0x10000), 0x10000);
2377 
2378         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2379         r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
2380     }
2381 
2382     return r;
2383 }
2384 
2385 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2386 {
2387     FloatParts pa = int_to_float(a, scale, status);
2388     return float16_round_pack_canonical(pa, status);
2389 }
2390 
2391 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2392 {
2393     return int64_to_float16_scalbn(a, scale, status);
2394 }
2395 
2396 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2397 {
2398     return int64_to_float16_scalbn(a, scale, status);
2399 }
2400 
2401 float16 int64_to_float16(int64_t a, float_status *status)
2402 {
2403     return int64_to_float16_scalbn(a, 0, status);
2404 }
2405 
2406 float16 int32_to_float16(int32_t a, float_status *status)
2407 {
2408     return int64_to_float16_scalbn(a, 0, status);
2409 }
2410 
2411 float16 int16_to_float16(int16_t a, float_status *status)
2412 {
2413     return int64_to_float16_scalbn(a, 0, status);
2414 }
2415 
2416 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2417 {
2418     FloatParts pa = int_to_float(a, scale, status);
2419     return float32_round_pack_canonical(pa, status);
2420 }
2421 
2422 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2423 {
2424     return int64_to_float32_scalbn(a, scale, status);
2425 }
2426 
2427 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2428 {
2429     return int64_to_float32_scalbn(a, scale, status);
2430 }
2431 
2432 float32 int64_to_float32(int64_t a, float_status *status)
2433 {
2434     return int64_to_float32_scalbn(a, 0, status);
2435 }
2436 
2437 float32 int32_to_float32(int32_t a, float_status *status)
2438 {
2439     return int64_to_float32_scalbn(a, 0, status);
2440 }
2441 
2442 float32 int16_to_float32(int16_t a, float_status *status)
2443 {
2444     return int64_to_float32_scalbn(a, 0, status);
2445 }
2446 
2447 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2448 {
2449     FloatParts pa = int_to_float(a, scale, status);
2450     return float64_round_pack_canonical(pa, status);
2451 }
2452 
2453 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2454 {
2455     return int64_to_float64_scalbn(a, scale, status);
2456 }
2457 
2458 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2459 {
2460     return int64_to_float64_scalbn(a, scale, status);
2461 }
2462 
2463 float64 int64_to_float64(int64_t a, float_status *status)
2464 {
2465     return int64_to_float64_scalbn(a, 0, status);
2466 }
2467 
2468 float64 int32_to_float64(int32_t a, float_status *status)
2469 {
2470     return int64_to_float64_scalbn(a, 0, status);
2471 }
2472 
2473 float64 int16_to_float64(int16_t a, float_status *status)
2474 {
2475     return int64_to_float64_scalbn(a, 0, status);
2476 }
2477 
2478 
2479 /*
2480  * Unsigned Integer to float conversions
2481  *
2482  * Returns the result of converting the unsigned integer `a' to the
2483  * floating-point format. The conversion is performed according to the
2484  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2485  */
2486 
2487 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
2488 {
2489     FloatParts r = { .sign = false };
2490 
2491     if (a == 0) {
2492         r.cls = float_class_zero;
2493     } else {
2494         scale = MIN(MAX(scale, -0x10000), 0x10000);
2495         r.cls = float_class_normal;
2496         if ((int64_t)a < 0) {
2497             r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
2498             shift64RightJamming(a, 1, &a);
2499             r.frac = a;
2500         } else {
2501             int shift = clz64(a) - 1;
2502             r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2503             r.frac = a << shift;
2504         }
2505     }
2506 
2507     return r;
2508 }
2509 
2510 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2511 {
2512     FloatParts pa = uint_to_float(a, scale, status);
2513     return float16_round_pack_canonical(pa, status);
2514 }
2515 
2516 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2517 {
2518     return uint64_to_float16_scalbn(a, scale, status);
2519 }
2520 
2521 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2522 {
2523     return uint64_to_float16_scalbn(a, scale, status);
2524 }
2525 
2526 float16 uint64_to_float16(uint64_t a, float_status *status)
2527 {
2528     return uint64_to_float16_scalbn(a, 0, status);
2529 }
2530 
2531 float16 uint32_to_float16(uint32_t a, float_status *status)
2532 {
2533     return uint64_to_float16_scalbn(a, 0, status);
2534 }
2535 
2536 float16 uint16_to_float16(uint16_t a, float_status *status)
2537 {
2538     return uint64_to_float16_scalbn(a, 0, status);
2539 }
2540 
2541 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2542 {
2543     FloatParts pa = uint_to_float(a, scale, status);
2544     return float32_round_pack_canonical(pa, status);
2545 }
2546 
2547 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2548 {
2549     return uint64_to_float32_scalbn(a, scale, status);
2550 }
2551 
2552 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2553 {
2554     return uint64_to_float32_scalbn(a, scale, status);
2555 }
2556 
2557 float32 uint64_to_float32(uint64_t a, float_status *status)
2558 {
2559     return uint64_to_float32_scalbn(a, 0, status);
2560 }
2561 
2562 float32 uint32_to_float32(uint32_t a, float_status *status)
2563 {
2564     return uint64_to_float32_scalbn(a, 0, status);
2565 }
2566 
2567 float32 uint16_to_float32(uint16_t a, float_status *status)
2568 {
2569     return uint64_to_float32_scalbn(a, 0, status);
2570 }
2571 
2572 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2573 {
2574     FloatParts pa = uint_to_float(a, scale, status);
2575     return float64_round_pack_canonical(pa, status);
2576 }
2577 
2578 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2579 {
2580     return uint64_to_float64_scalbn(a, scale, status);
2581 }
2582 
2583 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2584 {
2585     return uint64_to_float64_scalbn(a, scale, status);
2586 }
2587 
2588 float64 uint64_to_float64(uint64_t a, float_status *status)
2589 {
2590     return uint64_to_float64_scalbn(a, 0, status);
2591 }
2592 
2593 float64 uint32_to_float64(uint32_t a, float_status *status)
2594 {
2595     return uint64_to_float64_scalbn(a, 0, status);
2596 }
2597 
2598 float64 uint16_to_float64(uint16_t a, float_status *status)
2599 {
2600     return uint64_to_float64_scalbn(a, 0, status);
2601 }
2602 
2603 /* Float Min/Max */
2604 /* min() and max() functions. These can't be implemented as
2605  * 'compare and pick one input' because that would mishandle
2606  * NaNs and +0 vs -0.
2607  *
2608  * minnum() and maxnum() functions. These are similar to the min()
2609  * and max() functions but if one of the arguments is a QNaN and
2610  * the other is numerical then the numerical argument is returned.
2611  * SNaNs will get quietened before being returned.
2612  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2613  * and maxNum() operations. min() and max() are the typical min/max
2614  * semantics provided by many CPUs which predate that specification.
2615  *
2616  * minnummag() and maxnummag() functions correspond to minNumMag()
2617  * and maxNumMag() from the IEEE-754 2008.
2618  */
2619 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2620                                 bool ieee, bool ismag, float_status *s)
2621 {
2622     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2623         if (ieee) {
2624             /* Takes two floating-point values `a' and `b', one of
2625              * which is a NaN, and returns the appropriate NaN
2626              * result. If either `a' or `b' is a signaling NaN,
2627              * the invalid exception is raised.
2628              */
2629             if (is_snan(a.cls) || is_snan(b.cls)) {
2630                 return pick_nan(a, b, s);
2631             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2632                 return b;
2633             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2634                 return a;
2635             }
2636         }
2637         return pick_nan(a, b, s);
2638     } else {
2639         int a_exp, b_exp;
2640 
2641         switch (a.cls) {
2642         case float_class_normal:
2643             a_exp = a.exp;
2644             break;
2645         case float_class_inf:
2646             a_exp = INT_MAX;
2647             break;
2648         case float_class_zero:
2649             a_exp = INT_MIN;
2650             break;
2651         default:
2652             g_assert_not_reached();
2653             break;
2654         }
2655         switch (b.cls) {
2656         case float_class_normal:
2657             b_exp = b.exp;
2658             break;
2659         case float_class_inf:
2660             b_exp = INT_MAX;
2661             break;
2662         case float_class_zero:
2663             b_exp = INT_MIN;
2664             break;
2665         default:
2666             g_assert_not_reached();
2667             break;
2668         }
2669 
2670         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2671             bool a_less = a_exp < b_exp;
2672             if (a_exp == b_exp) {
2673                 a_less = a.frac < b.frac;
2674             }
2675             return a_less ^ ismin ? b : a;
2676         }
2677 
2678         if (a.sign == b.sign) {
2679             bool a_less = a_exp < b_exp;
2680             if (a_exp == b_exp) {
2681                 a_less = a.frac < b.frac;
2682             }
2683             return a.sign ^ a_less ^ ismin ? b : a;
2684         } else {
2685             return a.sign ^ ismin ? b : a;
2686         }
2687     }
2688 }
2689 
2690 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
2691 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
2692                                      float_status *s)                   \
2693 {                                                                       \
2694     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2695     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2696     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
2697                                                                         \
2698     return float ## sz ## _round_pack_canonical(pr, s);                 \
2699 }
2700 
2701 MINMAX(16, min, true, false, false)
2702 MINMAX(16, minnum, true, true, false)
2703 MINMAX(16, minnummag, true, true, true)
2704 MINMAX(16, max, false, false, false)
2705 MINMAX(16, maxnum, false, true, false)
2706 MINMAX(16, maxnummag, false, true, true)
2707 
2708 MINMAX(32, min, true, false, false)
2709 MINMAX(32, minnum, true, true, false)
2710 MINMAX(32, minnummag, true, true, true)
2711 MINMAX(32, max, false, false, false)
2712 MINMAX(32, maxnum, false, true, false)
2713 MINMAX(32, maxnummag, false, true, true)
2714 
2715 MINMAX(64, min, true, false, false)
2716 MINMAX(64, minnum, true, true, false)
2717 MINMAX(64, minnummag, true, true, true)
2718 MINMAX(64, max, false, false, false)
2719 MINMAX(64, maxnum, false, true, false)
2720 MINMAX(64, maxnummag, false, true, true)
2721 
2722 #undef MINMAX
2723 
2724 /* Floating point compare */
2725 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2726                           float_status *s)
2727 {
2728     if (is_nan(a.cls) || is_nan(b.cls)) {
2729         if (!is_quiet ||
2730             a.cls == float_class_snan ||
2731             b.cls == float_class_snan) {
2732             s->float_exception_flags |= float_flag_invalid;
2733         }
2734         return float_relation_unordered;
2735     }
2736 
2737     if (a.cls == float_class_zero) {
2738         if (b.cls == float_class_zero) {
2739             return float_relation_equal;
2740         }
2741         return b.sign ? float_relation_greater : float_relation_less;
2742     } else if (b.cls == float_class_zero) {
2743         return a.sign ? float_relation_less : float_relation_greater;
2744     }
2745 
2746     /* The only really important thing about infinity is its sign. If
2747      * both are infinities the sign marks the smallest of the two.
2748      */
2749     if (a.cls == float_class_inf) {
2750         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2751             return float_relation_equal;
2752         }
2753         return a.sign ? float_relation_less : float_relation_greater;
2754     } else if (b.cls == float_class_inf) {
2755         return b.sign ? float_relation_greater : float_relation_less;
2756     }
2757 
2758     if (a.sign != b.sign) {
2759         return a.sign ? float_relation_less : float_relation_greater;
2760     }
2761 
2762     if (a.exp == b.exp) {
2763         if (a.frac == b.frac) {
2764             return float_relation_equal;
2765         }
2766         if (a.sign) {
2767             return a.frac > b.frac ?
2768                 float_relation_less : float_relation_greater;
2769         } else {
2770             return a.frac > b.frac ?
2771                 float_relation_greater : float_relation_less;
2772         }
2773     } else {
2774         if (a.sign) {
2775             return a.exp > b.exp ? float_relation_less : float_relation_greater;
2776         } else {
2777             return a.exp > b.exp ? float_relation_greater : float_relation_less;
2778         }
2779     }
2780 }
2781 
2782 #define COMPARE(sz)                                                     \
2783 int float ## sz ## _compare(float ## sz a, float ## sz b,               \
2784                             float_status *s)                            \
2785 {                                                                       \
2786     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2787     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2788     return compare_floats(pa, pb, false, s);                            \
2789 }                                                                       \
2790 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b,         \
2791                                   float_status *s)                      \
2792 {                                                                       \
2793     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2794     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2795     return compare_floats(pa, pb, true, s);                             \
2796 }
2797 
2798 COMPARE(16)
2799 COMPARE(32)
2800 COMPARE(64)
2801 
2802 #undef COMPARE
2803 
2804 /* Multiply A by 2 raised to the power N.  */
2805 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
2806 {
2807     if (unlikely(is_nan(a.cls))) {
2808         return return_nan(a, s);
2809     }
2810     if (a.cls == float_class_normal) {
2811         /* The largest float type (even though not supported by FloatParts)
2812          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
2813          * still allows rounding to infinity, without allowing overflow
2814          * within the int32_t that backs FloatParts.exp.
2815          */
2816         n = MIN(MAX(n, -0x10000), 0x10000);
2817         a.exp += n;
2818     }
2819     return a;
2820 }
2821 
2822 float16 float16_scalbn(float16 a, int n, float_status *status)
2823 {
2824     FloatParts pa = float16_unpack_canonical(a, status);
2825     FloatParts pr = scalbn_decomposed(pa, n, status);
2826     return float16_round_pack_canonical(pr, status);
2827 }
2828 
2829 float32 float32_scalbn(float32 a, int n, float_status *status)
2830 {
2831     FloatParts pa = float32_unpack_canonical(a, status);
2832     FloatParts pr = scalbn_decomposed(pa, n, status);
2833     return float32_round_pack_canonical(pr, status);
2834 }
2835 
2836 float64 float64_scalbn(float64 a, int n, float_status *status)
2837 {
2838     FloatParts pa = float64_unpack_canonical(a, status);
2839     FloatParts pr = scalbn_decomposed(pa, n, status);
2840     return float64_round_pack_canonical(pr, status);
2841 }
2842 
2843 /*
2844  * Square Root
2845  *
2846  * The old softfloat code did an approximation step before zeroing in
 * on the final result. However for simplicity we just compute the
2848  * square root by iterating down from the implicit bit to enough extra
2849  * bits to ensure we get a correctly rounded result.
2850  *
2851  * This does mean however the calculation is slower than before,
2852  * especially for 64 bit floats.
2853  */
2854 
2855 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
2856 {
2857     uint64_t a_frac, r_frac, s_frac;
2858     int bit, last_bit;
2859 
2860     if (is_nan(a.cls)) {
2861         return return_nan(a, s);
2862     }
2863     if (a.cls == float_class_zero) {
2864         return a;  /* sqrt(+-0) = +-0 */
2865     }
2866     if (a.sign) {
2867         s->float_exception_flags |= float_flag_invalid;
2868         return parts_default_nan(s);
2869     }
2870     if (a.cls == float_class_inf) {
2871         return a;  /* sqrt(+inf) = +inf */
2872     }
2873 
2874     assert(a.cls == float_class_normal);
2875 
2876     /* We need two overflow bits at the top. Adding room for that is a
2877      * right shift. If the exponent is odd, we can discard the low bit
2878      * by multiplying the fraction by 2; that's a left shift. Combine
2879      * those and we shift right if the exponent is even.
2880      */
2881     a_frac = a.frac;
2882     if (!(a.exp & 1)) {
2883         a_frac >>= 1;
2884     }
2885     a.exp >>= 1;
2886 
2887     /* Bit-by-bit computation of sqrt.  */
2888     r_frac = 0;
2889     s_frac = 0;
2890 
2891     /* Iterate from implicit bit down to the 3 extra bits to compute a
2892      * properly rounded result. Remember we've inserted one more bit
2893      * at the top, so these positions are one less.
2894      */
2895     bit = DECOMPOSED_BINARY_POINT - 1;
2896     last_bit = MAX(p->frac_shift - 4, 0);
2897     do {
2898         uint64_t q = 1ULL << bit;
2899         uint64_t t_frac = s_frac + q;
2900         if (t_frac <= a_frac) {
2901             s_frac = t_frac + q;
2902             a_frac -= t_frac;
2903             r_frac += q;
2904         }
2905         a_frac <<= 1;
2906     } while (--bit >= last_bit);
2907 
2908     /* Undo the right shift done above. If there is any remaining
2909      * fraction, the result is inexact. Set the sticky bit.
2910      */
2911     a.frac = (r_frac << 1) + (a_frac != 0);
2912 
2913     return a;
2914 }
2915 
2916 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
2917 {
2918     FloatParts pa = float16_unpack_canonical(a, status);
2919     FloatParts pr = sqrt_float(pa, status, &float16_params);
2920     return float16_round_pack_canonical(pr, status);
2921 }
2922 
2923 float32 QEMU_FLATTEN float32_sqrt(float32 a, float_status *status)
2924 {
2925     FloatParts pa = float32_unpack_canonical(a, status);
2926     FloatParts pr = sqrt_float(pa, status, &float32_params);
2927     return float32_round_pack_canonical(pr, status);
2928 }
2929 
2930 float64 QEMU_FLATTEN float64_sqrt(float64 a, float_status *status)
2931 {
2932     FloatParts pa = float64_unpack_canonical(a, status);
2933     FloatParts pr = sqrt_float(pa, status, &float64_params);
2934     return float64_round_pack_canonical(pr, status);
2935 }
2936 
2937 /*----------------------------------------------------------------------------
2938 | The pattern for a default generated NaN.
2939 *----------------------------------------------------------------------------*/
2940 
2941 float16 float16_default_nan(float_status *status)
2942 {
2943     FloatParts p = parts_default_nan(status);
2944     p.frac >>= float16_params.frac_shift;
2945     return float16_pack_raw(p);
2946 }
2947 
2948 float32 float32_default_nan(float_status *status)
2949 {
2950     FloatParts p = parts_default_nan(status);
2951     p.frac >>= float32_params.frac_shift;
2952     return float32_pack_raw(p);
2953 }
2954 
2955 float64 float64_default_nan(float_status *status)
2956 {
2957     FloatParts p = parts_default_nan(status);
2958     p.frac >>= float64_params.frac_shift;
2959     return float64_pack_raw(p);
2960 }
2961 
2962 float128 float128_default_nan(float_status *status)
2963 {
2964     FloatParts p = parts_default_nan(status);
2965     float128 r;
2966 
2967     /* Extrapolate from the choices made by parts_default_nan to fill
2968      * in the quad-floating format.  If the low bit is set, assume we
2969      * want to set all non-snan bits.
2970      */
2971     r.low = -(p.frac & 1);
2972     r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
2973     r.high |= LIT64(0x7FFF000000000000);
2974     r.high |= (uint64_t)p.sign << 63;
2975 
2976     return r;
2977 }
2978 
2979 /*----------------------------------------------------------------------------
2980 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
2981 *----------------------------------------------------------------------------*/
2982 
2983 float16 float16_silence_nan(float16 a, float_status *status)
2984 {
2985     FloatParts p = float16_unpack_raw(a);
2986     p.frac <<= float16_params.frac_shift;
2987     p = parts_silence_nan(p, status);
2988     p.frac >>= float16_params.frac_shift;
2989     return float16_pack_raw(p);
2990 }
2991 
2992 float32 float32_silence_nan(float32 a, float_status *status)
2993 {
2994     FloatParts p = float32_unpack_raw(a);
2995     p.frac <<= float32_params.frac_shift;
2996     p = parts_silence_nan(p, status);
2997     p.frac >>= float32_params.frac_shift;
2998     return float32_pack_raw(p);
2999 }
3000 
3001 float64 float64_silence_nan(float64 a, float_status *status)
3002 {
3003     FloatParts p = float64_unpack_raw(a);
3004     p.frac <<= float64_params.frac_shift;
3005     p = parts_silence_nan(p, status);
3006     p.frac >>= float64_params.frac_shift;
3007     return float64_pack_raw(p);
3008 }
3009 
3010 /*----------------------------------------------------------------------------
3011 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3012 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3013 | input.  If `zSign' is 1, the input is negated before being converted to an
3014 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
3015 | is simply rounded to an integer, with the inexact exception raised if the
3016 | input cannot be represented exactly as an integer.  However, if the fixed-
3017 | point input is too large, the invalid exception is raised and the largest
3018 | positive or negative integer is returned.
3019 *----------------------------------------------------------------------------*/
3020 
3021 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
3022 {
3023     int8_t roundingMode;
3024     flag roundNearestEven;
3025     int8_t roundIncrement, roundBits;
3026     int32_t z;
3027 
3028     roundingMode = status->float_rounding_mode;
3029     roundNearestEven = ( roundingMode == float_round_nearest_even );
3030     switch (roundingMode) {
3031     case float_round_nearest_even:
3032     case float_round_ties_away:
3033         roundIncrement = 0x40;
3034         break;
3035     case float_round_to_zero:
3036         roundIncrement = 0;
3037         break;
3038     case float_round_up:
3039         roundIncrement = zSign ? 0 : 0x7f;
3040         break;
3041     case float_round_down:
3042         roundIncrement = zSign ? 0x7f : 0;
3043         break;
3044     default:
3045         abort();
3046     }
3047     roundBits = absZ & 0x7F;
3048     absZ = ( absZ + roundIncrement )>>7;
3049     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3050     z = absZ;
3051     if ( zSign ) z = - z;
3052     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
3053         float_raise(float_flag_invalid, status);
3054         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3055     }
3056     if (roundBits) {
3057         status->float_exception_flags |= float_flag_inexact;
3058     }
3059     return z;
3060 
3061 }
3062 
3063 /*----------------------------------------------------------------------------
3064 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3065 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3066 | and returns the properly rounded 64-bit integer corresponding to the input.
3067 | If `zSign' is 1, the input is negated before being converted to an integer.
3068 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3069 | the inexact exception raised if the input cannot be represented exactly as
3070 | an integer.  However, if the fixed-point input is too large, the invalid
3071 | exception is raised and the largest positive or negative integer is
3072 | returned.
3073 *----------------------------------------------------------------------------*/
3074 
3075 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
3076                                float_status *status)
3077 {
3078     int8_t roundingMode;
3079     flag roundNearestEven, increment;
3080     int64_t z;
3081 
3082     roundingMode = status->float_rounding_mode;
3083     roundNearestEven = ( roundingMode == float_round_nearest_even );
3084     switch (roundingMode) {
3085     case float_round_nearest_even:
3086     case float_round_ties_away:
3087         increment = ((int64_t) absZ1 < 0);
3088         break;
3089     case float_round_to_zero:
3090         increment = 0;
3091         break;
3092     case float_round_up:
3093         increment = !zSign && absZ1;
3094         break;
3095     case float_round_down:
3096         increment = zSign && absZ1;
3097         break;
3098     default:
3099         abort();
3100     }
3101     if ( increment ) {
3102         ++absZ0;
3103         if ( absZ0 == 0 ) goto overflow;
3104         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
3105     }
3106     z = absZ0;
3107     if ( zSign ) z = - z;
3108     if ( z && ( ( z < 0 ) ^ zSign ) ) {
3109  overflow:
3110         float_raise(float_flag_invalid, status);
3111         return
3112               zSign ? (int64_t) LIT64( 0x8000000000000000 )
3113             : LIT64( 0x7FFFFFFFFFFFFFFF );
3114     }
3115     if (absZ1) {
3116         status->float_exception_flags |= float_flag_inexact;
3117     }
3118     return z;
3119 
3120 }
3121 
3122 /*----------------------------------------------------------------------------
3123 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3124 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3125 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3126 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
3127 | with the inexact exception raised if the input cannot be represented exactly
3128 | as an integer.  However, if the fixed-point input is too large, the invalid
3129 | exception is raised and the largest unsigned integer is returned.
3130 *----------------------------------------------------------------------------*/
3131 
3132 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
3133                                 uint64_t absZ1, float_status *status)
3134 {
3135     int8_t roundingMode;
3136     flag roundNearestEven, increment;
3137 
3138     roundingMode = status->float_rounding_mode;
3139     roundNearestEven = (roundingMode == float_round_nearest_even);
3140     switch (roundingMode) {
3141     case float_round_nearest_even:
3142     case float_round_ties_away:
3143         increment = ((int64_t)absZ1 < 0);
3144         break;
3145     case float_round_to_zero:
3146         increment = 0;
3147         break;
3148     case float_round_up:
3149         increment = !zSign && absZ1;
3150         break;
3151     case float_round_down:
3152         increment = zSign && absZ1;
3153         break;
3154     default:
3155         abort();
3156     }
3157     if (increment) {
3158         ++absZ0;
3159         if (absZ0 == 0) {
3160             float_raise(float_flag_invalid, status);
3161             return LIT64(0xFFFFFFFFFFFFFFFF);
3162         }
3163         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
3164     }
3165 
3166     if (zSign && absZ0) {
3167         float_raise(float_flag_invalid, status);
3168         return 0;
3169     }
3170 
3171     if (absZ1) {
3172         status->float_exception_flags |= float_flag_inexact;
3173     }
3174     return absZ0;
3175 }
3176 
3177 /*----------------------------------------------------------------------------
3178 | If `a' is denormal and we are in flush-to-zero mode then set the
3179 | input-denormal exception and return zero. Otherwise just return the value.
3180 *----------------------------------------------------------------------------*/
3181 float32 float32_squash_input_denormal(float32 a, float_status *status)
3182 {
3183     if (status->flush_inputs_to_zero) {
3184         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
3185             float_raise(float_flag_input_denormal, status);
3186             return make_float32(float32_val(a) & 0x80000000);
3187         }
3188     }
3189     return a;
3190 }
3191 
3192 /*----------------------------------------------------------------------------
3193 | Normalizes the subnormal single-precision floating-point value represented
3194 | by the denormalized significand `aSig'.  The normalized exponent and
3195 | significand are stored at the locations pointed to by `zExpPtr' and
3196 | `zSigPtr', respectively.
3197 *----------------------------------------------------------------------------*/
3198 
static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /* Number of positions the significand is shifted left: its count
     * of leading zeros less 8.
     */
    int8_t shift = clz32(aSig) - 8;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
3209 
3210 /*----------------------------------------------------------------------------
3211 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3212 | and significand `zSig', and returns the proper single-precision floating-
3213 | point value corresponding to the abstract input.  Ordinarily, the abstract
3214 | value is simply rounded and packed into the single-precision format, with
3215 | the inexact exception raised if the abstract input cannot be represented
3216 | exactly.  However, if the abstract value is too large, the overflow and
3217 | inexact exceptions are raised and an infinity or maximal finite value is
3218 | returned.  If the abstract value is too small, the input value is rounded to
3219 | a subnormal number, and the underflow and inexact exceptions are raised if
3220 | the abstract input cannot be represented exactly as a subnormal single-
3221 | precision floating-point number.
3222 |     The input significand `zSig' has its binary point between bits 30
3223 | and 29, which is 7 bits to the left of the usual location.  This shifted
3224 | significand must be normalized or smaller.  If `zSig' is not normalized,
3225 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3226 | and it must not require rounding.  In the usual case that `zSig' is
3227 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3228 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3229 | Binary Floating-Point Arithmetic.
3230 *----------------------------------------------------------------------------*/
3231 
3232 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3233                                    float_status *status)
3234 {
3235     int8_t roundingMode;
3236     flag roundNearestEven;
3237     int8_t roundIncrement, roundBits;
3238     flag isTiny;
3239 
3240     roundingMode = status->float_rounding_mode;
3241     roundNearestEven = ( roundingMode == float_round_nearest_even );
3242     switch (roundingMode) {
3243     case float_round_nearest_even:
3244     case float_round_ties_away:
3245         roundIncrement = 0x40;
3246         break;
3247     case float_round_to_zero:
3248         roundIncrement = 0;
3249         break;
3250     case float_round_up:
3251         roundIncrement = zSign ? 0 : 0x7f;
3252         break;
3253     case float_round_down:
3254         roundIncrement = zSign ? 0x7f : 0;
3255         break;
3256     default:
3257         abort();
3258         break;
3259     }
3260     roundBits = zSig & 0x7F;
3261     if ( 0xFD <= (uint16_t) zExp ) {
3262         if (    ( 0xFD < zExp )
3263              || (    ( zExp == 0xFD )
3264                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
3265            ) {
3266             float_raise(float_flag_overflow | float_flag_inexact, status);
3267             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
3268         }
3269         if ( zExp < 0 ) {
3270             if (status->flush_to_zero) {
3271                 float_raise(float_flag_output_denormal, status);
3272                 return packFloat32(zSign, 0, 0);
3273             }
3274             isTiny =
3275                 (status->float_detect_tininess
3276                  == float_tininess_before_rounding)
3277                 || ( zExp < -1 )
3278                 || ( zSig + roundIncrement < 0x80000000 );
3279             shift32RightJamming( zSig, - zExp, &zSig );
3280             zExp = 0;
3281             roundBits = zSig & 0x7F;
3282             if (isTiny && roundBits) {
3283                 float_raise(float_flag_underflow, status);
3284             }
3285         }
3286     }
3287     if (roundBits) {
3288         status->float_exception_flags |= float_flag_inexact;
3289     }
3290     zSig = ( zSig + roundIncrement )>>7;
3291     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3292     if ( zSig == 0 ) zExp = 0;
3293     return packFloat32( zSign, zExp, zSig );
3294 
3295 }
3296 
3297 /*----------------------------------------------------------------------------
3298 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3299 | and significand `zSig', and returns the proper single-precision floating-
3300 | point value corresponding to the abstract input.  This routine is just like
3301 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3302 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3303 | floating-point exponent.
3304 *----------------------------------------------------------------------------*/
3305 
3306 static float32
3307  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3308                               float_status *status)
3309 {
3310     int8_t shiftCount;
3311 
3312     shiftCount = clz32(zSig) - 1;
3313     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
3314                                status);
3315 
3316 }
3317 
3318 /*----------------------------------------------------------------------------
3319 | If `a' is denormal and we are in flush-to-zero mode then set the
3320 | input-denormal exception and return zero. Otherwise just return the value.
3321 *----------------------------------------------------------------------------*/
3322 float64 float64_squash_input_denormal(float64 a, float_status *status)
3323 {
3324     if (status->flush_inputs_to_zero) {
3325         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
3326             float_raise(float_flag_input_denormal, status);
3327             return make_float64(float64_val(a) & (1ULL << 63));
3328         }
3329     }
3330     return a;
3331 }
3332 
3333 /*----------------------------------------------------------------------------
3334 | Normalizes the subnormal double-precision floating-point value represented
3335 | by the denormalized significand `aSig'.  The normalized exponent and
3336 | significand are stored at the locations pointed to by `zExpPtr' and
3337 | `zSigPtr', respectively.
3338 *----------------------------------------------------------------------------*/
3339 
static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    /* Number of positions the significand is shifted left: its count
     * of leading zeros less 11.
     */
    int8_t shift = clz64(aSig) - 11;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
3350 
3351 /*----------------------------------------------------------------------------
3352 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3353 | double-precision floating-point value, returning the result.  After being
3354 | shifted into the proper positions, the three fields are simply added
3355 | together to form the result.  This means that any integer portion of `zSig'
3356 | will be added into the exponent.  Since a properly normalized significand
3357 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3358 | than the desired result exponent whenever `zSig' is a complete, normalized
3359 | significand.
3360 *----------------------------------------------------------------------------*/
3361 
3362 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
3363 {
3364 
3365     return make_float64(
3366         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
3367 
3368 }
3369 
3370 /*----------------------------------------------------------------------------
3371 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3372 | and significand `zSig', and returns the proper double-precision floating-
3373 | point value corresponding to the abstract input.  Ordinarily, the abstract
3374 | value is simply rounded and packed into the double-precision format, with
3375 | the inexact exception raised if the abstract input cannot be represented
3376 | exactly.  However, if the abstract value is too large, the overflow and
3377 | inexact exceptions are raised and an infinity or maximal finite value is
3378 | returned.  If the abstract value is too small, the input value is rounded to
3379 | a subnormal number, and the underflow and inexact exceptions are raised if
3380 | the abstract input cannot be represented exactly as a subnormal double-
3381 | precision floating-point number.
3382 |     The input significand `zSig' has its binary point between bits 62
3383 | and 61, which is 10 bits to the left of the usual location.  This shifted
3384 | significand must be normalized or smaller.  If `zSig' is not normalized,
3385 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3386 | and it must not require rounding.  In the usual case that `zSig' is
3387 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3388 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3389 | Binary Floating-Point Arithmetic.
3390 *----------------------------------------------------------------------------*/
3391 
3392 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3393                                    float_status *status)
3394 {
3395     int8_t roundingMode;
3396     flag roundNearestEven;
3397     int roundIncrement, roundBits;
3398     flag isTiny;
3399 
3400     roundingMode = status->float_rounding_mode;
3401     roundNearestEven = ( roundingMode == float_round_nearest_even );
3402     switch (roundingMode) {
3403     case float_round_nearest_even:
3404     case float_round_ties_away:
3405         roundIncrement = 0x200;
3406         break;
3407     case float_round_to_zero:
3408         roundIncrement = 0;
3409         break;
3410     case float_round_up:
3411         roundIncrement = zSign ? 0 : 0x3ff;
3412         break;
3413     case float_round_down:
3414         roundIncrement = zSign ? 0x3ff : 0;
3415         break;
3416     case float_round_to_odd:
3417         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3418         break;
3419     default:
3420         abort();
3421     }
3422     roundBits = zSig & 0x3FF;
3423     if ( 0x7FD <= (uint16_t) zExp ) {
3424         if (    ( 0x7FD < zExp )
3425              || (    ( zExp == 0x7FD )
3426                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
3427            ) {
3428             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3429                                    roundIncrement != 0;
3430             float_raise(float_flag_overflow | float_flag_inexact, status);
3431             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
3432         }
3433         if ( zExp < 0 ) {
3434             if (status->flush_to_zero) {
3435                 float_raise(float_flag_output_denormal, status);
3436                 return packFloat64(zSign, 0, 0);
3437             }
3438             isTiny =
3439                    (status->float_detect_tininess
3440                     == float_tininess_before_rounding)
3441                 || ( zExp < -1 )
3442                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
3443             shift64RightJamming( zSig, - zExp, &zSig );
3444             zExp = 0;
3445             roundBits = zSig & 0x3FF;
3446             if (isTiny && roundBits) {
3447                 float_raise(float_flag_underflow, status);
3448             }
3449             if (roundingMode == float_round_to_odd) {
3450                 /*
3451                  * For round-to-odd case, the roundIncrement depends on
3452                  * zSig which just changed.
3453                  */
3454                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3455             }
3456         }
3457     }
3458     if (roundBits) {
3459         status->float_exception_flags |= float_flag_inexact;
3460     }
3461     zSig = ( zSig + roundIncrement )>>10;
3462     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
3463     if ( zSig == 0 ) zExp = 0;
3464     return packFloat64( zSign, zExp, zSig );
3465 
3466 }
3467 
3468 /*----------------------------------------------------------------------------
3469 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3470 | and significand `zSig', and returns the proper double-precision floating-
3471 | point value corresponding to the abstract input.  This routine is just like
3472 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3473 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3474 | floating-point exponent.
3475 *----------------------------------------------------------------------------*/
3476 
3477 static float64
3478  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3479                               float_status *status)
3480 {
3481     int8_t shiftCount;
3482 
3483     shiftCount = clz64(zSig) - 1;
3484     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
3485                                status);
3486 
3487 }
3488 
/*----------------------------------------------------------------------------
| Normalizes the denormalized significand `aSig' of a subnormal extended
| double-precision floating-point value.  The significand, shifted left by
| its leading-zero count, is written through `zSigPtr'; the matching
| exponent, 1 minus that count, is written through `zExpPtr'.
*----------------------------------------------------------------------------*/

void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    const int8_t leadingZeros = clz64(aSig);

    *zSigPtr = aSig << leadingZeros;
    *zExpPtr = 1 - leadingZeros;
}
3505 
3506 /*----------------------------------------------------------------------------
3507 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3508 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3509 | and returns the proper extended double-precision floating-point value
3510 | corresponding to the abstract input.  Ordinarily, the abstract value is
3511 | rounded and packed into the extended double-precision format, with the
3512 | inexact exception raised if the abstract input cannot be represented
3513 | exactly.  However, if the abstract value is too large, the overflow and
3514 | inexact exceptions are raised and an infinity or maximal finite value is
3515 | returned.  If the abstract value is too small, the input value is rounded to
3516 | a subnormal number, and the underflow and inexact exceptions are raised if
3517 | the abstract input cannot be represented exactly as a subnormal extended
3518 | double-precision floating-point number.
3519 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
3520 | number of bits as single or double precision, respectively.  Otherwise, the
3521 | result is rounded to the full precision of the extended double-precision
3522 | format.
3523 |     The input significand must be normalized or smaller.  If the input
3524 | significand is not normalized, `zExp' must be 0; in that case, the result
3525 | returned is a subnormal number, and it must not require rounding.  The
3526 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3527 | Floating-Point Arithmetic.
3528 *----------------------------------------------------------------------------*/
3529 
3530 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
3531                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
3532                               float_status *status)
3533 {
3534     int8_t roundingMode;
3535     flag roundNearestEven, increment, isTiny;
3536     int64_t roundIncrement, roundMask, roundBits;
3537 
3538     roundingMode = status->float_rounding_mode;
3539     roundNearestEven = ( roundingMode == float_round_nearest_even );
3540     if ( roundingPrecision == 80 ) goto precision80;
3541     if ( roundingPrecision == 64 ) {
3542         roundIncrement = LIT64( 0x0000000000000400 );
3543         roundMask = LIT64( 0x00000000000007FF );
3544     }
3545     else if ( roundingPrecision == 32 ) {
3546         roundIncrement = LIT64( 0x0000008000000000 );
3547         roundMask = LIT64( 0x000000FFFFFFFFFF );
3548     }
3549     else {
3550         goto precision80;
3551     }
3552     zSig0 |= ( zSig1 != 0 );
3553     switch (roundingMode) {
3554     case float_round_nearest_even:
3555     case float_round_ties_away:
3556         break;
3557     case float_round_to_zero:
3558         roundIncrement = 0;
3559         break;
3560     case float_round_up:
3561         roundIncrement = zSign ? 0 : roundMask;
3562         break;
3563     case float_round_down:
3564         roundIncrement = zSign ? roundMask : 0;
3565         break;
3566     default:
3567         abort();
3568     }
3569     roundBits = zSig0 & roundMask;
3570     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3571         if (    ( 0x7FFE < zExp )
3572              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
3573            ) {
3574             goto overflow;
3575         }
3576         if ( zExp <= 0 ) {
3577             if (status->flush_to_zero) {
3578                 float_raise(float_flag_output_denormal, status);
3579                 return packFloatx80(zSign, 0, 0);
3580             }
3581             isTiny =
3582                    (status->float_detect_tininess
3583                     == float_tininess_before_rounding)
3584                 || ( zExp < 0 )
3585                 || ( zSig0 <= zSig0 + roundIncrement );
3586             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
3587             zExp = 0;
3588             roundBits = zSig0 & roundMask;
3589             if (isTiny && roundBits) {
3590                 float_raise(float_flag_underflow, status);
3591             }
3592             if (roundBits) {
3593                 status->float_exception_flags |= float_flag_inexact;
3594             }
3595             zSig0 += roundIncrement;
3596             if ( (int64_t) zSig0 < 0 ) zExp = 1;
3597             roundIncrement = roundMask + 1;
3598             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3599                 roundMask |= roundIncrement;
3600             }
3601             zSig0 &= ~ roundMask;
3602             return packFloatx80( zSign, zExp, zSig0 );
3603         }
3604     }
3605     if (roundBits) {
3606         status->float_exception_flags |= float_flag_inexact;
3607     }
3608     zSig0 += roundIncrement;
3609     if ( zSig0 < roundIncrement ) {
3610         ++zExp;
3611         zSig0 = LIT64( 0x8000000000000000 );
3612     }
3613     roundIncrement = roundMask + 1;
3614     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3615         roundMask |= roundIncrement;
3616     }
3617     zSig0 &= ~ roundMask;
3618     if ( zSig0 == 0 ) zExp = 0;
3619     return packFloatx80( zSign, zExp, zSig0 );
3620  precision80:
3621     switch (roundingMode) {
3622     case float_round_nearest_even:
3623     case float_round_ties_away:
3624         increment = ((int64_t)zSig1 < 0);
3625         break;
3626     case float_round_to_zero:
3627         increment = 0;
3628         break;
3629     case float_round_up:
3630         increment = !zSign && zSig1;
3631         break;
3632     case float_round_down:
3633         increment = zSign && zSig1;
3634         break;
3635     default:
3636         abort();
3637     }
3638     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3639         if (    ( 0x7FFE < zExp )
3640              || (    ( zExp == 0x7FFE )
3641                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
3642                   && increment
3643                 )
3644            ) {
3645             roundMask = 0;
3646  overflow:
3647             float_raise(float_flag_overflow | float_flag_inexact, status);
3648             if (    ( roundingMode == float_round_to_zero )
3649                  || ( zSign && ( roundingMode == float_round_up ) )
3650                  || ( ! zSign && ( roundingMode == float_round_down ) )
3651                ) {
3652                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
3653             }
3654             return packFloatx80(zSign,
3655                                 floatx80_infinity_high,
3656                                 floatx80_infinity_low);
3657         }
3658         if ( zExp <= 0 ) {
3659             isTiny =
3660                    (status->float_detect_tininess
3661                     == float_tininess_before_rounding)
3662                 || ( zExp < 0 )
3663                 || ! increment
3664                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
3665             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
3666             zExp = 0;
3667             if (isTiny && zSig1) {
3668                 float_raise(float_flag_underflow, status);
3669             }
3670             if (zSig1) {
3671                 status->float_exception_flags |= float_flag_inexact;
3672             }
3673             switch (roundingMode) {
3674             case float_round_nearest_even:
3675             case float_round_ties_away:
3676                 increment = ((int64_t)zSig1 < 0);
3677                 break;
3678             case float_round_to_zero:
3679                 increment = 0;
3680                 break;
3681             case float_round_up:
3682                 increment = !zSign && zSig1;
3683                 break;
3684             case float_round_down:
3685                 increment = zSign && zSig1;
3686                 break;
3687             default:
3688                 abort();
3689             }
3690             if ( increment ) {
3691                 ++zSig0;
3692                 zSig0 &=
3693                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3694                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
3695             }
3696             return packFloatx80( zSign, zExp, zSig0 );
3697         }
3698     }
3699     if (zSig1) {
3700         status->float_exception_flags |= float_flag_inexact;
3701     }
3702     if ( increment ) {
3703         ++zSig0;
3704         if ( zSig0 == 0 ) {
3705             ++zExp;
3706             zSig0 = LIT64( 0x8000000000000000 );
3707         }
3708         else {
3709             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3710         }
3711     }
3712     else {
3713         if ( zSig0 == 0 ) zExp = 0;
3714     }
3715     return packFloatx80( zSign, zExp, zSig0 );
3716 
3717 }
3718 
3719 /*----------------------------------------------------------------------------
3720 | Takes an abstract floating-point value having sign `zSign', exponent
3721 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
3722 | and returns the proper extended double-precision floating-point value
3723 | corresponding to the abstract input.  This routine is just like
3724 | `roundAndPackFloatx80' except that the input significand does not have to be
3725 | normalized.
3726 *----------------------------------------------------------------------------*/
3727 
3728 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
3729                                        flag zSign, int32_t zExp,
3730                                        uint64_t zSig0, uint64_t zSig1,
3731                                        float_status *status)
3732 {
3733     int8_t shiftCount;
3734 
3735     if ( zSig0 == 0 ) {
3736         zSig0 = zSig1;
3737         zSig1 = 0;
3738         zExp -= 64;
3739     }
3740     shiftCount = clz64(zSig0);
3741     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3742     zExp -= shiftCount;
3743     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
3744                                 zSig0, zSig1, status);
3745 
3746 }
3747 
3748 /*----------------------------------------------------------------------------
3749 | Returns the least-significant 64 fraction bits of the quadruple-precision
3750 | floating-point value `a'.
3751 *----------------------------------------------------------------------------*/
3752 
3753 static inline uint64_t extractFloat128Frac1( float128 a )
3754 {
3755 
3756     return a.low;
3757 
3758 }
3759 
3760 /*----------------------------------------------------------------------------
3761 | Returns the most-significant 48 fraction bits of the quadruple-precision
3762 | floating-point value `a'.
3763 *----------------------------------------------------------------------------*/
3764 
3765 static inline uint64_t extractFloat128Frac0( float128 a )
3766 {
3767 
3768     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
3769 
3770 }
3771 
3772 /*----------------------------------------------------------------------------
3773 | Returns the exponent bits of the quadruple-precision floating-point value
3774 | `a'.
3775 *----------------------------------------------------------------------------*/
3776 
3777 static inline int32_t extractFloat128Exp( float128 a )
3778 {
3779 
3780     return ( a.high>>48 ) & 0x7FFF;
3781 
3782 }
3783 
3784 /*----------------------------------------------------------------------------
3785 | Returns the sign bit of the quadruple-precision floating-point value `a'.
3786 *----------------------------------------------------------------------------*/
3787 
3788 static inline flag extractFloat128Sign( float128 a )
3789 {
3790 
3791     return a.high>>63;
3792 
3793 }
3794 
/*----------------------------------------------------------------------------
| Normalizes the denormalized significand `aSig0':`aSig1' of a subnormal
| quadruple-precision floating-point value.  The normalized exponent is
| written through `zExpPtr'.  The most significant 49 bits of the normalized
| significand are written through `zSig0Ptr' and the least significant 64
| bits through `zSig1Ptr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t shift;

    if (aSig0 != 0) {
        /* Leading 1 is in the high word: a plain 128-bit left shift. */
        shift = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shift;
        return;
    }

    /* High word is empty, so the low word supplies every significant bit. */
    shift = clz64(aSig1) - 15;
    if (shift >= 0) {
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    } else {
        /* Leading 1 lies above bit 48: split the low word across both. */
        *zSig0Ptr = aSig1 >> (-shift);
        *zSig1Ptr = aSig1 << (shift & 63);
    }
    *zExpPtr = -shift - 63;
}
3835 
3836 /*----------------------------------------------------------------------------
3837 | Packs the sign `zSign', the exponent `zExp', and the significand formed
3838 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
3839 | floating-point value, returning the result.  After being shifted into the
3840 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
3841 | added together to form the most significant 32 bits of the result.  This
3842 | means that any integer portion of `zSig0' will be added into the exponent.
3843 | Since a properly normalized significand will have an integer portion equal
3844 | to 1, the `zExp' input should be 1 less than the desired result exponent
3845 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
3846 | significand.
3847 *----------------------------------------------------------------------------*/
3848 
3849 static inline float128
3850  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
3851 {
3852     float128 z;
3853 
3854     z.low = zSig1;
3855     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
3856     return z;
3857 
3858 }
3859 
3860 /*----------------------------------------------------------------------------
3861 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3862 | and extended significand formed by the concatenation of `zSig0', `zSig1',
3863 | and `zSig2', and returns the proper quadruple-precision floating-point value
3864 | corresponding to the abstract input.  Ordinarily, the abstract value is
3865 | simply rounded and packed into the quadruple-precision format, with the
3866 | inexact exception raised if the abstract input cannot be represented
3867 | exactly.  However, if the abstract value is too large, the overflow and
3868 | inexact exceptions are raised and an infinity or maximal finite value is
3869 | returned.  If the abstract value is too small, the input value is rounded to
3870 | a subnormal number, and the underflow and inexact exceptions are raised if
3871 | the abstract input cannot be represented exactly as a subnormal quadruple-
3872 | precision floating-point number.
3873 |     The input significand must be normalized or smaller.  If the input
3874 | significand is not normalized, `zExp' must be 0; in that case, the result
3875 | returned is a subnormal number, and it must not require rounding.  In the
3876 | usual case that the input significand is normalized, `zExp' must be 1 less
3877 | than the ``true'' floating-point exponent.  The handling of underflow and
3878 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3879 *----------------------------------------------------------------------------*/
3880 
3881 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
3882                                      uint64_t zSig0, uint64_t zSig1,
3883                                      uint64_t zSig2, float_status *status)
3884 {
3885     int8_t roundingMode;
3886     flag roundNearestEven, increment, isTiny;
3887 
3888     roundingMode = status->float_rounding_mode;
3889     roundNearestEven = ( roundingMode == float_round_nearest_even );
3890     switch (roundingMode) {
3891     case float_round_nearest_even:
3892     case float_round_ties_away:
3893         increment = ((int64_t)zSig2 < 0);
3894         break;
3895     case float_round_to_zero:
3896         increment = 0;
3897         break;
3898     case float_round_up:
3899         increment = !zSign && zSig2;
3900         break;
3901     case float_round_down:
3902         increment = zSign && zSig2;
3903         break;
3904     case float_round_to_odd:
3905         increment = !(zSig1 & 0x1) && zSig2;
3906         break;
3907     default:
3908         abort();
3909     }
3910     if ( 0x7FFD <= (uint32_t) zExp ) {
3911         if (    ( 0x7FFD < zExp )
3912              || (    ( zExp == 0x7FFD )
3913                   && eq128(
3914                          LIT64( 0x0001FFFFFFFFFFFF ),
3915                          LIT64( 0xFFFFFFFFFFFFFFFF ),
3916                          zSig0,
3917                          zSig1
3918                      )
3919                   && increment
3920                 )
3921            ) {
3922             float_raise(float_flag_overflow | float_flag_inexact, status);
3923             if (    ( roundingMode == float_round_to_zero )
3924                  || ( zSign && ( roundingMode == float_round_up ) )
3925                  || ( ! zSign && ( roundingMode == float_round_down ) )
3926                  || (roundingMode == float_round_to_odd)
3927                ) {
3928                 return
3929                     packFloat128(
3930                         zSign,
3931                         0x7FFE,
3932                         LIT64( 0x0000FFFFFFFFFFFF ),
3933                         LIT64( 0xFFFFFFFFFFFFFFFF )
3934                     );
3935             }
3936             return packFloat128( zSign, 0x7FFF, 0, 0 );
3937         }
3938         if ( zExp < 0 ) {
3939             if (status->flush_to_zero) {
3940                 float_raise(float_flag_output_denormal, status);
3941                 return packFloat128(zSign, 0, 0, 0);
3942             }
3943             isTiny =
3944                    (status->float_detect_tininess
3945                     == float_tininess_before_rounding)
3946                 || ( zExp < -1 )
3947                 || ! increment
3948                 || lt128(
3949                        zSig0,
3950                        zSig1,
3951                        LIT64( 0x0001FFFFFFFFFFFF ),
3952                        LIT64( 0xFFFFFFFFFFFFFFFF )
3953                    );
3954             shift128ExtraRightJamming(
3955                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
3956             zExp = 0;
3957             if (isTiny && zSig2) {
3958                 float_raise(float_flag_underflow, status);
3959             }
3960             switch (roundingMode) {
3961             case float_round_nearest_even:
3962             case float_round_ties_away:
3963                 increment = ((int64_t)zSig2 < 0);
3964                 break;
3965             case float_round_to_zero:
3966                 increment = 0;
3967                 break;
3968             case float_round_up:
3969                 increment = !zSign && zSig2;
3970                 break;
3971             case float_round_down:
3972                 increment = zSign && zSig2;
3973                 break;
3974             case float_round_to_odd:
3975                 increment = !(zSig1 & 0x1) && zSig2;
3976                 break;
3977             default:
3978                 abort();
3979             }
3980         }
3981     }
3982     if (zSig2) {
3983         status->float_exception_flags |= float_flag_inexact;
3984     }
3985     if ( increment ) {
3986         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
3987         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
3988     }
3989     else {
3990         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
3991     }
3992     return packFloat128( zSign, zExp, zSig0, zSig1 );
3993 
3994 }
3995 
3996 /*----------------------------------------------------------------------------
3997 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3998 | and significand formed by the concatenation of `zSig0' and `zSig1', and
3999 | returns the proper quadruple-precision floating-point value corresponding
4000 | to the abstract input.  This routine is just like `roundAndPackFloat128'
4001 | except that the input significand has fewer bits and does not have to be
4002 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
4003 | point exponent.
4004 *----------------------------------------------------------------------------*/
4005 
4006 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
4007                                               uint64_t zSig0, uint64_t zSig1,
4008                                               float_status *status)
4009 {
4010     int8_t shiftCount;
4011     uint64_t zSig2;
4012 
4013     if ( zSig0 == 0 ) {
4014         zSig0 = zSig1;
4015         zSig1 = 0;
4016         zExp -= 64;
4017     }
4018     shiftCount = clz64(zSig0) - 15;
4019     if ( 0 <= shiftCount ) {
4020         zSig2 = 0;
4021         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4022     }
4023     else {
4024         shift128ExtraRightJamming(
4025             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4026     }
4027     zExp -= shiftCount;
4028     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4029 
4030 }
4031 
4032 
4033 /*----------------------------------------------------------------------------
4034 | Returns the result of converting the 32-bit two's complement integer `a'
4035 | to the extended double-precision floating-point format.  The conversion
4036 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4037 | Arithmetic.
4038 *----------------------------------------------------------------------------*/
4039 
4040 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4041 {
4042     flag zSign;
4043     uint32_t absA;
4044     int8_t shiftCount;
4045     uint64_t zSig;
4046 
4047     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4048     zSign = ( a < 0 );
4049     absA = zSign ? - a : a;
4050     shiftCount = clz32(absA) + 32;
4051     zSig = absA;
4052     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4053 
4054 }
4055 
4056 /*----------------------------------------------------------------------------
4057 | Returns the result of converting the 32-bit two's complement integer `a' to
4058 | the quadruple-precision floating-point format.  The conversion is performed
4059 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4060 *----------------------------------------------------------------------------*/
4061 
4062 float128 int32_to_float128(int32_t a, float_status *status)
4063 {
4064     flag zSign;
4065     uint32_t absA;
4066     int8_t shiftCount;
4067     uint64_t zSig0;
4068 
4069     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4070     zSign = ( a < 0 );
4071     absA = zSign ? - a : a;
4072     shiftCount = clz32(absA) + 17;
4073     zSig0 = absA;
4074     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4075 
4076 }
4077 
4078 /*----------------------------------------------------------------------------
4079 | Returns the result of converting the 64-bit two's complement integer `a'
4080 | to the extended double-precision floating-point format.  The conversion
4081 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4082 | Arithmetic.
4083 *----------------------------------------------------------------------------*/
4084 
4085 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4086 {
4087     flag zSign;
4088     uint64_t absA;
4089     int8_t shiftCount;
4090 
4091     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4092     zSign = ( a < 0 );
4093     absA = zSign ? - a : a;
4094     shiftCount = clz64(absA);
4095     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4096 
4097 }
4098 
4099 /*----------------------------------------------------------------------------
4100 | Returns the result of converting the 64-bit two's complement integer `a' to
4101 | the quadruple-precision floating-point format.  The conversion is performed
4102 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4103 *----------------------------------------------------------------------------*/
4104 
4105 float128 int64_to_float128(int64_t a, float_status *status)
4106 {
4107     flag zSign;
4108     uint64_t absA;
4109     int8_t shiftCount;
4110     int32_t zExp;
4111     uint64_t zSig0, zSig1;
4112 
4113     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4114     zSign = ( a < 0 );
4115     absA = zSign ? - a : a;
4116     shiftCount = clz64(absA) + 49;
4117     zExp = 0x406E - shiftCount;
4118     if ( 64 <= shiftCount ) {
4119         zSig1 = 0;
4120         zSig0 = absA;
4121         shiftCount -= 64;
4122     }
4123     else {
4124         zSig1 = absA;
4125         zSig0 = 0;
4126     }
4127     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4128     return packFloat128( zSign, zExp, zSig0, zSig1 );
4129 
4130 }
4131 
4132 /*----------------------------------------------------------------------------
4133 | Returns the result of converting the 64-bit unsigned integer `a'
4134 | to the quadruple-precision floating-point format.  The conversion is performed
4135 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4136 *----------------------------------------------------------------------------*/
4137 
4138 float128 uint64_to_float128(uint64_t a, float_status *status)
4139 {
4140     if (a == 0) {
4141         return float128_zero;
4142     }
4143     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4144 }
4145 
4146 /*----------------------------------------------------------------------------
4147 | Returns the result of converting the single-precision floating-point value
4148 | `a' to the extended double-precision floating-point format.  The conversion
4149 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4150 | Arithmetic.
4151 *----------------------------------------------------------------------------*/
4152 
4153 floatx80 float32_to_floatx80(float32 a, float_status *status)
4154 {
4155     flag aSign;
4156     int aExp;
4157     uint32_t aSig;
4158 
4159     a = float32_squash_input_denormal(a, status);
4160     aSig = extractFloat32Frac( a );
4161     aExp = extractFloat32Exp( a );
4162     aSign = extractFloat32Sign( a );
4163     if ( aExp == 0xFF ) {
4164         if (aSig) {
4165             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
4166         }
4167         return packFloatx80(aSign,
4168                             floatx80_infinity_high,
4169                             floatx80_infinity_low);
4170     }
4171     if ( aExp == 0 ) {
4172         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4173         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4174     }
4175     aSig |= 0x00800000;
4176     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4177 
4178 }
4179 
/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the quadruple-precision floating-point format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/
4186 
4187 float128 float32_to_float128(float32 a, float_status *status)
4188 {
4189     flag aSign;
4190     int aExp;
4191     uint32_t aSig;
4192 
4193     a = float32_squash_input_denormal(a, status);
4194     aSig = extractFloat32Frac( a );
4195     aExp = extractFloat32Exp( a );
4196     aSign = extractFloat32Sign( a );
4197     if ( aExp == 0xFF ) {
4198         if (aSig) {
4199             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4200         }
4201         return packFloat128( aSign, 0x7FFF, 0, 0 );
4202     }
4203     if ( aExp == 0 ) {
4204         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4205         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4206         --aExp;
4207     }
4208     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4209 
4210 }
4211 
4212 /*----------------------------------------------------------------------------
4213 | Returns the remainder of the single-precision floating-point value `a'
4214 | with respect to the corresponding value `b'.  The operation is performed
4215 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4216 *----------------------------------------------------------------------------*/
4217 
4218 float32 float32_rem(float32 a, float32 b, float_status *status)
4219 {
4220     flag aSign, zSign;
4221     int aExp, bExp, expDiff;
4222     uint32_t aSig, bSig;
4223     uint32_t q;
4224     uint64_t aSig64, bSig64, q64;
4225     uint32_t alternateASig;
4226     int32_t sigMean;
4227     a = float32_squash_input_denormal(a, status);
4228     b = float32_squash_input_denormal(b, status);
4229 
4230     aSig = extractFloat32Frac( a );
4231     aExp = extractFloat32Exp( a );
4232     aSign = extractFloat32Sign( a );
4233     bSig = extractFloat32Frac( b );
4234     bExp = extractFloat32Exp( b );
4235     if ( aExp == 0xFF ) {
4236         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
4237             return propagateFloat32NaN(a, b, status);
4238         }
4239         float_raise(float_flag_invalid, status);
4240         return float32_default_nan(status);
4241     }
4242     if ( bExp == 0xFF ) {
4243         if (bSig) {
4244             return propagateFloat32NaN(a, b, status);
4245         }
4246         return a;
4247     }
4248     if ( bExp == 0 ) {
4249         if ( bSig == 0 ) {
4250             float_raise(float_flag_invalid, status);
4251             return float32_default_nan(status);
4252         }
4253         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
4254     }
4255     if ( aExp == 0 ) {
4256         if ( aSig == 0 ) return a;
4257         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4258     }
4259     expDiff = aExp - bExp;
4260     aSig |= 0x00800000;
4261     bSig |= 0x00800000;
4262     if ( expDiff < 32 ) {
4263         aSig <<= 8;
4264         bSig <<= 8;
4265         if ( expDiff < 0 ) {
4266             if ( expDiff < -1 ) return a;
4267             aSig >>= 1;
4268         }
4269         q = ( bSig <= aSig );
4270         if ( q ) aSig -= bSig;
4271         if ( 0 < expDiff ) {
4272             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
4273             q >>= 32 - expDiff;
4274             bSig >>= 2;
4275             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4276         }
4277         else {
4278             aSig >>= 2;
4279             bSig >>= 2;
4280         }
4281     }
4282     else {
4283         if ( bSig <= aSig ) aSig -= bSig;
4284         aSig64 = ( (uint64_t) aSig )<<40;
4285         bSig64 = ( (uint64_t) bSig )<<40;
4286         expDiff -= 64;
4287         while ( 0 < expDiff ) {
4288             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4289             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4290             aSig64 = - ( ( bSig * q64 )<<38 );
4291             expDiff -= 62;
4292         }
4293         expDiff += 64;
4294         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4295         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4296         q = q64>>( 64 - expDiff );
4297         bSig <<= 6;
4298         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
4299     }
4300     do {
4301         alternateASig = aSig;
4302         ++q;
4303         aSig -= bSig;
4304     } while ( 0 <= (int32_t) aSig );
4305     sigMean = aSig + alternateASig;
4306     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4307         aSig = alternateASig;
4308     }
4309     zSign = ( (int32_t) aSig < 0 );
4310     if ( zSign ) aSig = - aSig;
4311     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
4312 }
4313 
4314 
4315 
4316 /*----------------------------------------------------------------------------
4317 | Returns the binary exponential of the single-precision floating-point value
4318 | `a'. The operation is performed according to the IEC/IEEE Standard for
4319 | Binary Floating-Point Arithmetic.
4320 |
4321 | Uses the following identities:
4322 |
4323 | 1. -------------------------------------------------------------------------
4324 |      x    x*ln(2)
4325 |     2  = e
4326 |
4327 | 2. -------------------------------------------------------------------------
4328 |                      2     3     4     5           n
4329 |      x        x     x     x     x     x           x
4330 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
4331 |               1!    2!    3!    4!    5!          n!
4332 *----------------------------------------------------------------------------*/
4333 
4334 static const float64 float32_exp2_coefficients[15] =
4335 {
4336     const_float64( 0x3ff0000000000000ll ), /*  1 */
4337     const_float64( 0x3fe0000000000000ll ), /*  2 */
4338     const_float64( 0x3fc5555555555555ll ), /*  3 */
4339     const_float64( 0x3fa5555555555555ll ), /*  4 */
4340     const_float64( 0x3f81111111111111ll ), /*  5 */
4341     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
4342     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
4343     const_float64( 0x3efa01a01a01a01all ), /*  8 */
4344     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
4345     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
4346     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
4347     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
4348     const_float64( 0x3de6124613a86d09ll ), /* 13 */
4349     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
4350     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
4351 };
4352 
4353 float32 float32_exp2(float32 a, float_status *status)
4354 {
4355     flag aSign;
4356     int aExp;
4357     uint32_t aSig;
4358     float64 r, x, xn;
4359     int i;
4360     a = float32_squash_input_denormal(a, status);
4361 
4362     aSig = extractFloat32Frac( a );
4363     aExp = extractFloat32Exp( a );
4364     aSign = extractFloat32Sign( a );
4365 
4366     if ( aExp == 0xFF) {
4367         if (aSig) {
4368             return propagateFloat32NaN(a, float32_zero, status);
4369         }
4370         return (aSign) ? float32_zero : a;
4371     }
4372     if (aExp == 0) {
4373         if (aSig == 0) return float32_one;
4374     }
4375 
4376     float_raise(float_flag_inexact, status);
4377 
4378     /* ******************************* */
4379     /* using float64 for approximation */
4380     /* ******************************* */
4381     x = float32_to_float64(a, status);
4382     x = float64_mul(x, float64_ln2, status);
4383 
4384     xn = x;
4385     r = float64_one;
4386     for (i = 0 ; i < 15 ; i++) {
4387         float64 f;
4388 
4389         f = float64_mul(xn, float32_exp2_coefficients[i], status);
4390         r = float64_add(r, f, status);
4391 
4392         xn = float64_mul(xn, x, status);
4393     }
4394 
4395     return float64_to_float32(r, status);
4396 }
4397 
4398 /*----------------------------------------------------------------------------
4399 | Returns the binary log of the single-precision floating-point value `a'.
4400 | The operation is performed according to the IEC/IEEE Standard for Binary
4401 | Floating-Point Arithmetic.
4402 *----------------------------------------------------------------------------*/
4403 float32 float32_log2(float32 a, float_status *status)
4404 {
4405     flag aSign, zSign;
4406     int aExp;
4407     uint32_t aSig, zSig, i;
4408 
4409     a = float32_squash_input_denormal(a, status);
4410     aSig = extractFloat32Frac( a );
4411     aExp = extractFloat32Exp( a );
4412     aSign = extractFloat32Sign( a );
4413 
4414     if ( aExp == 0 ) {
4415         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
4416         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4417     }
4418     if ( aSign ) {
4419         float_raise(float_flag_invalid, status);
4420         return float32_default_nan(status);
4421     }
4422     if ( aExp == 0xFF ) {
4423         if (aSig) {
4424             return propagateFloat32NaN(a, float32_zero, status);
4425         }
4426         return a;
4427     }
4428 
4429     aExp -= 0x7F;
4430     aSig |= 0x00800000;
4431     zSign = aExp < 0;
4432     zSig = aExp << 23;
4433 
4434     for (i = 1 << 22; i > 0; i >>= 1) {
4435         aSig = ( (uint64_t)aSig * aSig ) >> 23;
4436         if ( aSig & 0x01000000 ) {
4437             aSig >>= 1;
4438             zSig |= i;
4439         }
4440     }
4441 
4442     if ( zSign )
4443         zSig = -zSig;
4444 
4445     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
4446 }
4447 
4448 /*----------------------------------------------------------------------------
4449 | Returns 1 if the single-precision floating-point value `a' is equal to
4450 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4451 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4452 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4453 *----------------------------------------------------------------------------*/
4454 
4455 int float32_eq(float32 a, float32 b, float_status *status)
4456 {
4457     uint32_t av, bv;
4458     a = float32_squash_input_denormal(a, status);
4459     b = float32_squash_input_denormal(b, status);
4460 
4461     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4462          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4463        ) {
4464         float_raise(float_flag_invalid, status);
4465         return 0;
4466     }
4467     av = float32_val(a);
4468     bv = float32_val(b);
4469     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4470 }
4471 
4472 /*----------------------------------------------------------------------------
4473 | Returns 1 if the single-precision floating-point value `a' is less than
4474 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
4475 | exception is raised if either operand is a NaN.  The comparison is performed
4476 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4477 *----------------------------------------------------------------------------*/
4478 
4479 int float32_le(float32 a, float32 b, float_status *status)
4480 {
4481     flag aSign, bSign;
4482     uint32_t av, bv;
4483     a = float32_squash_input_denormal(a, status);
4484     b = float32_squash_input_denormal(b, status);
4485 
4486     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4487          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4488        ) {
4489         float_raise(float_flag_invalid, status);
4490         return 0;
4491     }
4492     aSign = extractFloat32Sign( a );
4493     bSign = extractFloat32Sign( b );
4494     av = float32_val(a);
4495     bv = float32_val(b);
4496     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4497     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4498 
4499 }
4500 
4501 /*----------------------------------------------------------------------------
4502 | Returns 1 if the single-precision floating-point value `a' is less than
4503 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4504 | raised if either operand is a NaN.  The comparison is performed according
4505 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4506 *----------------------------------------------------------------------------*/
4507 
4508 int float32_lt(float32 a, float32 b, float_status *status)
4509 {
4510     flag aSign, bSign;
4511     uint32_t av, bv;
4512     a = float32_squash_input_denormal(a, status);
4513     b = float32_squash_input_denormal(b, status);
4514 
4515     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4516          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4517        ) {
4518         float_raise(float_flag_invalid, status);
4519         return 0;
4520     }
4521     aSign = extractFloat32Sign( a );
4522     bSign = extractFloat32Sign( b );
4523     av = float32_val(a);
4524     bv = float32_val(b);
4525     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4526     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4527 
4528 }
4529 
4530 /*----------------------------------------------------------------------------
4531 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4532 | be compared, and 0 otherwise.  The invalid exception is raised if either
4533 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4534 | Standard for Binary Floating-Point Arithmetic.
4535 *----------------------------------------------------------------------------*/
4536 
4537 int float32_unordered(float32 a, float32 b, float_status *status)
4538 {
4539     a = float32_squash_input_denormal(a, status);
4540     b = float32_squash_input_denormal(b, status);
4541 
4542     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4543          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4544        ) {
4545         float_raise(float_flag_invalid, status);
4546         return 1;
4547     }
4548     return 0;
4549 }
4550 
4551 /*----------------------------------------------------------------------------
4552 | Returns 1 if the single-precision floating-point value `a' is equal to
4553 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4554 | exception.  The comparison is performed according to the IEC/IEEE Standard
4555 | for Binary Floating-Point Arithmetic.
4556 *----------------------------------------------------------------------------*/
4557 
4558 int float32_eq_quiet(float32 a, float32 b, float_status *status)
4559 {
4560     a = float32_squash_input_denormal(a, status);
4561     b = float32_squash_input_denormal(b, status);
4562 
4563     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4564          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4565        ) {
4566         if (float32_is_signaling_nan(a, status)
4567          || float32_is_signaling_nan(b, status)) {
4568             float_raise(float_flag_invalid, status);
4569         }
4570         return 0;
4571     }
4572     return ( float32_val(a) == float32_val(b) ) ||
4573             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
4574 }
4575 
4576 /*----------------------------------------------------------------------------
4577 | Returns 1 if the single-precision floating-point value `a' is less than or
4578 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4579 | cause an exception.  Otherwise, the comparison is performed according to the
4580 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4581 *----------------------------------------------------------------------------*/
4582 
4583 int float32_le_quiet(float32 a, float32 b, float_status *status)
4584 {
4585     flag aSign, bSign;
4586     uint32_t av, bv;
4587     a = float32_squash_input_denormal(a, status);
4588     b = float32_squash_input_denormal(b, status);
4589 
4590     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4591          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4592        ) {
4593         if (float32_is_signaling_nan(a, status)
4594          || float32_is_signaling_nan(b, status)) {
4595             float_raise(float_flag_invalid, status);
4596         }
4597         return 0;
4598     }
4599     aSign = extractFloat32Sign( a );
4600     bSign = extractFloat32Sign( b );
4601     av = float32_val(a);
4602     bv = float32_val(b);
4603     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4604     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4605 
4606 }
4607 
4608 /*----------------------------------------------------------------------------
4609 | Returns 1 if the single-precision floating-point value `a' is less than
4610 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4611 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4612 | Standard for Binary Floating-Point Arithmetic.
4613 *----------------------------------------------------------------------------*/
4614 
4615 int float32_lt_quiet(float32 a, float32 b, float_status *status)
4616 {
4617     flag aSign, bSign;
4618     uint32_t av, bv;
4619     a = float32_squash_input_denormal(a, status);
4620     b = float32_squash_input_denormal(b, status);
4621 
4622     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4623          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4624        ) {
4625         if (float32_is_signaling_nan(a, status)
4626          || float32_is_signaling_nan(b, status)) {
4627             float_raise(float_flag_invalid, status);
4628         }
4629         return 0;
4630     }
4631     aSign = extractFloat32Sign( a );
4632     bSign = extractFloat32Sign( b );
4633     av = float32_val(a);
4634     bv = float32_val(b);
4635     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4636     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4637 
4638 }
4639 
4640 /*----------------------------------------------------------------------------
4641 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4642 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4643 | comparison is performed according to the IEC/IEEE Standard for Binary
4644 | Floating-Point Arithmetic.
4645 *----------------------------------------------------------------------------*/
4646 
4647 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
4648 {
4649     a = float32_squash_input_denormal(a, status);
4650     b = float32_squash_input_denormal(b, status);
4651 
4652     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4653          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4654        ) {
4655         if (float32_is_signaling_nan(a, status)
4656          || float32_is_signaling_nan(b, status)) {
4657             float_raise(float_flag_invalid, status);
4658         }
4659         return 1;
4660     }
4661     return 0;
4662 }
4663 
4664 /*----------------------------------------------------------------------------
4665 | If `a' is denormal and we are in flush-to-zero mode then set the
4666 | input-denormal exception and return zero. Otherwise just return the value.
4667 *----------------------------------------------------------------------------*/
4668 float16 float16_squash_input_denormal(float16 a, float_status *status)
4669 {
4670     if (status->flush_inputs_to_zero) {
4671         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
4672             float_raise(float_flag_input_denormal, status);
4673             return make_float16(float16_val(a) & 0x8000);
4674         }
4675     }
4676     return a;
4677 }
4678 
4679 /*----------------------------------------------------------------------------
4680 | Returns the result of converting the double-precision floating-point value
4681 | `a' to the extended double-precision floating-point format.  The conversion
4682 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4683 | Arithmetic.
4684 *----------------------------------------------------------------------------*/
4685 
4686 floatx80 float64_to_floatx80(float64 a, float_status *status)
4687 {
4688     flag aSign;
4689     int aExp;
4690     uint64_t aSig;
4691 
4692     a = float64_squash_input_denormal(a, status);
4693     aSig = extractFloat64Frac( a );
4694     aExp = extractFloat64Exp( a );
4695     aSign = extractFloat64Sign( a );
4696     if ( aExp == 0x7FF ) {
4697         if (aSig) {
4698             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4699         }
4700         return packFloatx80(aSign,
4701                             floatx80_infinity_high,
4702                             floatx80_infinity_low);
4703     }
4704     if ( aExp == 0 ) {
4705         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4706         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4707     }
4708     return
4709         packFloatx80(
4710             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4711 
4712 }
4713 
4714 /*----------------------------------------------------------------------------
4715 | Returns the result of converting the double-precision floating-point value
4716 | `a' to the quadruple-precision floating-point format.  The conversion is
4717 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4718 | Arithmetic.
4719 *----------------------------------------------------------------------------*/
4720 
4721 float128 float64_to_float128(float64 a, float_status *status)
4722 {
4723     flag aSign;
4724     int aExp;
4725     uint64_t aSig, zSig0, zSig1;
4726 
4727     a = float64_squash_input_denormal(a, status);
4728     aSig = extractFloat64Frac( a );
4729     aExp = extractFloat64Exp( a );
4730     aSign = extractFloat64Sign( a );
4731     if ( aExp == 0x7FF ) {
4732         if (aSig) {
4733             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4734         }
4735         return packFloat128( aSign, 0x7FFF, 0, 0 );
4736     }
4737     if ( aExp == 0 ) {
4738         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4739         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4740         --aExp;
4741     }
4742     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4743     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4744 
4745 }
4746 
4747 
4748 /*----------------------------------------------------------------------------
4749 | Returns the remainder of the double-precision floating-point value `a'
4750 | with respect to the corresponding value `b'.  The operation is performed
4751 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4752 *----------------------------------------------------------------------------*/
4753 
4754 float64 float64_rem(float64 a, float64 b, float_status *status)
4755 {
4756     flag aSign, zSign;
4757     int aExp, bExp, expDiff;
4758     uint64_t aSig, bSig;
4759     uint64_t q, alternateASig;
4760     int64_t sigMean;
4761 
4762     a = float64_squash_input_denormal(a, status);
4763     b = float64_squash_input_denormal(b, status);
4764     aSig = extractFloat64Frac( a );
4765     aExp = extractFloat64Exp( a );
4766     aSign = extractFloat64Sign( a );
4767     bSig = extractFloat64Frac( b );
4768     bExp = extractFloat64Exp( b );
4769     if ( aExp == 0x7FF ) {
4770         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4771             return propagateFloat64NaN(a, b, status);
4772         }
4773         float_raise(float_flag_invalid, status);
4774         return float64_default_nan(status);
4775     }
4776     if ( bExp == 0x7FF ) {
4777         if (bSig) {
4778             return propagateFloat64NaN(a, b, status);
4779         }
4780         return a;
4781     }
4782     if ( bExp == 0 ) {
4783         if ( bSig == 0 ) {
4784             float_raise(float_flag_invalid, status);
4785             return float64_default_nan(status);
4786         }
4787         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4788     }
4789     if ( aExp == 0 ) {
4790         if ( aSig == 0 ) return a;
4791         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4792     }
4793     expDiff = aExp - bExp;
4794     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4795     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4796     if ( expDiff < 0 ) {
4797         if ( expDiff < -1 ) return a;
4798         aSig >>= 1;
4799     }
4800     q = ( bSig <= aSig );
4801     if ( q ) aSig -= bSig;
4802     expDiff -= 64;
4803     while ( 0 < expDiff ) {
4804         q = estimateDiv128To64( aSig, 0, bSig );
4805         q = ( 2 < q ) ? q - 2 : 0;
4806         aSig = - ( ( bSig>>2 ) * q );
4807         expDiff -= 62;
4808     }
4809     expDiff += 64;
4810     if ( 0 < expDiff ) {
4811         q = estimateDiv128To64( aSig, 0, bSig );
4812         q = ( 2 < q ) ? q - 2 : 0;
4813         q >>= 64 - expDiff;
4814         bSig >>= 2;
4815         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4816     }
4817     else {
4818         aSig >>= 2;
4819         bSig >>= 2;
4820     }
4821     do {
4822         alternateASig = aSig;
4823         ++q;
4824         aSig -= bSig;
4825     } while ( 0 <= (int64_t) aSig );
4826     sigMean = aSig + alternateASig;
4827     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4828         aSig = alternateASig;
4829     }
4830     zSign = ( (int64_t) aSig < 0 );
4831     if ( zSign ) aSig = - aSig;
4832     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4833 
4834 }
4835 
4836 /*----------------------------------------------------------------------------
4837 | Returns the binary log of the double-precision floating-point value `a'.
4838 | The operation is performed according to the IEC/IEEE Standard for Binary
4839 | Floating-Point Arithmetic.
4840 *----------------------------------------------------------------------------*/
4841 float64 float64_log2(float64 a, float_status *status)
4842 {
4843     flag aSign, zSign;
4844     int aExp;
4845     uint64_t aSig, aSig0, aSig1, zSig, i;
4846     a = float64_squash_input_denormal(a, status);
4847 
4848     aSig = extractFloat64Frac( a );
4849     aExp = extractFloat64Exp( a );
4850     aSign = extractFloat64Sign( a );
4851 
4852     if ( aExp == 0 ) {
4853         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4854         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4855     }
4856     if ( aSign ) {
4857         float_raise(float_flag_invalid, status);
4858         return float64_default_nan(status);
4859     }
4860     if ( aExp == 0x7FF ) {
4861         if (aSig) {
4862             return propagateFloat64NaN(a, float64_zero, status);
4863         }
4864         return a;
4865     }
4866 
4867     aExp -= 0x3FF;
4868     aSig |= LIT64( 0x0010000000000000 );
4869     zSign = aExp < 0;
4870     zSig = (uint64_t)aExp << 52;
4871     for (i = 1LL << 51; i > 0; i >>= 1) {
4872         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4873         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4874         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4875             aSig >>= 1;
4876             zSig |= i;
4877         }
4878     }
4879 
4880     if ( zSign )
4881         zSig = -zSig;
4882     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4883 }
4884 
4885 /*----------------------------------------------------------------------------
4886 | Returns 1 if the double-precision floating-point value `a' is equal to the
4887 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4888 | if either operand is a NaN.  Otherwise, the comparison is performed
4889 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4890 *----------------------------------------------------------------------------*/
4891 
4892 int float64_eq(float64 a, float64 b, float_status *status)
4893 {
4894     uint64_t av, bv;
4895     a = float64_squash_input_denormal(a, status);
4896     b = float64_squash_input_denormal(b, status);
4897 
4898     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4899          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4900        ) {
4901         float_raise(float_flag_invalid, status);
4902         return 0;
4903     }
4904     av = float64_val(a);
4905     bv = float64_val(b);
4906     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4907 
4908 }
4909 
4910 /*----------------------------------------------------------------------------
4911 | Returns 1 if the double-precision floating-point value `a' is less than or
4912 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4913 | exception is raised if either operand is a NaN.  The comparison is performed
4914 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4915 *----------------------------------------------------------------------------*/
4916 
4917 int float64_le(float64 a, float64 b, float_status *status)
4918 {
4919     flag aSign, bSign;
4920     uint64_t av, bv;
4921     a = float64_squash_input_denormal(a, status);
4922     b = float64_squash_input_denormal(b, status);
4923 
4924     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4925          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4926        ) {
4927         float_raise(float_flag_invalid, status);
4928         return 0;
4929     }
4930     aSign = extractFloat64Sign( a );
4931     bSign = extractFloat64Sign( b );
4932     av = float64_val(a);
4933     bv = float64_val(b);
4934     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4935     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4936 
4937 }
4938 
4939 /*----------------------------------------------------------------------------
4940 | Returns 1 if the double-precision floating-point value `a' is less than
4941 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4942 | raised if either operand is a NaN.  The comparison is performed according
4943 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4944 *----------------------------------------------------------------------------*/
4945 
4946 int float64_lt(float64 a, float64 b, float_status *status)
4947 {
4948     flag aSign, bSign;
4949     uint64_t av, bv;
4950 
4951     a = float64_squash_input_denormal(a, status);
4952     b = float64_squash_input_denormal(b, status);
4953     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4954          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4955        ) {
4956         float_raise(float_flag_invalid, status);
4957         return 0;
4958     }
4959     aSign = extractFloat64Sign( a );
4960     bSign = extractFloat64Sign( b );
4961     av = float64_val(a);
4962     bv = float64_val(b);
4963     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4964     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4965 
4966 }
4967 
4968 /*----------------------------------------------------------------------------
4969 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4970 | be compared, and 0 otherwise.  The invalid exception is raised if either
4971 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4972 | Standard for Binary Floating-Point Arithmetic.
4973 *----------------------------------------------------------------------------*/
4974 
4975 int float64_unordered(float64 a, float64 b, float_status *status)
4976 {
4977     a = float64_squash_input_denormal(a, status);
4978     b = float64_squash_input_denormal(b, status);
4979 
4980     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4981          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4982        ) {
4983         float_raise(float_flag_invalid, status);
4984         return 1;
4985     }
4986     return 0;
4987 }
4988 
4989 /*----------------------------------------------------------------------------
4990 | Returns 1 if the double-precision floating-point value `a' is equal to the
4991 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4992 | exception.The comparison is performed according to the IEC/IEEE Standard
4993 | for Binary Floating-Point Arithmetic.
4994 *----------------------------------------------------------------------------*/
4995 
4996 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4997 {
4998     uint64_t av, bv;
4999     a = float64_squash_input_denormal(a, status);
5000     b = float64_squash_input_denormal(b, status);
5001 
5002     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5003          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5004        ) {
5005         if (float64_is_signaling_nan(a, status)
5006          || float64_is_signaling_nan(b, status)) {
5007             float_raise(float_flag_invalid, status);
5008         }
5009         return 0;
5010     }
5011     av = float64_val(a);
5012     bv = float64_val(b);
5013     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5014 
5015 }
5016 
5017 /*----------------------------------------------------------------------------
5018 | Returns 1 if the double-precision floating-point value `a' is less than or
5019 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5020 | cause an exception.  Otherwise, the comparison is performed according to the
5021 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5022 *----------------------------------------------------------------------------*/
5023 
5024 int float64_le_quiet(float64 a, float64 b, float_status *status)
5025 {
5026     flag aSign, bSign;
5027     uint64_t av, bv;
5028     a = float64_squash_input_denormal(a, status);
5029     b = float64_squash_input_denormal(b, status);
5030 
5031     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5032          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5033        ) {
5034         if (float64_is_signaling_nan(a, status)
5035          || float64_is_signaling_nan(b, status)) {
5036             float_raise(float_flag_invalid, status);
5037         }
5038         return 0;
5039     }
5040     aSign = extractFloat64Sign( a );
5041     bSign = extractFloat64Sign( b );
5042     av = float64_val(a);
5043     bv = float64_val(b);
5044     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5045     return ( av == bv ) || ( aSign ^ ( av < bv ) );
5046 
5047 }
5048 
5049 /*----------------------------------------------------------------------------
5050 | Returns 1 if the double-precision floating-point value `a' is less than
5051 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5052 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5053 | Standard for Binary Floating-Point Arithmetic.
5054 *----------------------------------------------------------------------------*/
5055 
5056 int float64_lt_quiet(float64 a, float64 b, float_status *status)
5057 {
5058     flag aSign, bSign;
5059     uint64_t av, bv;
5060     a = float64_squash_input_denormal(a, status);
5061     b = float64_squash_input_denormal(b, status);
5062 
5063     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5064          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5065        ) {
5066         if (float64_is_signaling_nan(a, status)
5067          || float64_is_signaling_nan(b, status)) {
5068             float_raise(float_flag_invalid, status);
5069         }
5070         return 0;
5071     }
5072     aSign = extractFloat64Sign( a );
5073     bSign = extractFloat64Sign( b );
5074     av = float64_val(a);
5075     bv = float64_val(b);
5076     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5077     return ( av != bv ) && ( aSign ^ ( av < bv ) );
5078 
5079 }
5080 
5081 /*----------------------------------------------------------------------------
5082 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5083 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
5084 | comparison is performed according to the IEC/IEEE Standard for Binary
5085 | Floating-Point Arithmetic.
5086 *----------------------------------------------------------------------------*/
5087 
5088 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
5089 {
5090     a = float64_squash_input_denormal(a, status);
5091     b = float64_squash_input_denormal(b, status);
5092 
5093     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5094          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5095        ) {
5096         if (float64_is_signaling_nan(a, status)
5097          || float64_is_signaling_nan(b, status)) {
5098             float_raise(float_flag_invalid, status);
5099         }
5100         return 1;
5101     }
5102     return 0;
5103 }
5104 
5105 /*----------------------------------------------------------------------------
5106 | Returns the result of converting the extended double-precision floating-
5107 | point value `a' to the 32-bit two's complement integer format.  The
5108 | conversion is performed according to the IEC/IEEE Standard for Binary
5109 | Floating-Point Arithmetic---which means in particular that the conversion
5110 | is rounded according to the current rounding mode.  If `a' is a NaN, the
5111 | largest positive integer is returned.  Otherwise, if the conversion
5112 | overflows, the largest integer with the same sign as `a' is returned.
5113 *----------------------------------------------------------------------------*/
5114 
5115 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5116 {
5117     flag aSign;
5118     int32_t aExp, shiftCount;
5119     uint64_t aSig;
5120 
5121     if (floatx80_invalid_encoding(a)) {
5122         float_raise(float_flag_invalid, status);
5123         return 1 << 31;
5124     }
5125     aSig = extractFloatx80Frac( a );
5126     aExp = extractFloatx80Exp( a );
5127     aSign = extractFloatx80Sign( a );
5128     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5129     shiftCount = 0x4037 - aExp;
5130     if ( shiftCount <= 0 ) shiftCount = 1;
5131     shift64RightJamming( aSig, shiftCount, &aSig );
5132     return roundAndPackInt32(aSign, aSig, status);
5133 
5134 }
5135 
5136 /*----------------------------------------------------------------------------
5137 | Returns the result of converting the extended double-precision floating-
5138 | point value `a' to the 32-bit two's complement integer format.  The
5139 | conversion is performed according to the IEC/IEEE Standard for Binary
5140 | Floating-Point Arithmetic, except that the conversion is always rounded
5141 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5142 | Otherwise, if the conversion overflows, the largest integer with the same
5143 | sign as `a' is returned.
5144 *----------------------------------------------------------------------------*/
5145 
5146 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5147 {
5148     flag aSign;
5149     int32_t aExp, shiftCount;
5150     uint64_t aSig, savedASig;
5151     int32_t z;
5152 
5153     if (floatx80_invalid_encoding(a)) {
5154         float_raise(float_flag_invalid, status);
5155         return 1 << 31;
5156     }
5157     aSig = extractFloatx80Frac( a );
5158     aExp = extractFloatx80Exp( a );
5159     aSign = extractFloatx80Sign( a );
5160     if ( 0x401E < aExp ) {
5161         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5162         goto invalid;
5163     }
5164     else if ( aExp < 0x3FFF ) {
5165         if (aExp || aSig) {
5166             status->float_exception_flags |= float_flag_inexact;
5167         }
5168         return 0;
5169     }
5170     shiftCount = 0x403E - aExp;
5171     savedASig = aSig;
5172     aSig >>= shiftCount;
5173     z = aSig;
5174     if ( aSign ) z = - z;
5175     if ( ( z < 0 ) ^ aSign ) {
5176  invalid:
5177         float_raise(float_flag_invalid, status);
5178         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5179     }
5180     if ( ( aSig<<shiftCount ) != savedASig ) {
5181         status->float_exception_flags |= float_flag_inexact;
5182     }
5183     return z;
5184 
5185 }
5186 
5187 /*----------------------------------------------------------------------------
5188 | Returns the result of converting the extended double-precision floating-
5189 | point value `a' to the 64-bit two's complement integer format.  The
5190 | conversion is performed according to the IEC/IEEE Standard for Binary
5191 | Floating-Point Arithmetic---which means in particular that the conversion
5192 | is rounded according to the current rounding mode.  If `a' is a NaN,
5193 | the largest positive integer is returned.  Otherwise, if the conversion
5194 | overflows, the largest integer with the same sign as `a' is returned.
5195 *----------------------------------------------------------------------------*/
5196 
5197 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5198 {
5199     flag aSign;
5200     int32_t aExp, shiftCount;
5201     uint64_t aSig, aSigExtra;
5202 
5203     if (floatx80_invalid_encoding(a)) {
5204         float_raise(float_flag_invalid, status);
5205         return 1ULL << 63;
5206     }
5207     aSig = extractFloatx80Frac( a );
5208     aExp = extractFloatx80Exp( a );
5209     aSign = extractFloatx80Sign( a );
5210     shiftCount = 0x403E - aExp;
5211     if ( shiftCount <= 0 ) {
5212         if ( shiftCount ) {
5213             float_raise(float_flag_invalid, status);
5214             if (!aSign || floatx80_is_any_nan(a)) {
5215                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5216             }
5217             return (int64_t) LIT64( 0x8000000000000000 );
5218         }
5219         aSigExtra = 0;
5220     }
5221     else {
5222         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5223     }
5224     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5225 
5226 }
5227 
5228 /*----------------------------------------------------------------------------
5229 | Returns the result of converting the extended double-precision floating-
5230 | point value `a' to the 64-bit two's complement integer format.  The
5231 | conversion is performed according to the IEC/IEEE Standard for Binary
5232 | Floating-Point Arithmetic, except that the conversion is always rounded
5233 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5234 | Otherwise, if the conversion overflows, the largest integer with the same
5235 | sign as `a' is returned.
5236 *----------------------------------------------------------------------------*/
5237 
5238 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5239 {
5240     flag aSign;
5241     int32_t aExp, shiftCount;
5242     uint64_t aSig;
5243     int64_t z;
5244 
5245     if (floatx80_invalid_encoding(a)) {
5246         float_raise(float_flag_invalid, status);
5247         return 1ULL << 63;
5248     }
5249     aSig = extractFloatx80Frac( a );
5250     aExp = extractFloatx80Exp( a );
5251     aSign = extractFloatx80Sign( a );
5252     shiftCount = aExp - 0x403E;
5253     if ( 0 <= shiftCount ) {
5254         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
5255         if ( ( a.high != 0xC03E ) || aSig ) {
5256             float_raise(float_flag_invalid, status);
5257             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5258                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5259             }
5260         }
5261         return (int64_t) LIT64( 0x8000000000000000 );
5262     }
5263     else if ( aExp < 0x3FFF ) {
5264         if (aExp | aSig) {
5265             status->float_exception_flags |= float_flag_inexact;
5266         }
5267         return 0;
5268     }
5269     z = aSig>>( - shiftCount );
5270     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5271         status->float_exception_flags |= float_flag_inexact;
5272     }
5273     if ( aSign ) z = - z;
5274     return z;
5275 
5276 }
5277 
5278 /*----------------------------------------------------------------------------
5279 | Returns the result of converting the extended double-precision floating-
5280 | point value `a' to the single-precision floating-point format.  The
5281 | conversion is performed according to the IEC/IEEE Standard for Binary
5282 | Floating-Point Arithmetic.
5283 *----------------------------------------------------------------------------*/
5284 
5285 float32 floatx80_to_float32(floatx80 a, float_status *status)
5286 {
5287     flag aSign;
5288     int32_t aExp;
5289     uint64_t aSig;
5290 
5291     if (floatx80_invalid_encoding(a)) {
5292         float_raise(float_flag_invalid, status);
5293         return float32_default_nan(status);
5294     }
5295     aSig = extractFloatx80Frac( a );
5296     aExp = extractFloatx80Exp( a );
5297     aSign = extractFloatx80Sign( a );
5298     if ( aExp == 0x7FFF ) {
5299         if ( (uint64_t) ( aSig<<1 ) ) {
5300             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5301         }
5302         return packFloat32( aSign, 0xFF, 0 );
5303     }
5304     shift64RightJamming( aSig, 33, &aSig );
5305     if ( aExp || aSig ) aExp -= 0x3F81;
5306     return roundAndPackFloat32(aSign, aExp, aSig, status);
5307 
5308 }
5309 
5310 /*----------------------------------------------------------------------------
5311 | Returns the result of converting the extended double-precision floating-
5312 | point value `a' to the double-precision floating-point format.  The
5313 | conversion is performed according to the IEC/IEEE Standard for Binary
5314 | Floating-Point Arithmetic.
5315 *----------------------------------------------------------------------------*/
5316 
5317 float64 floatx80_to_float64(floatx80 a, float_status *status)
5318 {
5319     flag aSign;
5320     int32_t aExp;
5321     uint64_t aSig, zSig;
5322 
5323     if (floatx80_invalid_encoding(a)) {
5324         float_raise(float_flag_invalid, status);
5325         return float64_default_nan(status);
5326     }
5327     aSig = extractFloatx80Frac( a );
5328     aExp = extractFloatx80Exp( a );
5329     aSign = extractFloatx80Sign( a );
5330     if ( aExp == 0x7FFF ) {
5331         if ( (uint64_t) ( aSig<<1 ) ) {
5332             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5333         }
5334         return packFloat64( aSign, 0x7FF, 0 );
5335     }
5336     shift64RightJamming( aSig, 1, &zSig );
5337     if ( aExp || aSig ) aExp -= 0x3C01;
5338     return roundAndPackFloat64(aSign, aExp, zSig, status);
5339 
5340 }
5341 
5342 /*----------------------------------------------------------------------------
5343 | Returns the result of converting the extended double-precision floating-
5344 | point value `a' to the quadruple-precision floating-point format.  The
5345 | conversion is performed according to the IEC/IEEE Standard for Binary
5346 | Floating-Point Arithmetic.
5347 *----------------------------------------------------------------------------*/
5348 
5349 float128 floatx80_to_float128(floatx80 a, float_status *status)
5350 {
5351     flag aSign;
5352     int aExp;
5353     uint64_t aSig, zSig0, zSig1;
5354 
5355     if (floatx80_invalid_encoding(a)) {
5356         float_raise(float_flag_invalid, status);
5357         return float128_default_nan(status);
5358     }
5359     aSig = extractFloatx80Frac( a );
5360     aExp = extractFloatx80Exp( a );
5361     aSign = extractFloatx80Sign( a );
5362     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5363         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5364     }
5365     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5366     return packFloat128( aSign, aExp, zSig0, zSig1 );
5367 
5368 }
5369 
5370 /*----------------------------------------------------------------------------
5371 | Rounds the extended double-precision floating-point value `a'
5372 | to the precision provided by floatx80_rounding_precision and returns the
5373 | result as an extended double-precision floating-point value.
5374 | The operation is performed according to the IEC/IEEE Standard for Binary
5375 | Floating-Point Arithmetic.
5376 *----------------------------------------------------------------------------*/
5377 
5378 floatx80 floatx80_round(floatx80 a, float_status *status)
5379 {
5380     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5381                                 extractFloatx80Sign(a),
5382                                 extractFloatx80Exp(a),
5383                                 extractFloatx80Frac(a), 0, status);
5384 }
5385 
5386 /*----------------------------------------------------------------------------
5387 | Rounds the extended double-precision floating-point value `a' to an integer,
5388 | and returns the result as an extended quadruple-precision floating-point
5389 | value.  The operation is performed according to the IEC/IEEE Standard for
5390 | Binary Floating-Point Arithmetic.
5391 *----------------------------------------------------------------------------*/
5392 
5393 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5394 {
5395     flag aSign;
5396     int32_t aExp;
5397     uint64_t lastBitMask, roundBitsMask;
5398     floatx80 z;
5399 
5400     if (floatx80_invalid_encoding(a)) {
5401         float_raise(float_flag_invalid, status);
5402         return floatx80_default_nan(status);
5403     }
5404     aExp = extractFloatx80Exp( a );
5405     if ( 0x403E <= aExp ) {
5406         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5407             return propagateFloatx80NaN(a, a, status);
5408         }
5409         return a;
5410     }
5411     if ( aExp < 0x3FFF ) {
5412         if (    ( aExp == 0 )
5413              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5414             return a;
5415         }
5416         status->float_exception_flags |= float_flag_inexact;
5417         aSign = extractFloatx80Sign( a );
5418         switch (status->float_rounding_mode) {
5419          case float_round_nearest_even:
5420             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5421                ) {
5422                 return
5423                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5424             }
5425             break;
5426         case float_round_ties_away:
5427             if (aExp == 0x3FFE) {
5428                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5429             }
5430             break;
5431          case float_round_down:
5432             return
5433                   aSign ?
5434                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5435                 : packFloatx80( 0, 0, 0 );
5436          case float_round_up:
5437             return
5438                   aSign ? packFloatx80( 1, 0, 0 )
5439                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5440         }
5441         return packFloatx80( aSign, 0, 0 );
5442     }
5443     lastBitMask = 1;
5444     lastBitMask <<= 0x403E - aExp;
5445     roundBitsMask = lastBitMask - 1;
5446     z = a;
5447     switch (status->float_rounding_mode) {
5448     case float_round_nearest_even:
5449         z.low += lastBitMask>>1;
5450         if ((z.low & roundBitsMask) == 0) {
5451             z.low &= ~lastBitMask;
5452         }
5453         break;
5454     case float_round_ties_away:
5455         z.low += lastBitMask >> 1;
5456         break;
5457     case float_round_to_zero:
5458         break;
5459     case float_round_up:
5460         if (!extractFloatx80Sign(z)) {
5461             z.low += roundBitsMask;
5462         }
5463         break;
5464     case float_round_down:
5465         if (extractFloatx80Sign(z)) {
5466             z.low += roundBitsMask;
5467         }
5468         break;
5469     default:
5470         abort();
5471     }
5472     z.low &= ~ roundBitsMask;
5473     if ( z.low == 0 ) {
5474         ++z.high;
5475         z.low = LIT64( 0x8000000000000000 );
5476     }
5477     if (z.low != a.low) {
5478         status->float_exception_flags |= float_flag_inexact;
5479     }
5480     return z;
5481 
5482 }
5483 
5484 /*----------------------------------------------------------------------------
5485 | Returns the result of adding the absolute values of the extended double-
5486 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5487 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5488 | The addition is performed according to the IEC/IEEE Standard for Binary
5489 | Floating-Point Arithmetic.
5490 *----------------------------------------------------------------------------*/
5491 
5492 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5493                                 float_status *status)
5494 {
5495     int32_t aExp, bExp, zExp;
5496     uint64_t aSig, bSig, zSig0, zSig1;
5497     int32_t expDiff;
5498 
5499     aSig = extractFloatx80Frac( a );
5500     aExp = extractFloatx80Exp( a );
5501     bSig = extractFloatx80Frac( b );
5502     bExp = extractFloatx80Exp( b );
5503     expDiff = aExp - bExp;
5504     if ( 0 < expDiff ) {
5505         if ( aExp == 0x7FFF ) {
5506             if ((uint64_t)(aSig << 1)) {
5507                 return propagateFloatx80NaN(a, b, status);
5508             }
5509             return a;
5510         }
5511         if ( bExp == 0 ) --expDiff;
5512         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5513         zExp = aExp;
5514     }
5515     else if ( expDiff < 0 ) {
5516         if ( bExp == 0x7FFF ) {
5517             if ((uint64_t)(bSig << 1)) {
5518                 return propagateFloatx80NaN(a, b, status);
5519             }
5520             return packFloatx80(zSign,
5521                                 floatx80_infinity_high,
5522                                 floatx80_infinity_low);
5523         }
5524         if ( aExp == 0 ) ++expDiff;
5525         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5526         zExp = bExp;
5527     }
5528     else {
5529         if ( aExp == 0x7FFF ) {
5530             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5531                 return propagateFloatx80NaN(a, b, status);
5532             }
5533             return a;
5534         }
5535         zSig1 = 0;
5536         zSig0 = aSig + bSig;
5537         if ( aExp == 0 ) {
5538             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5539             goto roundAndPack;
5540         }
5541         zExp = aExp;
5542         goto shiftRight1;
5543     }
5544     zSig0 = aSig + bSig;
5545     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5546  shiftRight1:
5547     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5548     zSig0 |= LIT64( 0x8000000000000000 );
5549     ++zExp;
5550  roundAndPack:
5551     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5552                                 zSign, zExp, zSig0, zSig1, status);
5553 }
5554 
5555 /*----------------------------------------------------------------------------
5556 | Returns the result of subtracting the absolute values of the extended
5557 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5558 | difference is negated before being returned.  `zSign' is ignored if the
5559 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5560 | Standard for Binary Floating-Point Arithmetic.
5561 *----------------------------------------------------------------------------*/
5562 
5563 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5564                                 float_status *status)
5565 {
5566     int32_t aExp, bExp, zExp;
5567     uint64_t aSig, bSig, zSig0, zSig1;
5568     int32_t expDiff;
5569 
5570     aSig = extractFloatx80Frac( a );
5571     aExp = extractFloatx80Exp( a );
5572     bSig = extractFloatx80Frac( b );
5573     bExp = extractFloatx80Exp( b );
5574     expDiff = aExp - bExp;
5575     if ( 0 < expDiff ) goto aExpBigger;
5576     if ( expDiff < 0 ) goto bExpBigger;
5577     if ( aExp == 0x7FFF ) {
5578         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5579             return propagateFloatx80NaN(a, b, status);
5580         }
5581         float_raise(float_flag_invalid, status);
5582         return floatx80_default_nan(status);
5583     }
5584     if ( aExp == 0 ) {
5585         aExp = 1;
5586         bExp = 1;
5587     }
5588     zSig1 = 0;
5589     if ( bSig < aSig ) goto aBigger;
5590     if ( aSig < bSig ) goto bBigger;
5591     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5592  bExpBigger:
5593     if ( bExp == 0x7FFF ) {
5594         if ((uint64_t)(bSig << 1)) {
5595             return propagateFloatx80NaN(a, b, status);
5596         }
5597         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5598                             floatx80_infinity_low);
5599     }
5600     if ( aExp == 0 ) ++expDiff;
5601     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5602  bBigger:
5603     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5604     zExp = bExp;
5605     zSign ^= 1;
5606     goto normalizeRoundAndPack;
5607  aExpBigger:
5608     if ( aExp == 0x7FFF ) {
5609         if ((uint64_t)(aSig << 1)) {
5610             return propagateFloatx80NaN(a, b, status);
5611         }
5612         return a;
5613     }
5614     if ( bExp == 0 ) --expDiff;
5615     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5616  aBigger:
5617     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5618     zExp = aExp;
5619  normalizeRoundAndPack:
5620     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5621                                          zSign, zExp, zSig0, zSig1, status);
5622 }
5623 
5624 /*----------------------------------------------------------------------------
5625 | Returns the result of adding the extended double-precision floating-point
5626 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5627 | Standard for Binary Floating-Point Arithmetic.
5628 *----------------------------------------------------------------------------*/
5629 
5630 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5631 {
5632     flag aSign, bSign;
5633 
5634     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5635         float_raise(float_flag_invalid, status);
5636         return floatx80_default_nan(status);
5637     }
5638     aSign = extractFloatx80Sign( a );
5639     bSign = extractFloatx80Sign( b );
5640     if ( aSign == bSign ) {
5641         return addFloatx80Sigs(a, b, aSign, status);
5642     }
5643     else {
5644         return subFloatx80Sigs(a, b, aSign, status);
5645     }
5646 
5647 }
5648 
5649 /*----------------------------------------------------------------------------
5650 | Returns the result of subtracting the extended double-precision floating-
5651 | point values `a' and `b'.  The operation is performed according to the
5652 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5653 *----------------------------------------------------------------------------*/
5654 
5655 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5656 {
5657     flag aSign, bSign;
5658 
5659     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5660         float_raise(float_flag_invalid, status);
5661         return floatx80_default_nan(status);
5662     }
5663     aSign = extractFloatx80Sign( a );
5664     bSign = extractFloatx80Sign( b );
5665     if ( aSign == bSign ) {
5666         return subFloatx80Sigs(a, b, aSign, status);
5667     }
5668     else {
5669         return addFloatx80Sigs(a, b, aSign, status);
5670     }
5671 
5672 }
5673 
5674 /*----------------------------------------------------------------------------
5675 | Returns the result of multiplying the extended double-precision floating-
5676 | point values `a' and `b'.  The operation is performed according to the
5677 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5678 *----------------------------------------------------------------------------*/
5679 
5680 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5681 {
5682     flag aSign, bSign, zSign;
5683     int32_t aExp, bExp, zExp;
5684     uint64_t aSig, bSig, zSig0, zSig1;
5685 
5686     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5687         float_raise(float_flag_invalid, status);
5688         return floatx80_default_nan(status);
5689     }
5690     aSig = extractFloatx80Frac( a );
5691     aExp = extractFloatx80Exp( a );
5692     aSign = extractFloatx80Sign( a );
5693     bSig = extractFloatx80Frac( b );
5694     bExp = extractFloatx80Exp( b );
5695     bSign = extractFloatx80Sign( b );
5696     zSign = aSign ^ bSign;
5697     if ( aExp == 0x7FFF ) {
5698         if (    (uint64_t) ( aSig<<1 )
5699              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5700             return propagateFloatx80NaN(a, b, status);
5701         }
5702         if ( ( bExp | bSig ) == 0 ) goto invalid;
5703         return packFloatx80(zSign, floatx80_infinity_high,
5704                                    floatx80_infinity_low);
5705     }
5706     if ( bExp == 0x7FFF ) {
5707         if ((uint64_t)(bSig << 1)) {
5708             return propagateFloatx80NaN(a, b, status);
5709         }
5710         if ( ( aExp | aSig ) == 0 ) {
5711  invalid:
5712             float_raise(float_flag_invalid, status);
5713             return floatx80_default_nan(status);
5714         }
5715         return packFloatx80(zSign, floatx80_infinity_high,
5716                                    floatx80_infinity_low);
5717     }
5718     if ( aExp == 0 ) {
5719         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5720         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5721     }
5722     if ( bExp == 0 ) {
5723         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5724         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5725     }
5726     zExp = aExp + bExp - 0x3FFE;
5727     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5728     if ( 0 < (int64_t) zSig0 ) {
5729         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5730         --zExp;
5731     }
5732     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5733                                 zSign, zExp, zSig0, zSig1, status);
5734 }
5735 
5736 /*----------------------------------------------------------------------------
5737 | Returns the result of dividing the extended double-precision floating-point
5738 | value `a' by the corresponding value `b'.  The operation is performed
5739 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5740 *----------------------------------------------------------------------------*/
5741 
5742 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5743 {
5744     flag aSign, bSign, zSign;
5745     int32_t aExp, bExp, zExp;
5746     uint64_t aSig, bSig, zSig0, zSig1;
5747     uint64_t rem0, rem1, rem2, term0, term1, term2;
5748 
5749     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5750         float_raise(float_flag_invalid, status);
5751         return floatx80_default_nan(status);
5752     }
5753     aSig = extractFloatx80Frac( a );
5754     aExp = extractFloatx80Exp( a );
5755     aSign = extractFloatx80Sign( a );
5756     bSig = extractFloatx80Frac( b );
5757     bExp = extractFloatx80Exp( b );
5758     bSign = extractFloatx80Sign( b );
5759     zSign = aSign ^ bSign;
5760     if ( aExp == 0x7FFF ) {
5761         if ((uint64_t)(aSig << 1)) {
5762             return propagateFloatx80NaN(a, b, status);
5763         }
5764         if ( bExp == 0x7FFF ) {
5765             if ((uint64_t)(bSig << 1)) {
5766                 return propagateFloatx80NaN(a, b, status);
5767             }
5768             goto invalid;
5769         }
5770         return packFloatx80(zSign, floatx80_infinity_high,
5771                                    floatx80_infinity_low);
5772     }
5773     if ( bExp == 0x7FFF ) {
5774         if ((uint64_t)(bSig << 1)) {
5775             return propagateFloatx80NaN(a, b, status);
5776         }
5777         return packFloatx80( zSign, 0, 0 );
5778     }
5779     if ( bExp == 0 ) {
5780         if ( bSig == 0 ) {
5781             if ( ( aExp | aSig ) == 0 ) {
5782  invalid:
5783                 float_raise(float_flag_invalid, status);
5784                 return floatx80_default_nan(status);
5785             }
5786             float_raise(float_flag_divbyzero, status);
5787             return packFloatx80(zSign, floatx80_infinity_high,
5788                                        floatx80_infinity_low);
5789         }
5790         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5791     }
5792     if ( aExp == 0 ) {
5793         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5794         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5795     }
5796     zExp = aExp - bExp + 0x3FFE;
5797     rem1 = 0;
5798     if ( bSig <= aSig ) {
5799         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5800         ++zExp;
5801     }
5802     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5803     mul64To128( bSig, zSig0, &term0, &term1 );
5804     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5805     while ( (int64_t) rem0 < 0 ) {
5806         --zSig0;
5807         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5808     }
5809     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5810     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5811         mul64To128( bSig, zSig1, &term1, &term2 );
5812         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5813         while ( (int64_t) rem1 < 0 ) {
5814             --zSig1;
5815             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5816         }
5817         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5818     }
5819     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5820                                 zSign, zExp, zSig0, zSig1, status);
5821 }
5822 
5823 /*----------------------------------------------------------------------------
5824 | Returns the remainder of the extended double-precision floating-point value
5825 | `a' with respect to the corresponding value `b'.  The operation is performed
5826 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5827 *----------------------------------------------------------------------------*/
5828 
5829 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5830 {
5831     flag aSign, zSign;
5832     int32_t aExp, bExp, expDiff;
5833     uint64_t aSig0, aSig1, bSig;
5834     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5835 
5836     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5837         float_raise(float_flag_invalid, status);
5838         return floatx80_default_nan(status);
5839     }
5840     aSig0 = extractFloatx80Frac( a );
5841     aExp = extractFloatx80Exp( a );
5842     aSign = extractFloatx80Sign( a );
5843     bSig = extractFloatx80Frac( b );
5844     bExp = extractFloatx80Exp( b );
5845     if ( aExp == 0x7FFF ) {
5846         if (    (uint64_t) ( aSig0<<1 )
5847              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5848             return propagateFloatx80NaN(a, b, status);
5849         }
5850         goto invalid;
5851     }
5852     if ( bExp == 0x7FFF ) {
5853         if ((uint64_t)(bSig << 1)) {
5854             return propagateFloatx80NaN(a, b, status);
5855         }
5856         return a;
5857     }
5858     if ( bExp == 0 ) {
5859         if ( bSig == 0 ) {
5860  invalid:
5861             float_raise(float_flag_invalid, status);
5862             return floatx80_default_nan(status);
5863         }
5864         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5865     }
5866     if ( aExp == 0 ) {
5867         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5868         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5869     }
5870     bSig |= LIT64( 0x8000000000000000 );
5871     zSign = aSign;
5872     expDiff = aExp - bExp;
5873     aSig1 = 0;
5874     if ( expDiff < 0 ) {
5875         if ( expDiff < -1 ) return a;
5876         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5877         expDiff = 0;
5878     }
5879     q = ( bSig <= aSig0 );
5880     if ( q ) aSig0 -= bSig;
5881     expDiff -= 64;
5882     while ( 0 < expDiff ) {
5883         q = estimateDiv128To64( aSig0, aSig1, bSig );
5884         q = ( 2 < q ) ? q - 2 : 0;
5885         mul64To128( bSig, q, &term0, &term1 );
5886         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5887         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5888         expDiff -= 62;
5889     }
5890     expDiff += 64;
5891     if ( 0 < expDiff ) {
5892         q = estimateDiv128To64( aSig0, aSig1, bSig );
5893         q = ( 2 < q ) ? q - 2 : 0;
5894         q >>= 64 - expDiff;
5895         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5896         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5897         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5898         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5899             ++q;
5900             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5901         }
5902     }
5903     else {
5904         term1 = 0;
5905         term0 = bSig;
5906     }
5907     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5908     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5909          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5910               && ( q & 1 ) )
5911        ) {
5912         aSig0 = alternateASig0;
5913         aSig1 = alternateASig1;
5914         zSign = ! zSign;
5915     }
5916     return
5917         normalizeRoundAndPackFloatx80(
5918             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5919 
5920 }
5921 
5922 /*----------------------------------------------------------------------------
5923 | Returns the square root of the extended double-precision floating-point
5924 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5925 | for Binary Floating-Point Arithmetic.
5926 *----------------------------------------------------------------------------*/
5927 
5928 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5929 {
5930     flag aSign;
5931     int32_t aExp, zExp;
5932     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5933     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5934 
5935     if (floatx80_invalid_encoding(a)) {
5936         float_raise(float_flag_invalid, status);
5937         return floatx80_default_nan(status);
5938     }
5939     aSig0 = extractFloatx80Frac( a );
5940     aExp = extractFloatx80Exp( a );
5941     aSign = extractFloatx80Sign( a );
5942     if ( aExp == 0x7FFF ) {
5943         if ((uint64_t)(aSig0 << 1)) {
5944             return propagateFloatx80NaN(a, a, status);
5945         }
5946         if ( ! aSign ) return a;
5947         goto invalid;
5948     }
5949     if ( aSign ) {
5950         if ( ( aExp | aSig0 ) == 0 ) return a;
5951  invalid:
5952         float_raise(float_flag_invalid, status);
5953         return floatx80_default_nan(status);
5954     }
5955     if ( aExp == 0 ) {
5956         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5957         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5958     }
5959     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5960     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5961     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5962     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5963     doubleZSig0 = zSig0<<1;
5964     mul64To128( zSig0, zSig0, &term0, &term1 );
5965     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5966     while ( (int64_t) rem0 < 0 ) {
5967         --zSig0;
5968         doubleZSig0 -= 2;
5969         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5970     }
5971     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5972     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5973         if ( zSig1 == 0 ) zSig1 = 1;
5974         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5975         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5976         mul64To128( zSig1, zSig1, &term2, &term3 );
5977         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5978         while ( (int64_t) rem1 < 0 ) {
5979             --zSig1;
5980             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5981             term3 |= 1;
5982             term2 |= doubleZSig0;
5983             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5984         }
5985         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5986     }
5987     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5988     zSig0 |= doubleZSig0;
5989     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5990                                 0, zExp, zSig0, zSig1, status);
5991 }
5992 
5993 /*----------------------------------------------------------------------------
5994 | Returns 1 if the extended double-precision floating-point value `a' is equal
5995 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5996 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5997 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5998 *----------------------------------------------------------------------------*/
5999 
6000 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
6001 {
6002 
6003     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6004         || (extractFloatx80Exp(a) == 0x7FFF
6005             && (uint64_t) (extractFloatx80Frac(a) << 1))
6006         || (extractFloatx80Exp(b) == 0x7FFF
6007             && (uint64_t) (extractFloatx80Frac(b) << 1))
6008        ) {
6009         float_raise(float_flag_invalid, status);
6010         return 0;
6011     }
6012     return
6013            ( a.low == b.low )
6014         && (    ( a.high == b.high )
6015              || (    ( a.low == 0 )
6016                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6017            );
6018 
6019 }
6020 
6021 /*----------------------------------------------------------------------------
6022 | Returns 1 if the extended double-precision floating-point value `a' is
6023 | less than or equal to the corresponding value `b', and 0 otherwise.  The
6024 | invalid exception is raised if either operand is a NaN.  The comparison is
6025 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6026 | Arithmetic.
6027 *----------------------------------------------------------------------------*/
6028 
6029 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
6030 {
6031     flag aSign, bSign;
6032 
6033     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6034         || (extractFloatx80Exp(a) == 0x7FFF
6035             && (uint64_t) (extractFloatx80Frac(a) << 1))
6036         || (extractFloatx80Exp(b) == 0x7FFF
6037             && (uint64_t) (extractFloatx80Frac(b) << 1))
6038        ) {
6039         float_raise(float_flag_invalid, status);
6040         return 0;
6041     }
6042     aSign = extractFloatx80Sign( a );
6043     bSign = extractFloatx80Sign( b );
6044     if ( aSign != bSign ) {
6045         return
6046                aSign
6047             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6048                  == 0 );
6049     }
6050     return
6051           aSign ? le128( b.high, b.low, a.high, a.low )
6052         : le128( a.high, a.low, b.high, b.low );
6053 
6054 }
6055 
6056 /*----------------------------------------------------------------------------
6057 | Returns 1 if the extended double-precision floating-point value `a' is
6058 | less than the corresponding value `b', and 0 otherwise.  The invalid
6059 | exception is raised if either operand is a NaN.  The comparison is performed
6060 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6061 *----------------------------------------------------------------------------*/
6062 
6063 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
6064 {
6065     flag aSign, bSign;
6066 
6067     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6068         || (extractFloatx80Exp(a) == 0x7FFF
6069             && (uint64_t) (extractFloatx80Frac(a) << 1))
6070         || (extractFloatx80Exp(b) == 0x7FFF
6071             && (uint64_t) (extractFloatx80Frac(b) << 1))
6072        ) {
6073         float_raise(float_flag_invalid, status);
6074         return 0;
6075     }
6076     aSign = extractFloatx80Sign( a );
6077     bSign = extractFloatx80Sign( b );
6078     if ( aSign != bSign ) {
6079         return
6080                aSign
6081             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6082                  != 0 );
6083     }
6084     return
6085           aSign ? lt128( b.high, b.low, a.high, a.low )
6086         : lt128( a.high, a.low, b.high, b.low );
6087 
6088 }
6089 
6090 /*----------------------------------------------------------------------------
6091 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6092 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
6093 | either operand is a NaN.   The comparison is performed according to the
6094 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6095 *----------------------------------------------------------------------------*/
6096 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
6097 {
6098     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6099         || (extractFloatx80Exp(a) == 0x7FFF
6100             && (uint64_t) (extractFloatx80Frac(a) << 1))
6101         || (extractFloatx80Exp(b) == 0x7FFF
6102             && (uint64_t) (extractFloatx80Frac(b) << 1))
6103        ) {
6104         float_raise(float_flag_invalid, status);
6105         return 1;
6106     }
6107     return 0;
6108 }
6109 
6110 /*----------------------------------------------------------------------------
6111 | Returns 1 if the extended double-precision floating-point value `a' is
6112 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6113 | cause an exception.  The comparison is performed according to the IEC/IEEE
6114 | Standard for Binary Floating-Point Arithmetic.
6115 *----------------------------------------------------------------------------*/
6116 
6117 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
6118 {
6119 
6120     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6121         float_raise(float_flag_invalid, status);
6122         return 0;
6123     }
6124     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6125               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6126          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6127               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6128        ) {
6129         if (floatx80_is_signaling_nan(a, status)
6130          || floatx80_is_signaling_nan(b, status)) {
6131             float_raise(float_flag_invalid, status);
6132         }
6133         return 0;
6134     }
6135     return
6136            ( a.low == b.low )
6137         && (    ( a.high == b.high )
6138              || (    ( a.low == 0 )
6139                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6140            );
6141 
6142 }
6143 
6144 /*----------------------------------------------------------------------------
6145 | Returns 1 if the extended double-precision floating-point value `a' is less
6146 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
6147 | do not cause an exception.  Otherwise, the comparison is performed according
6148 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6149 *----------------------------------------------------------------------------*/
6150 
6151 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
6152 {
6153     flag aSign, bSign;
6154 
6155     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6156         float_raise(float_flag_invalid, status);
6157         return 0;
6158     }
6159     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6160               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6161          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6162               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6163        ) {
6164         if (floatx80_is_signaling_nan(a, status)
6165          || floatx80_is_signaling_nan(b, status)) {
6166             float_raise(float_flag_invalid, status);
6167         }
6168         return 0;
6169     }
6170     aSign = extractFloatx80Sign( a );
6171     bSign = extractFloatx80Sign( b );
6172     if ( aSign != bSign ) {
6173         return
6174                aSign
6175             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6176                  == 0 );
6177     }
6178     return
6179           aSign ? le128( b.high, b.low, a.high, a.low )
6180         : le128( a.high, a.low, b.high, b.low );
6181 
6182 }
6183 
6184 /*----------------------------------------------------------------------------
6185 | Returns 1 if the extended double-precision floating-point value `a' is less
6186 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
6187 | an exception.  Otherwise, the comparison is performed according to the
6188 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6189 *----------------------------------------------------------------------------*/
6190 
6191 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
6192 {
6193     flag aSign, bSign;
6194 
6195     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6196         float_raise(float_flag_invalid, status);
6197         return 0;
6198     }
6199     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6200               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6201          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6202               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6203        ) {
6204         if (floatx80_is_signaling_nan(a, status)
6205          || floatx80_is_signaling_nan(b, status)) {
6206             float_raise(float_flag_invalid, status);
6207         }
6208         return 0;
6209     }
6210     aSign = extractFloatx80Sign( a );
6211     bSign = extractFloatx80Sign( b );
6212     if ( aSign != bSign ) {
6213         return
6214                aSign
6215             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6216                  != 0 );
6217     }
6218     return
6219           aSign ? lt128( b.high, b.low, a.high, a.low )
6220         : lt128( a.high, a.low, b.high, b.low );
6221 
6222 }
6223 
6224 /*----------------------------------------------------------------------------
6225 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6226 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
6227 | The comparison is performed according to the IEC/IEEE Standard for Binary
6228 | Floating-Point Arithmetic.
6229 *----------------------------------------------------------------------------*/
6230 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
6231 {
6232     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6233         float_raise(float_flag_invalid, status);
6234         return 1;
6235     }
6236     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6237               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6238          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6239               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6240        ) {
6241         if (floatx80_is_signaling_nan(a, status)
6242          || floatx80_is_signaling_nan(b, status)) {
6243             float_raise(float_flag_invalid, status);
6244         }
6245         return 1;
6246     }
6247     return 0;
6248 }
6249 
6250 /*----------------------------------------------------------------------------
6251 | Returns the result of converting the quadruple-precision floating-point
6252 | value `a' to the 32-bit two's complement integer format.  The conversion
6253 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6254 | Arithmetic---which means in particular that the conversion is rounded
6255 | according to the current rounding mode.  If `a' is a NaN, the largest
6256 | positive integer is returned.  Otherwise, if the conversion overflows, the
6257 | largest integer with the same sign as `a' is returned.
6258 *----------------------------------------------------------------------------*/
6259 
6260 int32_t float128_to_int32(float128 a, float_status *status)
6261 {
6262     flag aSign;
6263     int32_t aExp, shiftCount;
6264     uint64_t aSig0, aSig1;
6265 
6266     aSig1 = extractFloat128Frac1( a );
6267     aSig0 = extractFloat128Frac0( a );
6268     aExp = extractFloat128Exp( a );
6269     aSign = extractFloat128Sign( a );
6270     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6271     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6272     aSig0 |= ( aSig1 != 0 );
6273     shiftCount = 0x4028 - aExp;
6274     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6275     return roundAndPackInt32(aSign, aSig0, status);
6276 
6277 }
6278 
6279 /*----------------------------------------------------------------------------
6280 | Returns the result of converting the quadruple-precision floating-point
6281 | value `a' to the 32-bit two's complement integer format.  The conversion
6282 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6283 | Arithmetic, except that the conversion is always rounded toward zero.  If
6284 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6285 | conversion overflows, the largest integer with the same sign as `a' is
6286 | returned.
6287 *----------------------------------------------------------------------------*/
6288 
6289 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6290 {
6291     flag aSign;
6292     int32_t aExp, shiftCount;
6293     uint64_t aSig0, aSig1, savedASig;
6294     int32_t z;
6295 
6296     aSig1 = extractFloat128Frac1( a );
6297     aSig0 = extractFloat128Frac0( a );
6298     aExp = extractFloat128Exp( a );
6299     aSign = extractFloat128Sign( a );
6300     aSig0 |= ( aSig1 != 0 );
6301     if ( 0x401E < aExp ) {
6302         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6303         goto invalid;
6304     }
6305     else if ( aExp < 0x3FFF ) {
6306         if (aExp || aSig0) {
6307             status->float_exception_flags |= float_flag_inexact;
6308         }
6309         return 0;
6310     }
6311     aSig0 |= LIT64( 0x0001000000000000 );
6312     shiftCount = 0x402F - aExp;
6313     savedASig = aSig0;
6314     aSig0 >>= shiftCount;
6315     z = aSig0;
6316     if ( aSign ) z = - z;
6317     if ( ( z < 0 ) ^ aSign ) {
6318  invalid:
6319         float_raise(float_flag_invalid, status);
6320         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
6321     }
6322     if ( ( aSig0<<shiftCount ) != savedASig ) {
6323         status->float_exception_flags |= float_flag_inexact;
6324     }
6325     return z;
6326 
6327 }
6328 
6329 /*----------------------------------------------------------------------------
6330 | Returns the result of converting the quadruple-precision floating-point
6331 | value `a' to the 64-bit two's complement integer format.  The conversion
6332 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6333 | Arithmetic---which means in particular that the conversion is rounded
6334 | according to the current rounding mode.  If `a' is a NaN, the largest
6335 | positive integer is returned.  Otherwise, if the conversion overflows, the
6336 | largest integer with the same sign as `a' is returned.
6337 *----------------------------------------------------------------------------*/
6338 
6339 int64_t float128_to_int64(float128 a, float_status *status)
6340 {
6341     flag aSign;
6342     int32_t aExp, shiftCount;
6343     uint64_t aSig0, aSig1;
6344 
6345     aSig1 = extractFloat128Frac1( a );
6346     aSig0 = extractFloat128Frac0( a );
6347     aExp = extractFloat128Exp( a );
6348     aSign = extractFloat128Sign( a );
6349     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6350     shiftCount = 0x402F - aExp;
6351     if ( shiftCount <= 0 ) {
6352         if ( 0x403E < aExp ) {
6353             float_raise(float_flag_invalid, status);
6354             if (    ! aSign
6355                  || (    ( aExp == 0x7FFF )
6356                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
6357                     )
6358                ) {
6359                 return LIT64( 0x7FFFFFFFFFFFFFFF );
6360             }
6361             return (int64_t) LIT64( 0x8000000000000000 );
6362         }
6363         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6364     }
6365     else {
6366         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6367     }
6368     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6369 
6370 }
6371 
6372 /*----------------------------------------------------------------------------
6373 | Returns the result of converting the quadruple-precision floating-point
6374 | value `a' to the 64-bit two's complement integer format.  The conversion
6375 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6376 | Arithmetic, except that the conversion is always rounded toward zero.
6377 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6378 | the conversion overflows, the largest integer with the same sign as `a' is
6379 | returned.
6380 *----------------------------------------------------------------------------*/
6381 
6382 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6383 {
6384     flag aSign;
6385     int32_t aExp, shiftCount;
6386     uint64_t aSig0, aSig1;
6387     int64_t z;
6388 
6389     aSig1 = extractFloat128Frac1( a );
6390     aSig0 = extractFloat128Frac0( a );
6391     aExp = extractFloat128Exp( a );
6392     aSign = extractFloat128Sign( a );
6393     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6394     shiftCount = aExp - 0x402F;
6395     if ( 0 < shiftCount ) {
6396         if ( 0x403E <= aExp ) {
6397             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
6398             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
6399                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
6400                 if (aSig1) {
6401                     status->float_exception_flags |= float_flag_inexact;
6402                 }
6403             }
6404             else {
6405                 float_raise(float_flag_invalid, status);
6406                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6407                     return LIT64( 0x7FFFFFFFFFFFFFFF );
6408                 }
6409             }
6410             return (int64_t) LIT64( 0x8000000000000000 );
6411         }
6412         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6413         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6414             status->float_exception_flags |= float_flag_inexact;
6415         }
6416     }
6417     else {
6418         if ( aExp < 0x3FFF ) {
6419             if ( aExp | aSig0 | aSig1 ) {
6420                 status->float_exception_flags |= float_flag_inexact;
6421             }
6422             return 0;
6423         }
6424         z = aSig0>>( - shiftCount );
6425         if (    aSig1
6426              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6427             status->float_exception_flags |= float_flag_inexact;
6428         }
6429     }
6430     if ( aSign ) z = - z;
6431     return z;
6432 
6433 }
6434 
6435 /*----------------------------------------------------------------------------
6436 | Returns the result of converting the quadruple-precision floating-point value
6437 | `a' to the 64-bit unsigned integer format.  The conversion is
6438 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6439 | Arithmetic---which means in particular that the conversion is rounded
6440 | according to the current rounding mode.  If `a' is a NaN, the largest
6441 | positive integer is returned.  If the conversion overflows, the
6442 | largest unsigned integer is returned.  If 'a' is negative, the value is
6443 | rounded and zero is returned; negative values that do not round to zero
6444 | will raise the inexact exception.
6445 *----------------------------------------------------------------------------*/
6446 
6447 uint64_t float128_to_uint64(float128 a, float_status *status)
6448 {
6449     flag aSign;
6450     int aExp;
6451     int shiftCount;
6452     uint64_t aSig0, aSig1;
6453 
6454     aSig0 = extractFloat128Frac0(a);
6455     aSig1 = extractFloat128Frac1(a);
6456     aExp = extractFloat128Exp(a);
6457     aSign = extractFloat128Sign(a);
6458     if (aSign && (aExp > 0x3FFE)) {
6459         float_raise(float_flag_invalid, status);
6460         if (float128_is_any_nan(a)) {
6461             return LIT64(0xFFFFFFFFFFFFFFFF);
6462         } else {
6463             return 0;
6464         }
6465     }
6466     if (aExp) {
6467         aSig0 |= LIT64(0x0001000000000000);
6468     }
6469     shiftCount = 0x402F - aExp;
6470     if (shiftCount <= 0) {
6471         if (0x403E < aExp) {
6472             float_raise(float_flag_invalid, status);
6473             return LIT64(0xFFFFFFFFFFFFFFFF);
6474         }
6475         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6476     } else {
6477         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6478     }
6479     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6480 }
6481 
6482 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6483 {
6484     uint64_t v;
6485     signed char current_rounding_mode = status->float_rounding_mode;
6486 
6487     set_float_rounding_mode(float_round_to_zero, status);
6488     v = float128_to_uint64(a, status);
6489     set_float_rounding_mode(current_rounding_mode, status);
6490 
6491     return v;
6492 }
6493 
6494 /*----------------------------------------------------------------------------
6495 | Returns the result of converting the quadruple-precision floating-point
6496 | value `a' to the 32-bit unsigned integer format.  The conversion
6497 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6498 | Arithmetic except that the conversion is always rounded toward zero.
6499 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6500 | if the conversion overflows, the largest unsigned integer is returned.
6501 | If 'a' is negative, the value is rounded and zero is returned; negative
6502 | values that do not round to zero will raise the inexact exception.
6503 *----------------------------------------------------------------------------*/
6504 
6505 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6506 {
6507     uint64_t v;
6508     uint32_t res;
6509     int old_exc_flags = get_float_exception_flags(status);
6510 
6511     v = float128_to_uint64_round_to_zero(a, status);
6512     if (v > 0xffffffff) {
6513         res = 0xffffffff;
6514     } else {
6515         return v;
6516     }
6517     set_float_exception_flags(old_exc_flags, status);
6518     float_raise(float_flag_invalid, status);
6519     return res;
6520 }
6521 
6522 /*----------------------------------------------------------------------------
6523 | Returns the result of converting the quadruple-precision floating-point
6524 | value `a' to the single-precision floating-point format.  The conversion
6525 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6526 | Arithmetic.
6527 *----------------------------------------------------------------------------*/
6528 
6529 float32 float128_to_float32(float128 a, float_status *status)
6530 {
6531     flag aSign;
6532     int32_t aExp;
6533     uint64_t aSig0, aSig1;
6534     uint32_t zSig;
6535 
6536     aSig1 = extractFloat128Frac1( a );
6537     aSig0 = extractFloat128Frac0( a );
6538     aExp = extractFloat128Exp( a );
6539     aSign = extractFloat128Sign( a );
6540     if ( aExp == 0x7FFF ) {
6541         if ( aSig0 | aSig1 ) {
6542             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6543         }
6544         return packFloat32( aSign, 0xFF, 0 );
6545     }
6546     aSig0 |= ( aSig1 != 0 );
6547     shift64RightJamming( aSig0, 18, &aSig0 );
6548     zSig = aSig0;
6549     if ( aExp || zSig ) {
6550         zSig |= 0x40000000;
6551         aExp -= 0x3F81;
6552     }
6553     return roundAndPackFloat32(aSign, aExp, zSig, status);
6554 
6555 }
6556 
6557 /*----------------------------------------------------------------------------
6558 | Returns the result of converting the quadruple-precision floating-point
6559 | value `a' to the double-precision floating-point format.  The conversion
6560 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6561 | Arithmetic.
6562 *----------------------------------------------------------------------------*/
6563 
6564 float64 float128_to_float64(float128 a, float_status *status)
6565 {
6566     flag aSign;
6567     int32_t aExp;
6568     uint64_t aSig0, aSig1;
6569 
6570     aSig1 = extractFloat128Frac1( a );
6571     aSig0 = extractFloat128Frac0( a );
6572     aExp = extractFloat128Exp( a );
6573     aSign = extractFloat128Sign( a );
6574     if ( aExp == 0x7FFF ) {
6575         if ( aSig0 | aSig1 ) {
6576             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6577         }
6578         return packFloat64( aSign, 0x7FF, 0 );
6579     }
6580     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6581     aSig0 |= ( aSig1 != 0 );
6582     if ( aExp || aSig0 ) {
6583         aSig0 |= LIT64( 0x4000000000000000 );
6584         aExp -= 0x3C01;
6585     }
6586     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6587 
6588 }
6589 
6590 /*----------------------------------------------------------------------------
6591 | Returns the result of converting the quadruple-precision floating-point
6592 | value `a' to the extended double-precision floating-point format.  The
6593 | conversion is performed according to the IEC/IEEE Standard for Binary
6594 | Floating-Point Arithmetic.
6595 *----------------------------------------------------------------------------*/
6596 
6597 floatx80 float128_to_floatx80(float128 a, float_status *status)
6598 {
6599     flag aSign;
6600     int32_t aExp;
6601     uint64_t aSig0, aSig1;
6602 
6603     aSig1 = extractFloat128Frac1( a );
6604     aSig0 = extractFloat128Frac0( a );
6605     aExp = extractFloat128Exp( a );
6606     aSign = extractFloat128Sign( a );
6607     if ( aExp == 0x7FFF ) {
6608         if ( aSig0 | aSig1 ) {
6609             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6610         }
6611         return packFloatx80(aSign, floatx80_infinity_high,
6612                                    floatx80_infinity_low);
6613     }
6614     if ( aExp == 0 ) {
6615         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6616         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6617     }
6618     else {
6619         aSig0 |= LIT64( 0x0001000000000000 );
6620     }
6621     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6622     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6623 
6624 }
6625 
6626 /*----------------------------------------------------------------------------
6627 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6628 | returns the result as a quadruple-precision floating-point value.  The
6629 | operation is performed according to the IEC/IEEE Standard for Binary
6630 | Floating-Point Arithmetic.
6631 *----------------------------------------------------------------------------*/
6632 
6633 float128 float128_round_to_int(float128 a, float_status *status)
6634 {
6635     flag aSign;
6636     int32_t aExp;
6637     uint64_t lastBitMask, roundBitsMask;
6638     float128 z;
6639 
6640     aExp = extractFloat128Exp( a );
6641     if ( 0x402F <= aExp ) {
6642         if ( 0x406F <= aExp ) {
6643             if (    ( aExp == 0x7FFF )
6644                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6645                ) {
6646                 return propagateFloat128NaN(a, a, status);
6647             }
6648             return a;
6649         }
6650         lastBitMask = 1;
6651         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6652         roundBitsMask = lastBitMask - 1;
6653         z = a;
6654         switch (status->float_rounding_mode) {
6655         case float_round_nearest_even:
6656             if ( lastBitMask ) {
6657                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6658                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6659             }
6660             else {
6661                 if ( (int64_t) z.low < 0 ) {
6662                     ++z.high;
6663                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6664                 }
6665             }
6666             break;
6667         case float_round_ties_away:
6668             if (lastBitMask) {
6669                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6670             } else {
6671                 if ((int64_t) z.low < 0) {
6672                     ++z.high;
6673                 }
6674             }
6675             break;
6676         case float_round_to_zero:
6677             break;
6678         case float_round_up:
6679             if (!extractFloat128Sign(z)) {
6680                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6681             }
6682             break;
6683         case float_round_down:
6684             if (extractFloat128Sign(z)) {
6685                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6686             }
6687             break;
6688         default:
6689             abort();
6690         }
6691         z.low &= ~ roundBitsMask;
6692     }
6693     else {
6694         if ( aExp < 0x3FFF ) {
6695             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6696             status->float_exception_flags |= float_flag_inexact;
6697             aSign = extractFloat128Sign( a );
6698             switch (status->float_rounding_mode) {
6699              case float_round_nearest_even:
6700                 if (    ( aExp == 0x3FFE )
6701                      && (   extractFloat128Frac0( a )
6702                           | extractFloat128Frac1( a ) )
6703                    ) {
6704                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6705                 }
6706                 break;
6707             case float_round_ties_away:
6708                 if (aExp == 0x3FFE) {
6709                     return packFloat128(aSign, 0x3FFF, 0, 0);
6710                 }
6711                 break;
6712              case float_round_down:
6713                 return
6714                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6715                     : packFloat128( 0, 0, 0, 0 );
6716              case float_round_up:
6717                 return
6718                       aSign ? packFloat128( 1, 0, 0, 0 )
6719                     : packFloat128( 0, 0x3FFF, 0, 0 );
6720             }
6721             return packFloat128( aSign, 0, 0, 0 );
6722         }
6723         lastBitMask = 1;
6724         lastBitMask <<= 0x402F - aExp;
6725         roundBitsMask = lastBitMask - 1;
6726         z.low = 0;
6727         z.high = a.high;
6728         switch (status->float_rounding_mode) {
6729         case float_round_nearest_even:
6730             z.high += lastBitMask>>1;
6731             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6732                 z.high &= ~ lastBitMask;
6733             }
6734             break;
6735         case float_round_ties_away:
6736             z.high += lastBitMask>>1;
6737             break;
6738         case float_round_to_zero:
6739             break;
6740         case float_round_up:
6741             if (!extractFloat128Sign(z)) {
6742                 z.high |= ( a.low != 0 );
6743                 z.high += roundBitsMask;
6744             }
6745             break;
6746         case float_round_down:
6747             if (extractFloat128Sign(z)) {
6748                 z.high |= (a.low != 0);
6749                 z.high += roundBitsMask;
6750             }
6751             break;
6752         default:
6753             abort();
6754         }
6755         z.high &= ~ roundBitsMask;
6756     }
6757     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6758         status->float_exception_flags |= float_flag_inexact;
6759     }
6760     return z;
6761 
6762 }
6763 
6764 /*----------------------------------------------------------------------------
6765 | Returns the result of adding the absolute values of the quadruple-precision
6766 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6767 | before being returned.  `zSign' is ignored if the result is a NaN.
6768 | The addition is performed according to the IEC/IEEE Standard for Binary
6769 | Floating-Point Arithmetic.
6770 *----------------------------------------------------------------------------*/
6771 
6772 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6773                                 float_status *status)
6774 {
6775     int32_t aExp, bExp, zExp;
6776     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6777     int32_t expDiff;
6778 
6779     aSig1 = extractFloat128Frac1( a );
6780     aSig0 = extractFloat128Frac0( a );
6781     aExp = extractFloat128Exp( a );
6782     bSig1 = extractFloat128Frac1( b );
6783     bSig0 = extractFloat128Frac0( b );
6784     bExp = extractFloat128Exp( b );
6785     expDiff = aExp - bExp;
6786     if ( 0 < expDiff ) {
6787         if ( aExp == 0x7FFF ) {
6788             if (aSig0 | aSig1) {
6789                 return propagateFloat128NaN(a, b, status);
6790             }
6791             return a;
6792         }
6793         if ( bExp == 0 ) {
6794             --expDiff;
6795         }
6796         else {
6797             bSig0 |= LIT64( 0x0001000000000000 );
6798         }
6799         shift128ExtraRightJamming(
6800             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6801         zExp = aExp;
6802     }
6803     else if ( expDiff < 0 ) {
6804         if ( bExp == 0x7FFF ) {
6805             if (bSig0 | bSig1) {
6806                 return propagateFloat128NaN(a, b, status);
6807             }
6808             return packFloat128( zSign, 0x7FFF, 0, 0 );
6809         }
6810         if ( aExp == 0 ) {
6811             ++expDiff;
6812         }
6813         else {
6814             aSig0 |= LIT64( 0x0001000000000000 );
6815         }
6816         shift128ExtraRightJamming(
6817             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6818         zExp = bExp;
6819     }
6820     else {
6821         if ( aExp == 0x7FFF ) {
6822             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6823                 return propagateFloat128NaN(a, b, status);
6824             }
6825             return a;
6826         }
6827         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6828         if ( aExp == 0 ) {
6829             if (status->flush_to_zero) {
6830                 if (zSig0 | zSig1) {
6831                     float_raise(float_flag_output_denormal, status);
6832                 }
6833                 return packFloat128(zSign, 0, 0, 0);
6834             }
6835             return packFloat128( zSign, 0, zSig0, zSig1 );
6836         }
6837         zSig2 = 0;
6838         zSig0 |= LIT64( 0x0002000000000000 );
6839         zExp = aExp;
6840         goto shiftRight1;
6841     }
6842     aSig0 |= LIT64( 0x0001000000000000 );
6843     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6844     --zExp;
6845     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6846     ++zExp;
6847  shiftRight1:
6848     shift128ExtraRightJamming(
6849         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6850  roundAndPack:
6851     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6852 
6853 }
6854 
6855 /*----------------------------------------------------------------------------
6856 | Returns the result of subtracting the absolute values of the quadruple-
6857 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6858 | difference is negated before being returned.  `zSign' is ignored if the
6859 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6860 | Standard for Binary Floating-Point Arithmetic.
6861 *----------------------------------------------------------------------------*/
6862 
6863 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6864                                 float_status *status)
6865 {
6866     int32_t aExp, bExp, zExp;
6867     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6868     int32_t expDiff;
6869 
6870     aSig1 = extractFloat128Frac1( a );
6871     aSig0 = extractFloat128Frac0( a );
6872     aExp = extractFloat128Exp( a );
6873     bSig1 = extractFloat128Frac1( b );
6874     bSig0 = extractFloat128Frac0( b );
6875     bExp = extractFloat128Exp( b );
6876     expDiff = aExp - bExp;
6877     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6878     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6879     if ( 0 < expDiff ) goto aExpBigger;
6880     if ( expDiff < 0 ) goto bExpBigger;
6881     if ( aExp == 0x7FFF ) {
6882         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6883             return propagateFloat128NaN(a, b, status);
6884         }
6885         float_raise(float_flag_invalid, status);
6886         return float128_default_nan(status);
6887     }
6888     if ( aExp == 0 ) {
6889         aExp = 1;
6890         bExp = 1;
6891     }
6892     if ( bSig0 < aSig0 ) goto aBigger;
6893     if ( aSig0 < bSig0 ) goto bBigger;
6894     if ( bSig1 < aSig1 ) goto aBigger;
6895     if ( aSig1 < bSig1 ) goto bBigger;
6896     return packFloat128(status->float_rounding_mode == float_round_down,
6897                         0, 0, 0);
6898  bExpBigger:
6899     if ( bExp == 0x7FFF ) {
6900         if (bSig0 | bSig1) {
6901             return propagateFloat128NaN(a, b, status);
6902         }
6903         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6904     }
6905     if ( aExp == 0 ) {
6906         ++expDiff;
6907     }
6908     else {
6909         aSig0 |= LIT64( 0x4000000000000000 );
6910     }
6911     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6912     bSig0 |= LIT64( 0x4000000000000000 );
6913  bBigger:
6914     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6915     zExp = bExp;
6916     zSign ^= 1;
6917     goto normalizeRoundAndPack;
6918  aExpBigger:
6919     if ( aExp == 0x7FFF ) {
6920         if (aSig0 | aSig1) {
6921             return propagateFloat128NaN(a, b, status);
6922         }
6923         return a;
6924     }
6925     if ( bExp == 0 ) {
6926         --expDiff;
6927     }
6928     else {
6929         bSig0 |= LIT64( 0x4000000000000000 );
6930     }
6931     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6932     aSig0 |= LIT64( 0x4000000000000000 );
6933  aBigger:
6934     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6935     zExp = aExp;
6936  normalizeRoundAndPack:
6937     --zExp;
6938     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6939                                          status);
6940 
6941 }
6942 
6943 /*----------------------------------------------------------------------------
6944 | Returns the result of adding the quadruple-precision floating-point values
6945 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6946 | for Binary Floating-Point Arithmetic.
6947 *----------------------------------------------------------------------------*/
6948 
6949 float128 float128_add(float128 a, float128 b, float_status *status)
6950 {
6951     flag aSign, bSign;
6952 
6953     aSign = extractFloat128Sign( a );
6954     bSign = extractFloat128Sign( b );
6955     if ( aSign == bSign ) {
6956         return addFloat128Sigs(a, b, aSign, status);
6957     }
6958     else {
6959         return subFloat128Sigs(a, b, aSign, status);
6960     }
6961 
6962 }
6963 
6964 /*----------------------------------------------------------------------------
6965 | Returns the result of subtracting the quadruple-precision floating-point
6966 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6967 | Standard for Binary Floating-Point Arithmetic.
6968 *----------------------------------------------------------------------------*/
6969 
6970 float128 float128_sub(float128 a, float128 b, float_status *status)
6971 {
6972     flag aSign, bSign;
6973 
6974     aSign = extractFloat128Sign( a );
6975     bSign = extractFloat128Sign( b );
6976     if ( aSign == bSign ) {
6977         return subFloat128Sigs(a, b, aSign, status);
6978     }
6979     else {
6980         return addFloat128Sigs(a, b, aSign, status);
6981     }
6982 
6983 }
6984 
6985 /*----------------------------------------------------------------------------
6986 | Returns the result of multiplying the quadruple-precision floating-point
6987 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6988 | Standard for Binary Floating-Point Arithmetic.
6989 *----------------------------------------------------------------------------*/
6990 
6991 float128 float128_mul(float128 a, float128 b, float_status *status)
6992 {
6993     flag aSign, bSign, zSign;
6994     int32_t aExp, bExp, zExp;
6995     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6996 
6997     aSig1 = extractFloat128Frac1( a );
6998     aSig0 = extractFloat128Frac0( a );
6999     aExp = extractFloat128Exp( a );
7000     aSign = extractFloat128Sign( a );
7001     bSig1 = extractFloat128Frac1( b );
7002     bSig0 = extractFloat128Frac0( b );
7003     bExp = extractFloat128Exp( b );
7004     bSign = extractFloat128Sign( b );
7005     zSign = aSign ^ bSign;
7006     if ( aExp == 0x7FFF ) {
7007         if (    ( aSig0 | aSig1 )
7008              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7009             return propagateFloat128NaN(a, b, status);
7010         }
7011         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
7012         return packFloat128( zSign, 0x7FFF, 0, 0 );
7013     }
7014     if ( bExp == 0x7FFF ) {
7015         if (bSig0 | bSig1) {
7016             return propagateFloat128NaN(a, b, status);
7017         }
7018         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7019  invalid:
7020             float_raise(float_flag_invalid, status);
7021             return float128_default_nan(status);
7022         }
7023         return packFloat128( zSign, 0x7FFF, 0, 0 );
7024     }
7025     if ( aExp == 0 ) {
7026         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7027         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7028     }
7029     if ( bExp == 0 ) {
7030         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7031         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7032     }
7033     zExp = aExp + bExp - 0x4000;
7034     aSig0 |= LIT64( 0x0001000000000000 );
7035     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
7036     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
7037     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
7038     zSig2 |= ( zSig3 != 0 );
7039     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
7040         shift128ExtraRightJamming(
7041             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7042         ++zExp;
7043     }
7044     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7045 
7046 }
7047 
7048 /*----------------------------------------------------------------------------
7049 | Returns the result of dividing the quadruple-precision floating-point value
7050 | `a' by the corresponding value `b'.  The operation is performed according to
7051 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7052 *----------------------------------------------------------------------------*/
7053 
7054 float128 float128_div(float128 a, float128 b, float_status *status)
7055 {
7056     flag aSign, bSign, zSign;
7057     int32_t aExp, bExp, zExp;
7058     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7059     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7060 
7061     aSig1 = extractFloat128Frac1( a );
7062     aSig0 = extractFloat128Frac0( a );
7063     aExp = extractFloat128Exp( a );
7064     aSign = extractFloat128Sign( a );
7065     bSig1 = extractFloat128Frac1( b );
7066     bSig0 = extractFloat128Frac0( b );
7067     bExp = extractFloat128Exp( b );
7068     bSign = extractFloat128Sign( b );
7069     zSign = aSign ^ bSign;
7070     if ( aExp == 0x7FFF ) {
7071         if (aSig0 | aSig1) {
7072             return propagateFloat128NaN(a, b, status);
7073         }
7074         if ( bExp == 0x7FFF ) {
7075             if (bSig0 | bSig1) {
7076                 return propagateFloat128NaN(a, b, status);
7077             }
7078             goto invalid;
7079         }
7080         return packFloat128( zSign, 0x7FFF, 0, 0 );
7081     }
7082     if ( bExp == 0x7FFF ) {
7083         if (bSig0 | bSig1) {
7084             return propagateFloat128NaN(a, b, status);
7085         }
7086         return packFloat128( zSign, 0, 0, 0 );
7087     }
7088     if ( bExp == 0 ) {
7089         if ( ( bSig0 | bSig1 ) == 0 ) {
7090             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7091  invalid:
7092                 float_raise(float_flag_invalid, status);
7093                 return float128_default_nan(status);
7094             }
7095             float_raise(float_flag_divbyzero, status);
7096             return packFloat128( zSign, 0x7FFF, 0, 0 );
7097         }
7098         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7099     }
7100     if ( aExp == 0 ) {
7101         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7102         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7103     }
7104     zExp = aExp - bExp + 0x3FFD;
7105     shortShift128Left(
7106         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
7107     shortShift128Left(
7108         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
7109     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
7110         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
7111         ++zExp;
7112     }
7113     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
7114     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
7115     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
7116     while ( (int64_t) rem0 < 0 ) {
7117         --zSig0;
7118         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
7119     }
7120     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
7121     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
7122         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
7123         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
7124         while ( (int64_t) rem1 < 0 ) {
7125             --zSig1;
7126             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
7127         }
7128         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7129     }
7130     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
7131     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7132 
7133 }
7134 
7135 /*----------------------------------------------------------------------------
7136 | Returns the remainder of the quadruple-precision floating-point value `a'
7137 | with respect to the corresponding value `b'.  The operation is performed
7138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7139 *----------------------------------------------------------------------------*/
7140 
7141 float128 float128_rem(float128 a, float128 b, float_status *status)
7142 {
7143     flag aSign, zSign;
7144     int32_t aExp, bExp, expDiff;
7145     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
7146     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
7147     int64_t sigMean0;
7148 
7149     aSig1 = extractFloat128Frac1( a );
7150     aSig0 = extractFloat128Frac0( a );
7151     aExp = extractFloat128Exp( a );
7152     aSign = extractFloat128Sign( a );
7153     bSig1 = extractFloat128Frac1( b );
7154     bSig0 = extractFloat128Frac0( b );
7155     bExp = extractFloat128Exp( b );
7156     if ( aExp == 0x7FFF ) {
7157         if (    ( aSig0 | aSig1 )
7158              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7159             return propagateFloat128NaN(a, b, status);
7160         }
7161         goto invalid;
7162     }
7163     if ( bExp == 0x7FFF ) {
7164         if (bSig0 | bSig1) {
7165             return propagateFloat128NaN(a, b, status);
7166         }
7167         return a;
7168     }
7169     if ( bExp == 0 ) {
7170         if ( ( bSig0 | bSig1 ) == 0 ) {
7171  invalid:
7172             float_raise(float_flag_invalid, status);
7173             return float128_default_nan(status);
7174         }
7175         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7176     }
7177     if ( aExp == 0 ) {
7178         if ( ( aSig0 | aSig1 ) == 0 ) return a;
7179         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7180     }
7181     expDiff = aExp - bExp;
7182     if ( expDiff < -1 ) return a;
7183     shortShift128Left(
7184         aSig0 | LIT64( 0x0001000000000000 ),
7185         aSig1,
7186         15 - ( expDiff < 0 ),
7187         &aSig0,
7188         &aSig1
7189     );
7190     shortShift128Left(
7191         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
7192     q = le128( bSig0, bSig1, aSig0, aSig1 );
7193     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7194     expDiff -= 64;
7195     while ( 0 < expDiff ) {
7196         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7197         q = ( 4 < q ) ? q - 4 : 0;
7198         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7199         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
7200         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
7201         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
7202         expDiff -= 61;
7203     }
7204     if ( -64 < expDiff ) {
7205         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7206         q = ( 4 < q ) ? q - 4 : 0;
7207         q >>= - expDiff;
7208         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7209         expDiff += 52;
7210         if ( expDiff < 0 ) {
7211             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7212         }
7213         else {
7214             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
7215         }
7216         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7217         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
7218     }
7219     else {
7220         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
7221         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7222     }
7223     do {
7224         alternateASig0 = aSig0;
7225         alternateASig1 = aSig1;
7226         ++q;
7227         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7228     } while ( 0 <= (int64_t) aSig0 );
7229     add128(
7230         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
7231     if (    ( sigMean0 < 0 )
7232          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
7233         aSig0 = alternateASig0;
7234         aSig1 = alternateASig1;
7235     }
7236     zSign = ( (int64_t) aSig0 < 0 );
7237     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
7238     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
7239                                          status);
7240 }
7241 
7242 /*----------------------------------------------------------------------------
7243 | Returns the square root of the quadruple-precision floating-point value `a'.
7244 | The operation is performed according to the IEC/IEEE Standard for Binary
7245 | Floating-Point Arithmetic.
7246 *----------------------------------------------------------------------------*/
7247 
7248 float128 float128_sqrt(float128 a, float_status *status)
7249 {
7250     flag aSign;
7251     int32_t aExp, zExp;
7252     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
7253     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7254 
7255     aSig1 = extractFloat128Frac1( a );
7256     aSig0 = extractFloat128Frac0( a );
7257     aExp = extractFloat128Exp( a );
7258     aSign = extractFloat128Sign( a );
7259     if ( aExp == 0x7FFF ) {
7260         if (aSig0 | aSig1) {
7261             return propagateFloat128NaN(a, a, status);
7262         }
7263         if ( ! aSign ) return a;
7264         goto invalid;
7265     }
7266     if ( aSign ) {
7267         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
7268  invalid:
7269         float_raise(float_flag_invalid, status);
7270         return float128_default_nan(status);
7271     }
7272     if ( aExp == 0 ) {
7273         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7274         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7275     }
7276     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7277     aSig0 |= LIT64( 0x0001000000000000 );
7278     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7279     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7280     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7281     doubleZSig0 = zSig0<<1;
7282     mul64To128( zSig0, zSig0, &term0, &term1 );
7283     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7284     while ( (int64_t) rem0 < 0 ) {
7285         --zSig0;
7286         doubleZSig0 -= 2;
7287         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7288     }
7289     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7290     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7291         if ( zSig1 == 0 ) zSig1 = 1;
7292         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7293         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7294         mul64To128( zSig1, zSig1, &term2, &term3 );
7295         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7296         while ( (int64_t) rem1 < 0 ) {
7297             --zSig1;
7298             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7299             term3 |= 1;
7300             term2 |= doubleZSig0;
7301             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7302         }
7303         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7304     }
7305     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7306     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7307 
7308 }
7309 
7310 /*----------------------------------------------------------------------------
7311 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7312 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7313 | raised if either operand is a NaN.  Otherwise, the comparison is performed
7314 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7315 *----------------------------------------------------------------------------*/
7316 
7317 int float128_eq(float128 a, float128 b, float_status *status)
7318 {
7319 
7320     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7321               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7322          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7323               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7324        ) {
7325         float_raise(float_flag_invalid, status);
7326         return 0;
7327     }
7328     return
7329            ( a.low == b.low )
7330         && (    ( a.high == b.high )
7331              || (    ( a.low == 0 )
7332                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7333            );
7334 
7335 }
7336 
7337 /*----------------------------------------------------------------------------
7338 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7339 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
7340 | exception is raised if either operand is a NaN.  The comparison is performed
7341 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7342 *----------------------------------------------------------------------------*/
7343 
7344 int float128_le(float128 a, float128 b, float_status *status)
7345 {
7346     flag aSign, bSign;
7347 
7348     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7349               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7350          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7351               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7352        ) {
7353         float_raise(float_flag_invalid, status);
7354         return 0;
7355     }
7356     aSign = extractFloat128Sign( a );
7357     bSign = extractFloat128Sign( b );
7358     if ( aSign != bSign ) {
7359         return
7360                aSign
7361             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7362                  == 0 );
7363     }
7364     return
7365           aSign ? le128( b.high, b.low, a.high, a.low )
7366         : le128( a.high, a.low, b.high, b.low );
7367 
7368 }
7369 
7370 /*----------------------------------------------------------------------------
7371 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7372 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7373 | raised if either operand is a NaN.  The comparison is performed according
7374 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7375 *----------------------------------------------------------------------------*/
7376 
7377 int float128_lt(float128 a, float128 b, float_status *status)
7378 {
7379     flag aSign, bSign;
7380 
7381     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7382               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7383          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7384               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7385        ) {
7386         float_raise(float_flag_invalid, status);
7387         return 0;
7388     }
7389     aSign = extractFloat128Sign( a );
7390     bSign = extractFloat128Sign( b );
7391     if ( aSign != bSign ) {
7392         return
7393                aSign
7394             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7395                  != 0 );
7396     }
7397     return
7398           aSign ? lt128( b.high, b.low, a.high, a.low )
7399         : lt128( a.high, a.low, b.high, b.low );
7400 
7401 }
7402 
7403 /*----------------------------------------------------------------------------
7404 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7405 | be compared, and 0 otherwise.  The invalid exception is raised if either
7406 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7407 | Standard for Binary Floating-Point Arithmetic.
7408 *----------------------------------------------------------------------------*/
7409 
7410 int float128_unordered(float128 a, float128 b, float_status *status)
7411 {
7412     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7413               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7414          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7415               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7416        ) {
7417         float_raise(float_flag_invalid, status);
7418         return 1;
7419     }
7420     return 0;
7421 }
7422 
7423 /*----------------------------------------------------------------------------
7424 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7425 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7426 | exception.  The comparison is performed according to the IEC/IEEE Standard
7427 | for Binary Floating-Point Arithmetic.
7428 *----------------------------------------------------------------------------*/
7429 
7430 int float128_eq_quiet(float128 a, float128 b, float_status *status)
7431 {
7432 
7433     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7434               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7435          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7436               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7437        ) {
7438         if (float128_is_signaling_nan(a, status)
7439          || float128_is_signaling_nan(b, status)) {
7440             float_raise(float_flag_invalid, status);
7441         }
7442         return 0;
7443     }
7444     return
7445            ( a.low == b.low )
7446         && (    ( a.high == b.high )
7447              || (    ( a.low == 0 )
7448                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7449            );
7450 
7451 }
7452 
7453 /*----------------------------------------------------------------------------
7454 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7455 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
7456 | cause an exception.  Otherwise, the comparison is performed according to the
7457 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7458 *----------------------------------------------------------------------------*/
7459 
7460 int float128_le_quiet(float128 a, float128 b, float_status *status)
7461 {
7462     flag aSign, bSign;
7463 
7464     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7465               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7466          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7467               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7468        ) {
7469         if (float128_is_signaling_nan(a, status)
7470          || float128_is_signaling_nan(b, status)) {
7471             float_raise(float_flag_invalid, status);
7472         }
7473         return 0;
7474     }
7475     aSign = extractFloat128Sign( a );
7476     bSign = extractFloat128Sign( b );
7477     if ( aSign != bSign ) {
7478         return
7479                aSign
7480             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7481                  == 0 );
7482     }
7483     return
7484           aSign ? le128( b.high, b.low, a.high, a.low )
7485         : le128( a.high, a.low, b.high, b.low );
7486 
7487 }
7488 
7489 /*----------------------------------------------------------------------------
7490 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7491 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7492 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7493 | Standard for Binary Floating-Point Arithmetic.
7494 *----------------------------------------------------------------------------*/
7495 
7496 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7497 {
7498     flag aSign, bSign;
7499 
7500     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7501               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7502          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7503               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7504        ) {
7505         if (float128_is_signaling_nan(a, status)
7506          || float128_is_signaling_nan(b, status)) {
7507             float_raise(float_flag_invalid, status);
7508         }
7509         return 0;
7510     }
7511     aSign = extractFloat128Sign( a );
7512     bSign = extractFloat128Sign( b );
7513     if ( aSign != bSign ) {
7514         return
7515                aSign
7516             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7517                  != 0 );
7518     }
7519     return
7520           aSign ? lt128( b.high, b.low, a.high, a.low )
7521         : lt128( a.high, a.low, b.high, b.low );
7522 
7523 }
7524 
7525 /*----------------------------------------------------------------------------
7526 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7527 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7528 | comparison is performed according to the IEC/IEEE Standard for Binary
7529 | Floating-Point Arithmetic.
7530 *----------------------------------------------------------------------------*/
7531 
7532 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7533 {
7534     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7535               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7536          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7537               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7538        ) {
7539         if (float128_is_signaling_nan(a, status)
7540          || float128_is_signaling_nan(b, status)) {
7541             float_raise(float_flag_invalid, status);
7542         }
7543         return 1;
7544     }
7545     return 0;
7546 }
7547 
7548 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7549                                             int is_quiet, float_status *status)
7550 {
7551     flag aSign, bSign;
7552 
7553     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7554         float_raise(float_flag_invalid, status);
7555         return float_relation_unordered;
7556     }
7557     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7558           ( extractFloatx80Frac( a )<<1 ) ) ||
7559         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7560           ( extractFloatx80Frac( b )<<1 ) )) {
7561         if (!is_quiet ||
7562             floatx80_is_signaling_nan(a, status) ||
7563             floatx80_is_signaling_nan(b, status)) {
7564             float_raise(float_flag_invalid, status);
7565         }
7566         return float_relation_unordered;
7567     }
7568     aSign = extractFloatx80Sign( a );
7569     bSign = extractFloatx80Sign( b );
7570     if ( aSign != bSign ) {
7571 
7572         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7573              ( ( a.low | b.low ) == 0 ) ) {
7574             /* zero case */
7575             return float_relation_equal;
7576         } else {
7577             return 1 - (2 * aSign);
7578         }
7579     } else {
7580         if (a.low == b.low && a.high == b.high) {
7581             return float_relation_equal;
7582         } else {
7583             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7584         }
7585     }
7586 }
7587 
7588 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7589 {
7590     return floatx80_compare_internal(a, b, 0, status);
7591 }
7592 
7593 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7594 {
7595     return floatx80_compare_internal(a, b, 1, status);
7596 }
7597 
7598 static inline int float128_compare_internal(float128 a, float128 b,
7599                                             int is_quiet, float_status *status)
7600 {
7601     flag aSign, bSign;
7602 
7603     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7604           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7605         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7606           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7607         if (!is_quiet ||
7608             float128_is_signaling_nan(a, status) ||
7609             float128_is_signaling_nan(b, status)) {
7610             float_raise(float_flag_invalid, status);
7611         }
7612         return float_relation_unordered;
7613     }
7614     aSign = extractFloat128Sign( a );
7615     bSign = extractFloat128Sign( b );
7616     if ( aSign != bSign ) {
7617         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7618             /* zero case */
7619             return float_relation_equal;
7620         } else {
7621             return 1 - (2 * aSign);
7622         }
7623     } else {
7624         if (a.low == b.low && a.high == b.high) {
7625             return float_relation_equal;
7626         } else {
7627             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7628         }
7629     }
7630 }
7631 
7632 int float128_compare(float128 a, float128 b, float_status *status)
7633 {
7634     return float128_compare_internal(a, b, 0, status);
7635 }
7636 
7637 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7638 {
7639     return float128_compare_internal(a, b, 1, status);
7640 }
7641 
7642 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7643 {
7644     flag aSign;
7645     int32_t aExp;
7646     uint64_t aSig;
7647 
7648     if (floatx80_invalid_encoding(a)) {
7649         float_raise(float_flag_invalid, status);
7650         return floatx80_default_nan(status);
7651     }
7652     aSig = extractFloatx80Frac( a );
7653     aExp = extractFloatx80Exp( a );
7654     aSign = extractFloatx80Sign( a );
7655 
7656     if ( aExp == 0x7FFF ) {
7657         if ( aSig<<1 ) {
7658             return propagateFloatx80NaN(a, a, status);
7659         }
7660         return a;
7661     }
7662 
7663     if (aExp == 0) {
7664         if (aSig == 0) {
7665             return a;
7666         }
7667         aExp++;
7668     }
7669 
7670     if (n > 0x10000) {
7671         n = 0x10000;
7672     } else if (n < -0x10000) {
7673         n = -0x10000;
7674     }
7675 
7676     aExp += n;
7677     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7678                                          aSign, aExp, aSig, 0, status);
7679 }
7680 
7681 float128 float128_scalbn(float128 a, int n, float_status *status)
7682 {
7683     flag aSign;
7684     int32_t aExp;
7685     uint64_t aSig0, aSig1;
7686 
7687     aSig1 = extractFloat128Frac1( a );
7688     aSig0 = extractFloat128Frac0( a );
7689     aExp = extractFloat128Exp( a );
7690     aSign = extractFloat128Sign( a );
7691     if ( aExp == 0x7FFF ) {
7692         if ( aSig0 | aSig1 ) {
7693             return propagateFloat128NaN(a, a, status);
7694         }
7695         return a;
7696     }
7697     if (aExp != 0) {
7698         aSig0 |= LIT64( 0x0001000000000000 );
7699     } else if (aSig0 == 0 && aSig1 == 0) {
7700         return a;
7701     } else {
7702         aExp++;
7703     }
7704 
7705     if (n > 0x10000) {
7706         n = 0x10000;
7707     } else if (n < -0x10000) {
7708         n = -0x10000;
7709     }
7710 
7711     aExp += n - 1;
7712     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7713                                          , status);
7714 
7715 }
7716