xref: /qemu/fpu/softfloat.c (revision 88857aca93f6ec8f372fb9c8201394b0e5582034)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
88 
89 /* We only need stdlib for abort() */
90 
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations.  (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "fpu/softfloat-macros.h"
97 
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine:  (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output.  These details are target-
104 | specific.
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
107 
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
111 
112 static inline uint32_t extractFloat16Frac(float16 a)
113 {
114     return float16_val(a) & 0x3ff;
115 }
116 
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
120 
121 static inline int extractFloat16Exp(float16 a)
122 {
123     return (float16_val(a) >> 10) & 0x1f;
124 }
125 
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
129 
130 static inline flag extractFloat16Sign(float16 a)
131 {
132     return float16_val(a)>>15;
133 }
134 
135 /*----------------------------------------------------------------------------
136 | Returns the fraction bits of the single-precision floating-point value `a'.
137 *----------------------------------------------------------------------------*/
138 
139 static inline uint32_t extractFloat32Frac(float32 a)
140 {
141     return float32_val(a) & 0x007FFFFF;
142 }
143 
144 /*----------------------------------------------------------------------------
145 | Returns the exponent bits of the single-precision floating-point value `a'.
146 *----------------------------------------------------------------------------*/
147 
148 static inline int extractFloat32Exp(float32 a)
149 {
150     return (float32_val(a) >> 23) & 0xFF;
151 }
152 
153 /*----------------------------------------------------------------------------
154 | Returns the sign bit of the single-precision floating-point value `a'.
155 *----------------------------------------------------------------------------*/
156 
157 static inline flag extractFloat32Sign(float32 a)
158 {
159     return float32_val(a) >> 31;
160 }
161 
162 /*----------------------------------------------------------------------------
163 | Returns the fraction bits of the double-precision floating-point value `a'.
164 *----------------------------------------------------------------------------*/
165 
166 static inline uint64_t extractFloat64Frac(float64 a)
167 {
168     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
169 }
170 
171 /*----------------------------------------------------------------------------
172 | Returns the exponent bits of the double-precision floating-point value `a'.
173 *----------------------------------------------------------------------------*/
174 
175 static inline int extractFloat64Exp(float64 a)
176 {
177     return (float64_val(a) >> 52) & 0x7FF;
178 }
179 
180 /*----------------------------------------------------------------------------
181 | Returns the sign bit of the double-precision floating-point value `a'.
182 *----------------------------------------------------------------------------*/
183 
184 static inline flag extractFloat64Sign(float64 a)
185 {
186     return float64_val(a) >> 63;
187 }
188 
189 /*
190  * Classify a floating point number. Everything above float_class_qnan
191  * is a NaN so cls >= float_class_qnan is any NaN.
192  */
193 
194 typedef enum __attribute__ ((__packed__)) {
195     float_class_unclassified,
196     float_class_zero,
197     float_class_normal,
198     float_class_inf,
199     float_class_qnan,  /* all NaNs from here */
200     float_class_snan,
201     float_class_dnan,
202     float_class_msnan, /* maybe silenced */
203 } FloatClass;
204 
205 /*
206  * Structure holding all of the decomposed parts of a float. The
207  * exponent is unbiased and the fraction is normalized. All
208  * calculations are done with a 64 bit fraction and then rounded as
209  * appropriate for the final format.
210  *
211  * Thanks to the packed FloatClass a decent compiler should be able to
212  * fit the whole structure into registers and avoid using the stack
213  * for parameter passing.
214  */
215 
216 typedef struct {
217     uint64_t frac;
218     int32_t  exp;
219     FloatClass cls;
220     bool sign;
221 } FloatParts;
222 
223 #define DECOMPOSED_BINARY_POINT    (64 - 2)
224 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
225 #define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
226 
227 /* Structure holding all of the relevant parameters for a format.
228  *   exp_size: the size of the exponent field
229  *   exp_bias: the offset applied to the exponent field
230  *   exp_max: the maximum normalised exponent
231  *   frac_size: the size of the fraction field
232  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
233  * The following are computed based the size of fraction
234  *   frac_lsb: least significant bit of fraction
235  *   fram_lsbm1: the bit bellow the least significant bit (for rounding)
236  *   round_mask/roundeven_mask: masks used for rounding
237  */
238 typedef struct {
239     int exp_size;
240     int exp_bias;
241     int exp_max;
242     int frac_size;
243     int frac_shift;
244     uint64_t frac_lsb;
245     uint64_t frac_lsbm1;
246     uint64_t round_mask;
247     uint64_t roundeven_mask;
248 } FloatFmt;
249 
250 /* Expand fields based on the size of exponent and fraction */
251 #define FLOAT_PARAMS(E, F)                                           \
252     .exp_size       = E,                                             \
253     .exp_bias       = ((1 << E) - 1) >> 1,                           \
254     .exp_max        = (1 << E) - 1,                                  \
255     .frac_size      = F,                                             \
256     .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
257     .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
258     .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
259     .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
260     .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
261 
262 static const FloatFmt float16_params = {
263     FLOAT_PARAMS(5, 10)
264 };
265 
266 static const FloatFmt float32_params = {
267     FLOAT_PARAMS(8, 23)
268 };
269 
270 static const FloatFmt float64_params = {
271     FLOAT_PARAMS(11, 52)
272 };
273 
274 /* Unpack a float to parts, but do not canonicalize.  */
275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
276 {
277     const int sign_pos = fmt.frac_size + fmt.exp_size;
278 
279     return (FloatParts) {
280         .cls = float_class_unclassified,
281         .sign = extract64(raw, sign_pos, 1),
282         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
283         .frac = extract64(raw, 0, fmt.frac_size),
284     };
285 }
286 
287 static inline FloatParts float16_unpack_raw(float16 f)
288 {
289     return unpack_raw(float16_params, f);
290 }
291 
292 static inline FloatParts float32_unpack_raw(float32 f)
293 {
294     return unpack_raw(float32_params, f);
295 }
296 
297 static inline FloatParts float64_unpack_raw(float64 f)
298 {
299     return unpack_raw(float64_params, f);
300 }
301 
302 /* Pack a float from parts, but do not canonicalize.  */
303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
304 {
305     const int sign_pos = fmt.frac_size + fmt.exp_size;
306     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
307     return deposit64(ret, sign_pos, 1, p.sign);
308 }
309 
310 static inline float16 float16_pack_raw(FloatParts p)
311 {
312     return make_float16(pack_raw(float16_params, p));
313 }
314 
315 static inline float32 float32_pack_raw(FloatParts p)
316 {
317     return make_float32(pack_raw(float32_params, p));
318 }
319 
320 static inline float64 float64_pack_raw(FloatParts p)
321 {
322     return make_float64(pack_raw(float64_params, p));
323 }
324 
325 /* Canonicalize EXP and FRAC, setting CLS.  */
326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
327                                float_status *status)
328 {
329     if (part.exp == parm->exp_max) {
330         if (part.frac == 0) {
331             part.cls = float_class_inf;
332         } else {
333 #ifdef NO_SIGNALING_NANS
334             part.cls = float_class_qnan;
335 #else
336             int64_t msb = part.frac << (parm->frac_shift + 2);
337             if ((msb < 0) == status->snan_bit_is_one) {
338                 part.cls = float_class_snan;
339             } else {
340                 part.cls = float_class_qnan;
341             }
342 #endif
343         }
344     } else if (part.exp == 0) {
345         if (likely(part.frac == 0)) {
346             part.cls = float_class_zero;
347         } else if (status->flush_inputs_to_zero) {
348             float_raise(float_flag_input_denormal, status);
349             part.cls = float_class_zero;
350             part.frac = 0;
351         } else {
352             int shift = clz64(part.frac) - 1;
353             part.cls = float_class_normal;
354             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
355             part.frac <<= shift;
356         }
357     } else {
358         part.cls = float_class_normal;
359         part.exp -= parm->exp_bias;
360         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
361     }
362     return part;
363 }
364 
365 /* Round and uncanonicalize a floating-point number by parts. There
366  * are FRAC_SHIFT bits that may require rounding at the bottom of the
367  * fraction; these bits will be removed. The exponent will be biased
368  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
369  */
370 
371 static FloatParts round_canonical(FloatParts p, float_status *s,
372                                   const FloatFmt *parm)
373 {
374     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
375     const uint64_t round_mask = parm->round_mask;
376     const uint64_t roundeven_mask = parm->roundeven_mask;
377     const int exp_max = parm->exp_max;
378     const int frac_shift = parm->frac_shift;
379     uint64_t frac, inc;
380     int exp, flags = 0;
381     bool overflow_norm;
382 
383     frac = p.frac;
384     exp = p.exp;
385 
386     switch (p.cls) {
387     case float_class_normal:
388         switch (s->float_rounding_mode) {
389         case float_round_nearest_even:
390             overflow_norm = false;
391             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
392             break;
393         case float_round_ties_away:
394             overflow_norm = false;
395             inc = frac_lsbm1;
396             break;
397         case float_round_to_zero:
398             overflow_norm = true;
399             inc = 0;
400             break;
401         case float_round_up:
402             inc = p.sign ? 0 : round_mask;
403             overflow_norm = p.sign;
404             break;
405         case float_round_down:
406             inc = p.sign ? round_mask : 0;
407             overflow_norm = !p.sign;
408             break;
409         default:
410             g_assert_not_reached();
411         }
412 
413         exp += parm->exp_bias;
414         if (likely(exp > 0)) {
415             if (frac & round_mask) {
416                 flags |= float_flag_inexact;
417                 frac += inc;
418                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
419                     frac >>= 1;
420                     exp++;
421                 }
422             }
423             frac >>= frac_shift;
424 
425             if (unlikely(exp >= exp_max)) {
426                 flags |= float_flag_overflow | float_flag_inexact;
427                 if (overflow_norm) {
428                     exp = exp_max - 1;
429                     frac = -1;
430                 } else {
431                     p.cls = float_class_inf;
432                     goto do_inf;
433                 }
434             }
435         } else if (s->flush_to_zero) {
436             flags |= float_flag_output_denormal;
437             p.cls = float_class_zero;
438             goto do_zero;
439         } else {
440             bool is_tiny = (s->float_detect_tininess
441                             == float_tininess_before_rounding)
442                         || (exp < 0)
443                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
444 
445             shift64RightJamming(frac, 1 - exp, &frac);
446             if (frac & round_mask) {
447                 /* Need to recompute round-to-even.  */
448                 if (s->float_rounding_mode == float_round_nearest_even) {
449                     inc = ((frac & roundeven_mask) != frac_lsbm1
450                            ? frac_lsbm1 : 0);
451                 }
452                 flags |= float_flag_inexact;
453                 frac += inc;
454             }
455 
456             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
457             frac >>= frac_shift;
458 
459             if (is_tiny && (flags & float_flag_inexact)) {
460                 flags |= float_flag_underflow;
461             }
462             if (exp == 0 && frac == 0) {
463                 p.cls = float_class_zero;
464             }
465         }
466         break;
467 
468     case float_class_zero:
469     do_zero:
470         exp = 0;
471         frac = 0;
472         break;
473 
474     case float_class_inf:
475     do_inf:
476         exp = exp_max;
477         frac = 0;
478         break;
479 
480     case float_class_qnan:
481     case float_class_snan:
482         exp = exp_max;
483         break;
484 
485     default:
486         g_assert_not_reached();
487     }
488 
489     float_raise(flags, s);
490     p.exp = exp;
491     p.frac = frac;
492     return p;
493 }
494 
495 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
496 {
497     return canonicalize(float16_unpack_raw(f), &float16_params, s);
498 }
499 
500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
501 {
502     switch (p.cls) {
503     case float_class_dnan:
504         return float16_default_nan(s);
505     case float_class_msnan:
506         return float16_maybe_silence_nan(float16_pack_raw(p), s);
507     default:
508         p = round_canonical(p, s, &float16_params);
509         return float16_pack_raw(p);
510     }
511 }
512 
513 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
514 {
515     return canonicalize(float32_unpack_raw(f), &float32_params, s);
516 }
517 
518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
519 {
520     switch (p.cls) {
521     case float_class_dnan:
522         return float32_default_nan(s);
523     case float_class_msnan:
524         return float32_maybe_silence_nan(float32_pack_raw(p), s);
525     default:
526         p = round_canonical(p, s, &float32_params);
527         return float32_pack_raw(p);
528     }
529 }
530 
531 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
532 {
533     return canonicalize(float64_unpack_raw(f), &float64_params, s);
534 }
535 
536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
537 {
538     switch (p.cls) {
539     case float_class_dnan:
540         return float64_default_nan(s);
541     case float_class_msnan:
542         return float64_maybe_silence_nan(float64_pack_raw(p), s);
543     default:
544         p = round_canonical(p, s, &float64_params);
545         return float64_pack_raw(p);
546     }
547 }
548 
549 /* Simple helpers for checking if what NaN we have */
550 static bool is_nan(FloatClass c)
551 {
552     return unlikely(c >= float_class_qnan);
553 }
554 static bool is_snan(FloatClass c)
555 {
556     return c == float_class_snan;
557 }
558 static bool is_qnan(FloatClass c)
559 {
560     return c == float_class_qnan;
561 }
562 
563 static FloatParts return_nan(FloatParts a, float_status *s)
564 {
565     switch (a.cls) {
566     case float_class_snan:
567         s->float_exception_flags |= float_flag_invalid;
568         a.cls = float_class_msnan;
569         /* fall through */
570     case float_class_qnan:
571         if (s->default_nan_mode) {
572             a.cls = float_class_dnan;
573         }
574         break;
575 
576     default:
577         g_assert_not_reached();
578     }
579     return a;
580 }
581 
582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
583 {
584     if (is_snan(a.cls) || is_snan(b.cls)) {
585         s->float_exception_flags |= float_flag_invalid;
586     }
587 
588     if (s->default_nan_mode) {
589         a.cls = float_class_dnan;
590     } else {
591         if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
592                     is_qnan(b.cls), is_snan(b.cls),
593                     a.frac > b.frac ||
594                     (a.frac == b.frac && a.sign < b.sign))) {
595             a = b;
596         }
597         a.cls = float_class_msnan;
598     }
599     return a;
600 }
601 
602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
603                                   bool inf_zero, float_status *s)
604 {
605     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
606         s->float_exception_flags |= float_flag_invalid;
607     }
608 
609     if (s->default_nan_mode) {
610         a.cls = float_class_dnan;
611     } else {
612         switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
613                               is_qnan(b.cls), is_snan(b.cls),
614                               is_qnan(c.cls), is_snan(c.cls),
615                               inf_zero, s)) {
616         case 0:
617             break;
618         case 1:
619             a = b;
620             break;
621         case 2:
622             a = c;
623             break;
624         case 3:
625             a.cls = float_class_dnan;
626             return a;
627         default:
628             g_assert_not_reached();
629         }
630 
631         a.cls = float_class_msnan;
632     }
633     return a;
634 }
635 
636 /*
637  * Returns the result of adding or subtracting the values of the
638  * floating-point values `a' and `b'. The operation is performed
639  * according to the IEC/IEEE Standard for Binary Floating-Point
640  * Arithmetic.
641  */
642 
643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
644                                 float_status *s)
645 {
646     bool a_sign = a.sign;
647     bool b_sign = b.sign ^ subtract;
648 
649     if (a_sign != b_sign) {
650         /* Subtraction */
651 
652         if (a.cls == float_class_normal && b.cls == float_class_normal) {
653             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
654                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
655                 a.frac = a.frac - b.frac;
656             } else {
657                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
658                 a.frac = b.frac - a.frac;
659                 a.exp = b.exp;
660                 a_sign ^= 1;
661             }
662 
663             if (a.frac == 0) {
664                 a.cls = float_class_zero;
665                 a.sign = s->float_rounding_mode == float_round_down;
666             } else {
667                 int shift = clz64(a.frac) - 1;
668                 a.frac = a.frac << shift;
669                 a.exp = a.exp - shift;
670                 a.sign = a_sign;
671             }
672             return a;
673         }
674         if (is_nan(a.cls) || is_nan(b.cls)) {
675             return pick_nan(a, b, s);
676         }
677         if (a.cls == float_class_inf) {
678             if (b.cls == float_class_inf) {
679                 float_raise(float_flag_invalid, s);
680                 a.cls = float_class_dnan;
681             }
682             return a;
683         }
684         if (a.cls == float_class_zero && b.cls == float_class_zero) {
685             a.sign = s->float_rounding_mode == float_round_down;
686             return a;
687         }
688         if (a.cls == float_class_zero || b.cls == float_class_inf) {
689             b.sign = a_sign ^ 1;
690             return b;
691         }
692         if (b.cls == float_class_zero) {
693             return a;
694         }
695     } else {
696         /* Addition */
697         if (a.cls == float_class_normal && b.cls == float_class_normal) {
698             if (a.exp > b.exp) {
699                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
700             } else if (a.exp < b.exp) {
701                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
702                 a.exp = b.exp;
703             }
704             a.frac += b.frac;
705             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
706                 a.frac >>= 1;
707                 a.exp += 1;
708             }
709             return a;
710         }
711         if (is_nan(a.cls) || is_nan(b.cls)) {
712             return pick_nan(a, b, s);
713         }
714         if (a.cls == float_class_inf || b.cls == float_class_zero) {
715             return a;
716         }
717         if (b.cls == float_class_inf || a.cls == float_class_zero) {
718             b.sign = b_sign;
719             return b;
720         }
721     }
722     g_assert_not_reached();
723 }
724 
725 /*
726  * Returns the result of adding or subtracting the floating-point
727  * values `a' and `b'. The operation is performed according to the
728  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
729  */
730 
731 float16  __attribute__((flatten)) float16_add(float16 a, float16 b,
732                                               float_status *status)
733 {
734     FloatParts pa = float16_unpack_canonical(a, status);
735     FloatParts pb = float16_unpack_canonical(b, status);
736     FloatParts pr = addsub_floats(pa, pb, false, status);
737 
738     return float16_round_pack_canonical(pr, status);
739 }
740 
741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
742                                              float_status *status)
743 {
744     FloatParts pa = float32_unpack_canonical(a, status);
745     FloatParts pb = float32_unpack_canonical(b, status);
746     FloatParts pr = addsub_floats(pa, pb, false, status);
747 
748     return float32_round_pack_canonical(pr, status);
749 }
750 
751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
752                                              float_status *status)
753 {
754     FloatParts pa = float64_unpack_canonical(a, status);
755     FloatParts pb = float64_unpack_canonical(b, status);
756     FloatParts pr = addsub_floats(pa, pb, false, status);
757 
758     return float64_round_pack_canonical(pr, status);
759 }
760 
761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
762                                              float_status *status)
763 {
764     FloatParts pa = float16_unpack_canonical(a, status);
765     FloatParts pb = float16_unpack_canonical(b, status);
766     FloatParts pr = addsub_floats(pa, pb, true, status);
767 
768     return float16_round_pack_canonical(pr, status);
769 }
770 
771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
772                                              float_status *status)
773 {
774     FloatParts pa = float32_unpack_canonical(a, status);
775     FloatParts pb = float32_unpack_canonical(b, status);
776     FloatParts pr = addsub_floats(pa, pb, true, status);
777 
778     return float32_round_pack_canonical(pr, status);
779 }
780 
781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
782                                              float_status *status)
783 {
784     FloatParts pa = float64_unpack_canonical(a, status);
785     FloatParts pb = float64_unpack_canonical(b, status);
786     FloatParts pr = addsub_floats(pa, pb, true, status);
787 
788     return float64_round_pack_canonical(pr, status);
789 }
790 
791 /*
792  * Returns the result of multiplying the floating-point values `a' and
793  * `b'. The operation is performed according to the IEC/IEEE Standard
794  * for Binary Floating-Point Arithmetic.
795  */
796 
797 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
798 {
799     bool sign = a.sign ^ b.sign;
800 
801     if (a.cls == float_class_normal && b.cls == float_class_normal) {
802         uint64_t hi, lo;
803         int exp = a.exp + b.exp;
804 
805         mul64To128(a.frac, b.frac, &hi, &lo);
806         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
807         if (lo & DECOMPOSED_OVERFLOW_BIT) {
808             shift64RightJamming(lo, 1, &lo);
809             exp += 1;
810         }
811 
812         /* Re-use a */
813         a.exp = exp;
814         a.sign = sign;
815         a.frac = lo;
816         return a;
817     }
818     /* handle all the NaN cases */
819     if (is_nan(a.cls) || is_nan(b.cls)) {
820         return pick_nan(a, b, s);
821     }
822     /* Inf * Zero == NaN */
823     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
824         (a.cls == float_class_zero && b.cls == float_class_inf)) {
825         s->float_exception_flags |= float_flag_invalid;
826         a.cls = float_class_dnan;
827         a.sign = sign;
828         return a;
829     }
830     /* Multiply by 0 or Inf */
831     if (a.cls == float_class_inf || a.cls == float_class_zero) {
832         a.sign = sign;
833         return a;
834     }
835     if (b.cls == float_class_inf || b.cls == float_class_zero) {
836         b.sign = sign;
837         return b;
838     }
839     g_assert_not_reached();
840 }
841 
842 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
843                                              float_status *status)
844 {
845     FloatParts pa = float16_unpack_canonical(a, status);
846     FloatParts pb = float16_unpack_canonical(b, status);
847     FloatParts pr = mul_floats(pa, pb, status);
848 
849     return float16_round_pack_canonical(pr, status);
850 }
851 
852 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
853                                              float_status *status)
854 {
855     FloatParts pa = float32_unpack_canonical(a, status);
856     FloatParts pb = float32_unpack_canonical(b, status);
857     FloatParts pr = mul_floats(pa, pb, status);
858 
859     return float32_round_pack_canonical(pr, status);
860 }
861 
862 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
863                                              float_status *status)
864 {
865     FloatParts pa = float64_unpack_canonical(a, status);
866     FloatParts pb = float64_unpack_canonical(b, status);
867     FloatParts pr = mul_floats(pa, pb, status);
868 
869     return float64_round_pack_canonical(pr, status);
870 }
871 
872 /*
873  * Returns the result of multiplying the floating-point values `a' and
874  * `b' then adding 'c', with no intermediate rounding step after the
875  * multiplication. The operation is performed according to the
876  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
877  * The flags argument allows the caller to select negation of the
878  * addend, the intermediate product, or the final result. (The
879  * difference between this and having the caller do a separate
880  * negation is that negating externally will flip the sign bit on
881  * NaNs.)
882  */
883 
884 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
885                                 int flags, float_status *s)
886 {
887     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
888                     ((1 << float_class_inf) | (1 << float_class_zero));
889     bool p_sign;
890     bool sign_flip = flags & float_muladd_negate_result;
891     FloatClass p_class;
892     uint64_t hi, lo;
893     int p_exp;
894 
895     /* It is implementation-defined whether the cases of (0,inf,qnan)
896      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
897      * they return if they do), so we have to hand this information
898      * off to the target-specific pick-a-NaN routine.
899      */
900     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
901         return pick_nan_muladd(a, b, c, inf_zero, s);
902     }
903 
904     if (inf_zero) {
905         s->float_exception_flags |= float_flag_invalid;
906         a.cls = float_class_dnan;
907         return a;
908     }
909 
910     if (flags & float_muladd_negate_c) {
911         c.sign ^= 1;
912     }
913 
914     p_sign = a.sign ^ b.sign;
915 
916     if (flags & float_muladd_negate_product) {
917         p_sign ^= 1;
918     }
919 
920     if (a.cls == float_class_inf || b.cls == float_class_inf) {
921         p_class = float_class_inf;
922     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
923         p_class = float_class_zero;
924     } else {
925         p_class = float_class_normal;
926     }
927 
928     if (c.cls == float_class_inf) {
929         if (p_class == float_class_inf && p_sign != c.sign) {
930             s->float_exception_flags |= float_flag_invalid;
931             a.cls = float_class_dnan;
932         } else {
933             a.cls = float_class_inf;
934             a.sign = c.sign ^ sign_flip;
935         }
936         return a;
937     }
938 
939     if (p_class == float_class_inf) {
940         a.cls = float_class_inf;
941         a.sign = p_sign ^ sign_flip;
942         return a;
943     }
944 
945     if (p_class == float_class_zero) {
946         if (c.cls == float_class_zero) {
947             if (p_sign != c.sign) {
948                 p_sign = s->float_rounding_mode == float_round_down;
949             }
950             c.sign = p_sign;
951         } else if (flags & float_muladd_halve_result) {
952             c.exp -= 1;
953         }
954         c.sign ^= sign_flip;
955         return c;
956     }
957 
958     /* a & b should be normals now... */
959     assert(a.cls == float_class_normal &&
960            b.cls == float_class_normal);
961 
962     p_exp = a.exp + b.exp;
963 
964     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
965      * result.
966      */
967     mul64To128(a.frac, b.frac, &hi, &lo);
968     /* binary point now at bit 124 */
969 
970     /* check for overflow */
971     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
972         shift128RightJamming(hi, lo, 1, &hi, &lo);
973         p_exp += 1;
974     }
975 
976     /* + add/sub */
977     if (c.cls == float_class_zero) {
978         /* move binary point back to 62 */
979         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
980     } else {
981         int exp_diff = p_exp - c.exp;
982         if (p_sign == c.sign) {
983             /* Addition */
984             if (exp_diff <= 0) {
985                 shift128RightJamming(hi, lo,
986                                      DECOMPOSED_BINARY_POINT - exp_diff,
987                                      &hi, &lo);
988                 lo += c.frac;
989                 p_exp = c.exp;
990             } else {
991                 uint64_t c_hi, c_lo;
992                 /* shift c to the same binary point as the product (124) */
993                 c_hi = c.frac >> 2;
994                 c_lo = 0;
995                 shift128RightJamming(c_hi, c_lo,
996                                      exp_diff,
997                                      &c_hi, &c_lo);
998                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
999                 /* move binary point back to 62 */
1000                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1001             }
1002 
1003             if (lo & DECOMPOSED_OVERFLOW_BIT) {
1004                 shift64RightJamming(lo, 1, &lo);
1005                 p_exp += 1;
1006             }
1007 
1008         } else {
1009             /* Subtraction */
1010             uint64_t c_hi, c_lo;
1011             /* make C binary point match product at bit 124 */
1012             c_hi = c.frac >> 2;
1013             c_lo = 0;
1014 
1015             if (exp_diff <= 0) {
1016                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1017                 if (exp_diff == 0
1018                     &&
1019                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1020                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1021                 } else {
1022                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1023                     p_sign ^= 1;
1024                     p_exp = c.exp;
1025                 }
1026             } else {
1027                 shift128RightJamming(c_hi, c_lo,
1028                                      exp_diff,
1029                                      &c_hi, &c_lo);
1030                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1031             }
1032 
1033             if (hi == 0 && lo == 0) {
1034                 a.cls = float_class_zero;
1035                 a.sign = s->float_rounding_mode == float_round_down;
1036                 a.sign ^= sign_flip;
1037                 return a;
1038             } else {
1039                 int shift;
1040                 if (hi != 0) {
1041                     shift = clz64(hi);
1042                 } else {
1043                     shift = clz64(lo) + 64;
1044                 }
1045                 /* Normalizing to a binary point of 124 is the
1046                    correct adjust for the exponent.  However since we're
1047                    shifting, we might as well put the binary point back
1048                    at 62 where we really want it.  Therefore shift as
1049                    if we're leaving 1 bit at the top of the word, but
1050                    adjust the exponent as if we're leaving 3 bits.  */
1051                 shift -= 1;
1052                 if (shift >= 64) {
1053                     lo = lo << (shift - 64);
1054                 } else {
1055                     hi = (hi << shift) | (lo >> (64 - shift));
1056                     lo = hi | ((lo << shift) != 0);
1057                 }
1058                 p_exp -= shift - 2;
1059             }
1060         }
1061     }
1062 
1063     if (flags & float_muladd_halve_result) {
1064         p_exp -= 1;
1065     }
1066 
1067     /* finally prepare our result */
1068     a.cls = float_class_normal;
1069     a.sign = p_sign ^ sign_flip;
1070     a.exp = p_exp;
1071     a.frac = lo;
1072 
1073     return a;
1074 }
1075 
1076 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1077                                                 int flags, float_status *status)
1078 {
1079     FloatParts pa = float16_unpack_canonical(a, status);
1080     FloatParts pb = float16_unpack_canonical(b, status);
1081     FloatParts pc = float16_unpack_canonical(c, status);
1082     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1083 
1084     return float16_round_pack_canonical(pr, status);
1085 }
1086 
1087 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1088                                                 int flags, float_status *status)
1089 {
1090     FloatParts pa = float32_unpack_canonical(a, status);
1091     FloatParts pb = float32_unpack_canonical(b, status);
1092     FloatParts pc = float32_unpack_canonical(c, status);
1093     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1094 
1095     return float32_round_pack_canonical(pr, status);
1096 }
1097 
1098 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1099                                                 int flags, float_status *status)
1100 {
1101     FloatParts pa = float64_unpack_canonical(a, status);
1102     FloatParts pb = float64_unpack_canonical(b, status);
1103     FloatParts pc = float64_unpack_canonical(c, status);
1104     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1105 
1106     return float64_round_pack_canonical(pr, status);
1107 }
1108 
1109 /*
1110  * Returns the result of dividing the floating-point value `a' by the
1111  * corresponding value `b'. The operation is performed according to
1112  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1113  */
1114 
1115 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1116 {
1117     bool sign = a.sign ^ b.sign;
1118 
1119     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1120         uint64_t temp_lo, temp_hi;
1121         int exp = a.exp - b.exp;
1122         if (a.frac < b.frac) {
1123             exp -= 1;
1124             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1125                               &temp_hi, &temp_lo);
1126         } else {
1127             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1128                               &temp_hi, &temp_lo);
1129         }
1130         /* LSB of quot is set if inexact which roundandpack will use
1131          * to set flags. Yet again we re-use a for the result */
1132         a.frac = div128To64(temp_lo, temp_hi, b.frac);
1133         a.sign = sign;
1134         a.exp = exp;
1135         return a;
1136     }
1137     /* handle all the NaN cases */
1138     if (is_nan(a.cls) || is_nan(b.cls)) {
1139         return pick_nan(a, b, s);
1140     }
1141     /* 0/0 or Inf/Inf */
1142     if (a.cls == b.cls
1143         &&
1144         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1145         s->float_exception_flags |= float_flag_invalid;
1146         a.cls = float_class_dnan;
1147         return a;
1148     }
1149     /* Div 0 => Inf */
1150     if (b.cls == float_class_zero) {
1151         s->float_exception_flags |= float_flag_divbyzero;
1152         a.cls = float_class_inf;
1153         a.sign = sign;
1154         return a;
1155     }
1156     /* Inf / x or 0 / x */
1157     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1158         a.sign = sign;
1159         return a;
1160     }
1161     /* Div by Inf */
1162     if (b.cls == float_class_inf) {
1163         a.cls = float_class_zero;
1164         a.sign = sign;
1165         return a;
1166     }
1167     g_assert_not_reached();
1168 }
1169 
1170 float16 float16_div(float16 a, float16 b, float_status *status)
1171 {
1172     FloatParts pa = float16_unpack_canonical(a, status);
1173     FloatParts pb = float16_unpack_canonical(b, status);
1174     FloatParts pr = div_floats(pa, pb, status);
1175 
1176     return float16_round_pack_canonical(pr, status);
1177 }
1178 
1179 float32 float32_div(float32 a, float32 b, float_status *status)
1180 {
1181     FloatParts pa = float32_unpack_canonical(a, status);
1182     FloatParts pb = float32_unpack_canonical(b, status);
1183     FloatParts pr = div_floats(pa, pb, status);
1184 
1185     return float32_round_pack_canonical(pr, status);
1186 }
1187 
1188 float64 float64_div(float64 a, float64 b, float_status *status)
1189 {
1190     FloatParts pa = float64_unpack_canonical(a, status);
1191     FloatParts pb = float64_unpack_canonical(b, status);
1192     FloatParts pr = div_floats(pa, pb, status);
1193 
1194     return float64_round_pack_canonical(pr, status);
1195 }
1196 
1197 /*
1198  * Rounds the floating-point value `a' to an integer, and returns the
1199  * result as a floating-point value. The operation is performed
1200  * according to the IEC/IEEE Standard for Binary Floating-Point
1201  * Arithmetic.
1202  */
1203 
1204 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1205 {
1206     if (is_nan(a.cls)) {
1207         return return_nan(a, s);
1208     }
1209 
1210     switch (a.cls) {
1211     case float_class_zero:
1212     case float_class_inf:
1213     case float_class_qnan:
1214         /* already "integral" */
1215         break;
1216     case float_class_normal:
1217         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1218             /* already integral */
1219             break;
1220         }
1221         if (a.exp < 0) {
1222             bool one;
1223             /* all fractional */
1224             s->float_exception_flags |= float_flag_inexact;
1225             switch (rounding_mode) {
1226             case float_round_nearest_even:
1227                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1228                 break;
1229             case float_round_ties_away:
1230                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1231                 break;
1232             case float_round_to_zero:
1233                 one = false;
1234                 break;
1235             case float_round_up:
1236                 one = !a.sign;
1237                 break;
1238             case float_round_down:
1239                 one = a.sign;
1240                 break;
1241             default:
1242                 g_assert_not_reached();
1243             }
1244 
1245             if (one) {
1246                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1247                 a.exp = 0;
1248             } else {
1249                 a.cls = float_class_zero;
1250             }
1251         } else {
1252             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1253             uint64_t frac_lsbm1 = frac_lsb >> 1;
1254             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1255             uint64_t rnd_mask = rnd_even_mask >> 1;
1256             uint64_t inc;
1257 
1258             switch (rounding_mode) {
1259             case float_round_nearest_even:
1260                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1261                 break;
1262             case float_round_ties_away:
1263                 inc = frac_lsbm1;
1264                 break;
1265             case float_round_to_zero:
1266                 inc = 0;
1267                 break;
1268             case float_round_up:
1269                 inc = a.sign ? 0 : rnd_mask;
1270                 break;
1271             case float_round_down:
1272                 inc = a.sign ? rnd_mask : 0;
1273                 break;
1274             default:
1275                 g_assert_not_reached();
1276             }
1277 
1278             if (a.frac & rnd_mask) {
1279                 s->float_exception_flags |= float_flag_inexact;
1280                 a.frac += inc;
1281                 a.frac &= ~rnd_mask;
1282                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1283                     a.frac >>= 1;
1284                     a.exp++;
1285                 }
1286             }
1287         }
1288         break;
1289     default:
1290         g_assert_not_reached();
1291     }
1292     return a;
1293 }
1294 
1295 float16 float16_round_to_int(float16 a, float_status *s)
1296 {
1297     FloatParts pa = float16_unpack_canonical(a, s);
1298     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1299     return float16_round_pack_canonical(pr, s);
1300 }
1301 
1302 float32 float32_round_to_int(float32 a, float_status *s)
1303 {
1304     FloatParts pa = float32_unpack_canonical(a, s);
1305     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1306     return float32_round_pack_canonical(pr, s);
1307 }
1308 
1309 float64 float64_round_to_int(float64 a, float_status *s)
1310 {
1311     FloatParts pa = float64_unpack_canonical(a, s);
1312     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1313     return float64_round_pack_canonical(pr, s);
1314 }
1315 
1316 float64 float64_trunc_to_int(float64 a, float_status *s)
1317 {
1318     FloatParts pa = float64_unpack_canonical(a, s);
1319     FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1320     return float64_round_pack_canonical(pr, s);
1321 }
1322 
1323 /*
1324  * Returns the result of converting the floating-point value `a' to
1325  * the two's complement integer format. The conversion is performed
1326  * according to the IEC/IEEE Standard for Binary Floating-Point
1327  * Arithmetic---which means in particular that the conversion is
1328  * rounded according to the current rounding mode. If `a' is a NaN,
1329  * the largest positive integer is returned. Otherwise, if the
1330  * conversion overflows, the largest integer with the same sign as `a'
1331  * is returned.
1332 */
1333 
1334 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1335                                      int64_t min, int64_t max,
1336                                      float_status *s)
1337 {
1338     uint64_t r;
1339     int orig_flags = get_float_exception_flags(s);
1340     FloatParts p = round_to_int(in, rmode, s);
1341 
1342     switch (p.cls) {
1343     case float_class_snan:
1344     case float_class_qnan:
1345         return max;
1346     case float_class_inf:
1347         return p.sign ? min : max;
1348     case float_class_zero:
1349         return 0;
1350     case float_class_normal:
1351         if (p.exp < DECOMPOSED_BINARY_POINT) {
1352             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1353         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1354             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1355         } else {
1356             r = UINT64_MAX;
1357         }
1358         if (p.sign) {
1359             if (r < -(uint64_t) min) {
1360                 return -r;
1361             } else {
1362                 s->float_exception_flags = orig_flags | float_flag_invalid;
1363                 return min;
1364             }
1365         } else {
1366             if (r < max) {
1367                 return r;
1368             } else {
1369                 s->float_exception_flags = orig_flags | float_flag_invalid;
1370                 return max;
1371             }
1372         }
1373     default:
1374         g_assert_not_reached();
1375     }
1376 }
1377 
1378 #define FLOAT_TO_INT(fsz, isz)                                          \
1379 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a,         \
1380                                                 float_status *s)        \
1381 {                                                                       \
1382     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1383     return round_to_int_and_pack(p, s->float_rounding_mode,             \
1384                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1385                                  s);                                    \
1386 }                                                                       \
1387                                                                         \
1388 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero       \
1389  (float ## fsz a, float_status *s)                                      \
1390 {                                                                       \
1391     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1392     return round_to_int_and_pack(p, float_round_to_zero,                \
1393                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1394                                  s);                                    \
1395 }
1396 
1397 FLOAT_TO_INT(16, 16)
1398 FLOAT_TO_INT(16, 32)
1399 FLOAT_TO_INT(16, 64)
1400 
1401 FLOAT_TO_INT(32, 16)
1402 FLOAT_TO_INT(32, 32)
1403 FLOAT_TO_INT(32, 64)
1404 
1405 FLOAT_TO_INT(64, 16)
1406 FLOAT_TO_INT(64, 32)
1407 FLOAT_TO_INT(64, 64)
1408 
1409 #undef FLOAT_TO_INT
1410 
1411 /*
1412  *  Returns the result of converting the floating-point value `a' to
1413  *  the unsigned integer format. The conversion is performed according
1414  *  to the IEC/IEEE Standard for Binary Floating-Point
1415  *  Arithmetic---which means in particular that the conversion is
1416  *  rounded according to the current rounding mode. If `a' is a NaN,
1417  *  the largest unsigned integer is returned. Otherwise, if the
1418  *  conversion overflows, the largest unsigned integer is returned. If
1419  *  the 'a' is negative, the result is rounded and zero is returned;
1420  *  values that do not round to zero will raise the inexact exception
1421  *  flag.
1422  */
1423 
1424 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1425                                        float_status *s)
1426 {
1427     int orig_flags = get_float_exception_flags(s);
1428     FloatParts p = round_to_int(in, rmode, s);
1429 
1430     switch (p.cls) {
1431     case float_class_snan:
1432     case float_class_qnan:
1433         s->float_exception_flags = orig_flags | float_flag_invalid;
1434         return max;
1435     case float_class_inf:
1436         return p.sign ? 0 : max;
1437     case float_class_zero:
1438         return 0;
1439     case float_class_normal:
1440     {
1441         uint64_t r;
1442         if (p.sign) {
1443             s->float_exception_flags = orig_flags | float_flag_invalid;
1444             return 0;
1445         }
1446 
1447         if (p.exp < DECOMPOSED_BINARY_POINT) {
1448             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1449         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1450             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1451         } else {
1452             s->float_exception_flags = orig_flags | float_flag_invalid;
1453             return max;
1454         }
1455 
1456         /* For uint64 this will never trip, but if p.exp is too large
1457          * to shift a decomposed fraction we shall have exited via the
1458          * 3rd leg above.
1459          */
1460         if (r > max) {
1461             s->float_exception_flags = orig_flags | float_flag_invalid;
1462             return max;
1463         } else {
1464             return r;
1465         }
1466     }
1467     default:
1468         g_assert_not_reached();
1469     }
1470 }
1471 
1472 #define FLOAT_TO_UINT(fsz, isz) \
1473 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a,       \
1474                                                   float_status *s)      \
1475 {                                                                       \
1476     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1477     return round_to_uint_and_pack(p, s->float_rounding_mode,            \
1478                                  UINT ## isz ## _MAX, s);               \
1479 }                                                                       \
1480                                                                         \
1481 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero     \
1482  (float ## fsz a, float_status *s)                                      \
1483 {                                                                       \
1484     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1485     return round_to_uint_and_pack(p, s->float_rounding_mode,            \
1486                                  UINT ## isz ## _MAX, s);               \
1487 }
1488 
1489 FLOAT_TO_UINT(16, 16)
1490 FLOAT_TO_UINT(16, 32)
1491 FLOAT_TO_UINT(16, 64)
1492 
1493 FLOAT_TO_UINT(32, 16)
1494 FLOAT_TO_UINT(32, 32)
1495 FLOAT_TO_UINT(32, 64)
1496 
1497 FLOAT_TO_UINT(64, 16)
1498 FLOAT_TO_UINT(64, 32)
1499 FLOAT_TO_UINT(64, 64)
1500 
1501 #undef FLOAT_TO_UINT
1502 
1503 /*
1504  * Integer to float conversions
1505  *
1506  * Returns the result of converting the two's complement integer `a'
1507  * to the floating-point format. The conversion is performed according
1508  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1509  */
1510 
1511 static FloatParts int_to_float(int64_t a, float_status *status)
1512 {
1513     FloatParts r;
1514     if (a == 0) {
1515         r.cls = float_class_zero;
1516         r.sign = false;
1517     } else if (a == (1ULL << 63)) {
1518         r.cls = float_class_normal;
1519         r.sign = true;
1520         r.frac = DECOMPOSED_IMPLICIT_BIT;
1521         r.exp = 63;
1522     } else {
1523         uint64_t f;
1524         if (a < 0) {
1525             f = -a;
1526             r.sign = true;
1527         } else {
1528             f = a;
1529             r.sign = false;
1530         }
1531         int shift = clz64(f) - 1;
1532         r.cls = float_class_normal;
1533         r.exp = (DECOMPOSED_BINARY_POINT - shift);
1534         r.frac = f << shift;
1535     }
1536 
1537     return r;
1538 }
1539 
1540 float16 int64_to_float16(int64_t a, float_status *status)
1541 {
1542     FloatParts pa = int_to_float(a, status);
1543     return float16_round_pack_canonical(pa, status);
1544 }
1545 
1546 float16 int32_to_float16(int32_t a, float_status *status)
1547 {
1548     return int64_to_float16(a, status);
1549 }
1550 
1551 float16 int16_to_float16(int16_t a, float_status *status)
1552 {
1553     return int64_to_float16(a, status);
1554 }
1555 
1556 float32 int64_to_float32(int64_t a, float_status *status)
1557 {
1558     FloatParts pa = int_to_float(a, status);
1559     return float32_round_pack_canonical(pa, status);
1560 }
1561 
1562 float32 int32_to_float32(int32_t a, float_status *status)
1563 {
1564     return int64_to_float32(a, status);
1565 }
1566 
1567 float32 int16_to_float32(int16_t a, float_status *status)
1568 {
1569     return int64_to_float32(a, status);
1570 }
1571 
1572 float64 int64_to_float64(int64_t a, float_status *status)
1573 {
1574     FloatParts pa = int_to_float(a, status);
1575     return float64_round_pack_canonical(pa, status);
1576 }
1577 
1578 float64 int32_to_float64(int32_t a, float_status *status)
1579 {
1580     return int64_to_float64(a, status);
1581 }
1582 
1583 float64 int16_to_float64(int16_t a, float_status *status)
1584 {
1585     return int64_to_float64(a, status);
1586 }
1587 
1588 
1589 /*
1590  * Unsigned Integer to float conversions
1591  *
1592  * Returns the result of converting the unsigned integer `a' to the
1593  * floating-point format. The conversion is performed according to the
1594  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1595  */
1596 
1597 static FloatParts uint_to_float(uint64_t a, float_status *status)
1598 {
1599     FloatParts r = { .sign = false};
1600 
1601     if (a == 0) {
1602         r.cls = float_class_zero;
1603     } else {
1604         int spare_bits = clz64(a) - 1;
1605         r.cls = float_class_normal;
1606         r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1607         if (spare_bits < 0) {
1608             shift64RightJamming(a, -spare_bits, &a);
1609             r.frac = a;
1610         } else {
1611             r.frac = a << spare_bits;
1612         }
1613     }
1614 
1615     return r;
1616 }
1617 
1618 float16 uint64_to_float16(uint64_t a, float_status *status)
1619 {
1620     FloatParts pa = uint_to_float(a, status);
1621     return float16_round_pack_canonical(pa, status);
1622 }
1623 
1624 float16 uint32_to_float16(uint32_t a, float_status *status)
1625 {
1626     return uint64_to_float16(a, status);
1627 }
1628 
1629 float16 uint16_to_float16(uint16_t a, float_status *status)
1630 {
1631     return uint64_to_float16(a, status);
1632 }
1633 
1634 float32 uint64_to_float32(uint64_t a, float_status *status)
1635 {
1636     FloatParts pa = uint_to_float(a, status);
1637     return float32_round_pack_canonical(pa, status);
1638 }
1639 
1640 float32 uint32_to_float32(uint32_t a, float_status *status)
1641 {
1642     return uint64_to_float32(a, status);
1643 }
1644 
1645 float32 uint16_to_float32(uint16_t a, float_status *status)
1646 {
1647     return uint64_to_float32(a, status);
1648 }
1649 
1650 float64 uint64_to_float64(uint64_t a, float_status *status)
1651 {
1652     FloatParts pa = uint_to_float(a, status);
1653     return float64_round_pack_canonical(pa, status);
1654 }
1655 
1656 float64 uint32_to_float64(uint32_t a, float_status *status)
1657 {
1658     return uint64_to_float64(a, status);
1659 }
1660 
1661 float64 uint16_to_float64(uint16_t a, float_status *status)
1662 {
1663     return uint64_to_float64(a, status);
1664 }
1665 
1666 /* Float Min/Max */
1667 /* min() and max() functions. These can't be implemented as
1668  * 'compare and pick one input' because that would mishandle
1669  * NaNs and +0 vs -0.
1670  *
1671  * minnum() and maxnum() functions. These are similar to the min()
1672  * and max() functions but if one of the arguments is a QNaN and
1673  * the other is numerical then the numerical argument is returned.
1674  * SNaNs will get quietened before being returned.
1675  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1676  * and maxNum() operations. min() and max() are the typical min/max
1677  * semantics provided by many CPUs which predate that specification.
1678  *
1679  * minnummag() and maxnummag() functions correspond to minNumMag()
1680  * and minNumMag() from the IEEE-754 2008.
1681  */
1682 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1683                                 bool ieee, bool ismag, float_status *s)
1684 {
1685     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1686         if (ieee) {
1687             /* Takes two floating-point values `a' and `b', one of
1688              * which is a NaN, and returns the appropriate NaN
1689              * result. If either `a' or `b' is a signaling NaN,
1690              * the invalid exception is raised.
1691              */
1692             if (is_snan(a.cls) || is_snan(b.cls)) {
1693                 return pick_nan(a, b, s);
1694             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1695                 return b;
1696             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1697                 return a;
1698             }
1699         }
1700         return pick_nan(a, b, s);
1701     } else {
1702         int a_exp, b_exp;
1703         bool a_sign, b_sign;
1704 
1705         switch (a.cls) {
1706         case float_class_normal:
1707             a_exp = a.exp;
1708             break;
1709         case float_class_inf:
1710             a_exp = INT_MAX;
1711             break;
1712         case float_class_zero:
1713             a_exp = INT_MIN;
1714             break;
1715         default:
1716             g_assert_not_reached();
1717             break;
1718         }
1719         switch (b.cls) {
1720         case float_class_normal:
1721             b_exp = b.exp;
1722             break;
1723         case float_class_inf:
1724             b_exp = INT_MAX;
1725             break;
1726         case float_class_zero:
1727             b_exp = INT_MIN;
1728             break;
1729         default:
1730             g_assert_not_reached();
1731             break;
1732         }
1733 
1734         a_sign = a.sign;
1735         b_sign = b.sign;
1736         if (ismag) {
1737             a_sign = b_sign = 0;
1738         }
1739 
1740         if (a_sign == b_sign) {
1741             bool a_less = a_exp < b_exp;
1742             if (a_exp == b_exp) {
1743                 a_less = a.frac < b.frac;
1744             }
1745             return a_sign ^ a_less ^ ismin ? b : a;
1746         } else {
1747             return a_sign ^ ismin ? b : a;
1748         }
1749     }
1750 }
1751 
1752 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
1753 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
1754                                      float_status *s)                   \
1755 {                                                                       \
1756     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1757     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1758     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
1759                                                                         \
1760     return float ## sz ## _round_pack_canonical(pr, s);                 \
1761 }
1762 
1763 MINMAX(16, min, true, false, false)
1764 MINMAX(16, minnum, true, true, false)
1765 MINMAX(16, minnummag, true, true, true)
1766 MINMAX(16, max, false, false, false)
1767 MINMAX(16, maxnum, false, true, false)
1768 MINMAX(16, maxnummag, false, true, true)
1769 
1770 MINMAX(32, min, true, false, false)
1771 MINMAX(32, minnum, true, true, false)
1772 MINMAX(32, minnummag, true, true, true)
1773 MINMAX(32, max, false, false, false)
1774 MINMAX(32, maxnum, false, true, false)
1775 MINMAX(32, maxnummag, false, true, true)
1776 
1777 MINMAX(64, min, true, false, false)
1778 MINMAX(64, minnum, true, true, false)
1779 MINMAX(64, minnummag, true, true, true)
1780 MINMAX(64, max, false, false, false)
1781 MINMAX(64, maxnum, false, true, false)
1782 MINMAX(64, maxnummag, false, true, true)
1783 
1784 #undef MINMAX
1785 
1786 /* Floating point compare */
1787 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1788                           float_status *s)
1789 {
1790     if (is_nan(a.cls) || is_nan(b.cls)) {
1791         if (!is_quiet ||
1792             a.cls == float_class_snan ||
1793             b.cls == float_class_snan) {
1794             s->float_exception_flags |= float_flag_invalid;
1795         }
1796         return float_relation_unordered;
1797     }
1798 
1799     if (a.cls == float_class_zero) {
1800         if (b.cls == float_class_zero) {
1801             return float_relation_equal;
1802         }
1803         return b.sign ? float_relation_greater : float_relation_less;
1804     } else if (b.cls == float_class_zero) {
1805         return a.sign ? float_relation_less : float_relation_greater;
1806     }
1807 
1808     /* The only really important thing about infinity is its sign. If
1809      * both are infinities the sign marks the smallest of the two.
1810      */
1811     if (a.cls == float_class_inf) {
1812         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1813             return float_relation_equal;
1814         }
1815         return a.sign ? float_relation_less : float_relation_greater;
1816     } else if (b.cls == float_class_inf) {
1817         return b.sign ? float_relation_greater : float_relation_less;
1818     }
1819 
1820     if (a.sign != b.sign) {
1821         return a.sign ? float_relation_less : float_relation_greater;
1822     }
1823 
1824     if (a.exp == b.exp) {
1825         if (a.frac == b.frac) {
1826             return float_relation_equal;
1827         }
1828         if (a.sign) {
1829             return a.frac > b.frac ?
1830                 float_relation_less : float_relation_greater;
1831         } else {
1832             return a.frac > b.frac ?
1833                 float_relation_greater : float_relation_less;
1834         }
1835     } else {
1836         if (a.sign) {
1837             return a.exp > b.exp ? float_relation_less : float_relation_greater;
1838         } else {
1839             return a.exp > b.exp ? float_relation_greater : float_relation_less;
1840         }
1841     }
1842 }
1843 
1844 #define COMPARE(sz)                                                     \
1845 int float ## sz ## _compare(float ## sz a, float ## sz b,               \
1846                             float_status *s)                            \
1847 {                                                                       \
1848     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1849     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1850     return compare_floats(pa, pb, false, s);                            \
1851 }                                                                       \
1852 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b,         \
1853                                   float_status *s)                      \
1854 {                                                                       \
1855     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1856     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1857     return compare_floats(pa, pb, true, s);                             \
1858 }
1859 
1860 COMPARE(16)
1861 COMPARE(32)
1862 COMPARE(64)
1863 
1864 #undef COMPARE
1865 
1866 /* Multiply A by 2 raised to the power N.  */
1867 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1868 {
1869     if (unlikely(is_nan(a.cls))) {
1870         return return_nan(a, s);
1871     }
1872     if (a.cls == float_class_normal) {
1873         a.exp += n;
1874     }
1875     return a;
1876 }
1877 
1878 float16 float16_scalbn(float16 a, int n, float_status *status)
1879 {
1880     FloatParts pa = float16_unpack_canonical(a, status);
1881     FloatParts pr = scalbn_decomposed(pa, n, status);
1882     return float16_round_pack_canonical(pr, status);
1883 }
1884 
1885 float32 float32_scalbn(float32 a, int n, float_status *status)
1886 {
1887     FloatParts pa = float32_unpack_canonical(a, status);
1888     FloatParts pr = scalbn_decomposed(pa, n, status);
1889     return float32_round_pack_canonical(pr, status);
1890 }
1891 
1892 float64 float64_scalbn(float64 a, int n, float_status *status)
1893 {
1894     FloatParts pa = float64_unpack_canonical(a, status);
1895     FloatParts pr = scalbn_decomposed(pa, n, status);
1896     return float64_round_pack_canonical(pr, status);
1897 }
1898 
1899 /*
1900  * Square Root
1901  *
1902  * The old softfloat code did an approximation step before zeroing in
1903  * on the final result. However for simpleness we just compute the
1904  * square root by iterating down from the implicit bit to enough extra
1905  * bits to ensure we get a correctly rounded result.
1906  *
1907  * This does mean however the calculation is slower than before,
1908  * especially for 64 bit floats.
1909  */
1910 
1911 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
1912 {
1913     uint64_t a_frac, r_frac, s_frac;
1914     int bit, last_bit;
1915 
1916     if (is_nan(a.cls)) {
1917         return return_nan(a, s);
1918     }
1919     if (a.cls == float_class_zero) {
1920         return a;  /* sqrt(+-0) = +-0 */
1921     }
1922     if (a.sign) {
1923         s->float_exception_flags |= float_flag_invalid;
1924         a.cls = float_class_dnan;
1925         return a;
1926     }
1927     if (a.cls == float_class_inf) {
1928         return a;  /* sqrt(+inf) = +inf */
1929     }
1930 
1931     assert(a.cls == float_class_normal);
1932 
1933     /* We need two overflow bits at the top. Adding room for that is a
1934      * right shift. If the exponent is odd, we can discard the low bit
1935      * by multiplying the fraction by 2; that's a left shift. Combine
1936      * those and we shift right if the exponent is even.
1937      */
1938     a_frac = a.frac;
1939     if (!(a.exp & 1)) {
1940         a_frac >>= 1;
1941     }
1942     a.exp >>= 1;
1943 
1944     /* Bit-by-bit computation of sqrt.  */
1945     r_frac = 0;
1946     s_frac = 0;
1947 
1948     /* Iterate from implicit bit down to the 3 extra bits to compute a
1949      * properly rounded result. Remember we've inserted one more bit
1950      * at the top, so these positions are one less.
1951      */
1952     bit = DECOMPOSED_BINARY_POINT - 1;
1953     last_bit = MAX(p->frac_shift - 4, 0);
1954     do {
1955         uint64_t q = 1ULL << bit;
1956         uint64_t t_frac = s_frac + q;
1957         if (t_frac <= a_frac) {
1958             s_frac = t_frac + q;
1959             a_frac -= t_frac;
1960             r_frac += q;
1961         }
1962         a_frac <<= 1;
1963     } while (--bit >= last_bit);
1964 
1965     /* Undo the right shift done above. If there is any remaining
1966      * fraction, the result is inexact. Set the sticky bit.
1967      */
1968     a.frac = (r_frac << 1) + (a_frac != 0);
1969 
1970     return a;
1971 }
1972 
1973 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
1974 {
1975     FloatParts pa = float16_unpack_canonical(a, status);
1976     FloatParts pr = sqrt_float(pa, status, &float16_params);
1977     return float16_round_pack_canonical(pr, status);
1978 }
1979 
1980 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
1981 {
1982     FloatParts pa = float32_unpack_canonical(a, status);
1983     FloatParts pr = sqrt_float(pa, status, &float32_params);
1984     return float32_round_pack_canonical(pr, status);
1985 }
1986 
1987 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
1988 {
1989     FloatParts pa = float64_unpack_canonical(a, status);
1990     FloatParts pr = sqrt_float(pa, status, &float64_params);
1991     return float64_round_pack_canonical(pr, status);
1992 }
1993 
1994 
1995 /*----------------------------------------------------------------------------
1996 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
1997 | and 7, and returns the properly rounded 32-bit integer corresponding to the
1998 | input.  If `zSign' is 1, the input is negated before being converted to an
1999 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
2000 | is simply rounded to an integer, with the inexact exception raised if the
2001 | input cannot be represented exactly as an integer.  However, if the fixed-
2002 | point input is too large, the invalid exception is raised and the largest
2003 | positive or negative integer is returned.
2004 *----------------------------------------------------------------------------*/
2005 
2006 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2007 {
2008     int8_t roundingMode;
2009     flag roundNearestEven;
2010     int8_t roundIncrement, roundBits;
2011     int32_t z;
2012 
2013     roundingMode = status->float_rounding_mode;
2014     roundNearestEven = ( roundingMode == float_round_nearest_even );
2015     switch (roundingMode) {
2016     case float_round_nearest_even:
2017     case float_round_ties_away:
2018         roundIncrement = 0x40;
2019         break;
2020     case float_round_to_zero:
2021         roundIncrement = 0;
2022         break;
2023     case float_round_up:
2024         roundIncrement = zSign ? 0 : 0x7f;
2025         break;
2026     case float_round_down:
2027         roundIncrement = zSign ? 0x7f : 0;
2028         break;
2029     default:
2030         abort();
2031     }
2032     roundBits = absZ & 0x7F;
2033     absZ = ( absZ + roundIncrement )>>7;
2034     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2035     z = absZ;
2036     if ( zSign ) z = - z;
2037     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2038         float_raise(float_flag_invalid, status);
2039         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2040     }
2041     if (roundBits) {
2042         status->float_exception_flags |= float_flag_inexact;
2043     }
2044     return z;
2045 
2046 }
2047 
2048 /*----------------------------------------------------------------------------
2049 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2050 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2051 | and returns the properly rounded 64-bit integer corresponding to the input.
2052 | If `zSign' is 1, the input is negated before being converted to an integer.
2053 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2054 | the inexact exception raised if the input cannot be represented exactly as
2055 | an integer.  However, if the fixed-point input is too large, the invalid
2056 | exception is raised and the largest positive or negative integer is
2057 | returned.
2058 *----------------------------------------------------------------------------*/
2059 
2060 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2061                                float_status *status)
2062 {
2063     int8_t roundingMode;
2064     flag roundNearestEven, increment;
2065     int64_t z;
2066 
2067     roundingMode = status->float_rounding_mode;
2068     roundNearestEven = ( roundingMode == float_round_nearest_even );
2069     switch (roundingMode) {
2070     case float_round_nearest_even:
2071     case float_round_ties_away:
2072         increment = ((int64_t) absZ1 < 0);
2073         break;
2074     case float_round_to_zero:
2075         increment = 0;
2076         break;
2077     case float_round_up:
2078         increment = !zSign && absZ1;
2079         break;
2080     case float_round_down:
2081         increment = zSign && absZ1;
2082         break;
2083     default:
2084         abort();
2085     }
2086     if ( increment ) {
2087         ++absZ0;
2088         if ( absZ0 == 0 ) goto overflow;
2089         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2090     }
2091     z = absZ0;
2092     if ( zSign ) z = - z;
2093     if ( z && ( ( z < 0 ) ^ zSign ) ) {
2094  overflow:
2095         float_raise(float_flag_invalid, status);
2096         return
2097               zSign ? (int64_t) LIT64( 0x8000000000000000 )
2098             : LIT64( 0x7FFFFFFFFFFFFFFF );
2099     }
2100     if (absZ1) {
2101         status->float_exception_flags |= float_flag_inexact;
2102     }
2103     return z;
2104 
2105 }
2106 
2107 /*----------------------------------------------------------------------------
2108 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2109 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2110 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2111 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
2112 | with the inexact exception raised if the input cannot be represented exactly
2113 | as an integer.  However, if the fixed-point input is too large, the invalid
2114 | exception is raised and the largest unsigned integer is returned.
2115 *----------------------------------------------------------------------------*/
2116 
2117 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2118                                 uint64_t absZ1, float_status *status)
2119 {
2120     int8_t roundingMode;
2121     flag roundNearestEven, increment;
2122 
2123     roundingMode = status->float_rounding_mode;
2124     roundNearestEven = (roundingMode == float_round_nearest_even);
2125     switch (roundingMode) {
2126     case float_round_nearest_even:
2127     case float_round_ties_away:
2128         increment = ((int64_t)absZ1 < 0);
2129         break;
2130     case float_round_to_zero:
2131         increment = 0;
2132         break;
2133     case float_round_up:
2134         increment = !zSign && absZ1;
2135         break;
2136     case float_round_down:
2137         increment = zSign && absZ1;
2138         break;
2139     default:
2140         abort();
2141     }
2142     if (increment) {
2143         ++absZ0;
2144         if (absZ0 == 0) {
2145             float_raise(float_flag_invalid, status);
2146             return LIT64(0xFFFFFFFFFFFFFFFF);
2147         }
2148         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2149     }
2150 
2151     if (zSign && absZ0) {
2152         float_raise(float_flag_invalid, status);
2153         return 0;
2154     }
2155 
2156     if (absZ1) {
2157         status->float_exception_flags |= float_flag_inexact;
2158     }
2159     return absZ0;
2160 }
2161 
2162 /*----------------------------------------------------------------------------
2163 | If `a' is denormal and we are in flush-to-zero mode then set the
2164 | input-denormal exception and return zero. Otherwise just return the value.
2165 *----------------------------------------------------------------------------*/
2166 float32 float32_squash_input_denormal(float32 a, float_status *status)
2167 {
2168     if (status->flush_inputs_to_zero) {
2169         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2170             float_raise(float_flag_input_denormal, status);
2171             return make_float32(float32_val(a) & 0x80000000);
2172         }
2173     }
2174     return a;
2175 }
2176 
2177 /*----------------------------------------------------------------------------
2178 | Normalizes the subnormal single-precision floating-point value represented
2179 | by the denormalized significand `aSig'.  The normalized exponent and
2180 | significand are stored at the locations pointed to by `zExpPtr' and
2181 | `zSigPtr', respectively.
2182 *----------------------------------------------------------------------------*/
2183 
2184 static void
2185  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2186 {
2187     int8_t shiftCount;
2188 
2189     shiftCount = countLeadingZeros32( aSig ) - 8;
2190     *zSigPtr = aSig<<shiftCount;
2191     *zExpPtr = 1 - shiftCount;
2192 
2193 }
2194 
2195 /*----------------------------------------------------------------------------
2196 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2197 | and significand `zSig', and returns the proper single-precision floating-
2198 | point value corresponding to the abstract input.  Ordinarily, the abstract
2199 | value is simply rounded and packed into the single-precision format, with
2200 | the inexact exception raised if the abstract input cannot be represented
2201 | exactly.  However, if the abstract value is too large, the overflow and
2202 | inexact exceptions are raised and an infinity or maximal finite value is
2203 | returned.  If the abstract value is too small, the input value is rounded to
2204 | a subnormal number, and the underflow and inexact exceptions are raised if
2205 | the abstract input cannot be represented exactly as a subnormal single-
2206 | precision floating-point number.
2207 |     The input significand `zSig' has its binary point between bits 30
2208 | and 29, which is 7 bits to the left of the usual location.  This shifted
2209 | significand must be normalized or smaller.  If `zSig' is not normalized,
2210 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2211 | and it must not require rounding.  In the usual case that `zSig' is
2212 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2213 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2214 | Binary Floating-Point Arithmetic.
2215 *----------------------------------------------------------------------------*/
2216 
2217 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2218                                    float_status *status)
2219 {
2220     int8_t roundingMode;
2221     flag roundNearestEven;
2222     int8_t roundIncrement, roundBits;
2223     flag isTiny;
2224 
2225     roundingMode = status->float_rounding_mode;
2226     roundNearestEven = ( roundingMode == float_round_nearest_even );
2227     switch (roundingMode) {
2228     case float_round_nearest_even:
2229     case float_round_ties_away:
2230         roundIncrement = 0x40;
2231         break;
2232     case float_round_to_zero:
2233         roundIncrement = 0;
2234         break;
2235     case float_round_up:
2236         roundIncrement = zSign ? 0 : 0x7f;
2237         break;
2238     case float_round_down:
2239         roundIncrement = zSign ? 0x7f : 0;
2240         break;
2241     default:
2242         abort();
2243         break;
2244     }
2245     roundBits = zSig & 0x7F;
2246     if ( 0xFD <= (uint16_t) zExp ) {
2247         if (    ( 0xFD < zExp )
2248              || (    ( zExp == 0xFD )
2249                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2250            ) {
2251             float_raise(float_flag_overflow | float_flag_inexact, status);
2252             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2253         }
2254         if ( zExp < 0 ) {
2255             if (status->flush_to_zero) {
2256                 float_raise(float_flag_output_denormal, status);
2257                 return packFloat32(zSign, 0, 0);
2258             }
2259             isTiny =
2260                 (status->float_detect_tininess
2261                  == float_tininess_before_rounding)
2262                 || ( zExp < -1 )
2263                 || ( zSig + roundIncrement < 0x80000000 );
2264             shift32RightJamming( zSig, - zExp, &zSig );
2265             zExp = 0;
2266             roundBits = zSig & 0x7F;
2267             if (isTiny && roundBits) {
2268                 float_raise(float_flag_underflow, status);
2269             }
2270         }
2271     }
2272     if (roundBits) {
2273         status->float_exception_flags |= float_flag_inexact;
2274     }
2275     zSig = ( zSig + roundIncrement )>>7;
2276     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2277     if ( zSig == 0 ) zExp = 0;
2278     return packFloat32( zSign, zExp, zSig );
2279 
2280 }
2281 
2282 /*----------------------------------------------------------------------------
2283 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2284 | and significand `zSig', and returns the proper single-precision floating-
2285 | point value corresponding to the abstract input.  This routine is just like
2286 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2287 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2288 | floating-point exponent.
2289 *----------------------------------------------------------------------------*/
2290 
2291 static float32
2292  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2293                               float_status *status)
2294 {
2295     int8_t shiftCount;
2296 
2297     shiftCount = countLeadingZeros32( zSig ) - 1;
2298     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2299                                status);
2300 
2301 }
2302 
2303 /*----------------------------------------------------------------------------
2304 | If `a' is denormal and we are in flush-to-zero mode then set the
2305 | input-denormal exception and return zero. Otherwise just return the value.
2306 *----------------------------------------------------------------------------*/
2307 float64 float64_squash_input_denormal(float64 a, float_status *status)
2308 {
2309     if (status->flush_inputs_to_zero) {
2310         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2311             float_raise(float_flag_input_denormal, status);
2312             return make_float64(float64_val(a) & (1ULL << 63));
2313         }
2314     }
2315     return a;
2316 }
2317 
2318 /*----------------------------------------------------------------------------
2319 | Normalizes the subnormal double-precision floating-point value represented
2320 | by the denormalized significand `aSig'.  The normalized exponent and
2321 | significand are stored at the locations pointed to by `zExpPtr' and
2322 | `zSigPtr', respectively.
2323 *----------------------------------------------------------------------------*/
2324 
2325 static void
2326  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2327 {
2328     int8_t shiftCount;
2329 
2330     shiftCount = countLeadingZeros64( aSig ) - 11;
2331     *zSigPtr = aSig<<shiftCount;
2332     *zExpPtr = 1 - shiftCount;
2333 
2334 }
2335 
2336 /*----------------------------------------------------------------------------
2337 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2338 | double-precision floating-point value, returning the result.  After being
2339 | shifted into the proper positions, the three fields are simply added
2340 | together to form the result.  This means that any integer portion of `zSig'
2341 | will be added into the exponent.  Since a properly normalized significand
2342 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2343 | than the desired result exponent whenever `zSig' is a complete, normalized
2344 | significand.
2345 *----------------------------------------------------------------------------*/
2346 
2347 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2348 {
2349 
2350     return make_float64(
2351         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2352 
2353 }
2354 
2355 /*----------------------------------------------------------------------------
2356 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2357 | and significand `zSig', and returns the proper double-precision floating-
2358 | point value corresponding to the abstract input.  Ordinarily, the abstract
2359 | value is simply rounded and packed into the double-precision format, with
2360 | the inexact exception raised if the abstract input cannot be represented
2361 | exactly.  However, if the abstract value is too large, the overflow and
2362 | inexact exceptions are raised and an infinity or maximal finite value is
2363 | returned.  If the abstract value is too small, the input value is rounded to
2364 | a subnormal number, and the underflow and inexact exceptions are raised if
2365 | the abstract input cannot be represented exactly as a subnormal double-
2366 | precision floating-point number.
2367 |     The input significand `zSig' has its binary point between bits 62
2368 | and 61, which is 10 bits to the left of the usual location.  This shifted
2369 | significand must be normalized or smaller.  If `zSig' is not normalized,
2370 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2371 | and it must not require rounding.  In the usual case that `zSig' is
2372 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2373 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2374 | Binary Floating-Point Arithmetic.
2375 *----------------------------------------------------------------------------*/
2376 
2377 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2378                                    float_status *status)
2379 {
2380     int8_t roundingMode;
2381     flag roundNearestEven;
2382     int roundIncrement, roundBits;
2383     flag isTiny;
2384 
2385     roundingMode = status->float_rounding_mode;
2386     roundNearestEven = ( roundingMode == float_round_nearest_even );
2387     switch (roundingMode) {
2388     case float_round_nearest_even:
2389     case float_round_ties_away:
2390         roundIncrement = 0x200;
2391         break;
2392     case float_round_to_zero:
2393         roundIncrement = 0;
2394         break;
2395     case float_round_up:
2396         roundIncrement = zSign ? 0 : 0x3ff;
2397         break;
2398     case float_round_down:
2399         roundIncrement = zSign ? 0x3ff : 0;
2400         break;
2401     case float_round_to_odd:
2402         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2403         break;
2404     default:
2405         abort();
2406     }
2407     roundBits = zSig & 0x3FF;
2408     if ( 0x7FD <= (uint16_t) zExp ) {
2409         if (    ( 0x7FD < zExp )
2410              || (    ( zExp == 0x7FD )
2411                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2412            ) {
2413             bool overflow_to_inf = roundingMode != float_round_to_odd &&
2414                                    roundIncrement != 0;
2415             float_raise(float_flag_overflow | float_flag_inexact, status);
2416             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2417         }
2418         if ( zExp < 0 ) {
2419             if (status->flush_to_zero) {
2420                 float_raise(float_flag_output_denormal, status);
2421                 return packFloat64(zSign, 0, 0);
2422             }
2423             isTiny =
2424                    (status->float_detect_tininess
2425                     == float_tininess_before_rounding)
2426                 || ( zExp < -1 )
2427                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2428             shift64RightJamming( zSig, - zExp, &zSig );
2429             zExp = 0;
2430             roundBits = zSig & 0x3FF;
2431             if (isTiny && roundBits) {
2432                 float_raise(float_flag_underflow, status);
2433             }
2434             if (roundingMode == float_round_to_odd) {
2435                 /*
2436                  * For round-to-odd case, the roundIncrement depends on
2437                  * zSig which just changed.
2438                  */
2439                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2440             }
2441         }
2442     }
2443     if (roundBits) {
2444         status->float_exception_flags |= float_flag_inexact;
2445     }
2446     zSig = ( zSig + roundIncrement )>>10;
2447     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2448     if ( zSig == 0 ) zExp = 0;
2449     return packFloat64( zSign, zExp, zSig );
2450 
2451 }
2452 
2453 /*----------------------------------------------------------------------------
2454 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2455 | and significand `zSig', and returns the proper double-precision floating-
2456 | point value corresponding to the abstract input.  This routine is just like
2457 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2458 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2459 | floating-point exponent.
2460 *----------------------------------------------------------------------------*/
2461 
2462 static float64
2463  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2464                               float_status *status)
2465 {
2466     int8_t shiftCount;
2467 
2468     shiftCount = countLeadingZeros64( zSig ) - 1;
2469     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2470                                status);
2471 
2472 }
2473 
2474 /*----------------------------------------------------------------------------
2475 | Normalizes the subnormal extended double-precision floating-point value
2476 | represented by the denormalized significand `aSig'.  The normalized exponent
2477 | and significand are stored at the locations pointed to by `zExpPtr' and
2478 | `zSigPtr', respectively.
2479 *----------------------------------------------------------------------------*/
2480 
2481 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
2482                                 uint64_t *zSigPtr)
2483 {
2484     int8_t shiftCount;
2485 
2486     shiftCount = countLeadingZeros64( aSig );
2487     *zSigPtr = aSig<<shiftCount;
2488     *zExpPtr = 1 - shiftCount;
2489 }
2490 
2491 /*----------------------------------------------------------------------------
2492 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2493 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2494 | and returns the proper extended double-precision floating-point value
2495 | corresponding to the abstract input.  Ordinarily, the abstract value is
2496 | rounded and packed into the extended double-precision format, with the
2497 | inexact exception raised if the abstract input cannot be represented
2498 | exactly.  However, if the abstract value is too large, the overflow and
2499 | inexact exceptions are raised and an infinity or maximal finite value is
2500 | returned.  If the abstract value is too small, the input value is rounded to
2501 | a subnormal number, and the underflow and inexact exceptions are raised if
2502 | the abstract input cannot be represented exactly as a subnormal extended
2503 | double-precision floating-point number.
2504 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
2505 | number of bits as single or double precision, respectively.  Otherwise, the
2506 | result is rounded to the full precision of the extended double-precision
2507 | format.
2508 |     The input significand must be normalized or smaller.  If the input
2509 | significand is not normalized, `zExp' must be 0; in that case, the result
2510 | returned is a subnormal number, and it must not require rounding.  The
2511 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2512 | Floating-Point Arithmetic.
2513 *----------------------------------------------------------------------------*/
2514 
2515 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2516                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2517                               float_status *status)
2518 {
2519     int8_t roundingMode;
2520     flag roundNearestEven, increment, isTiny;
2521     int64_t roundIncrement, roundMask, roundBits;
2522 
2523     roundingMode = status->float_rounding_mode;
2524     roundNearestEven = ( roundingMode == float_round_nearest_even );
2525     if ( roundingPrecision == 80 ) goto precision80;
2526     if ( roundingPrecision == 64 ) {
2527         roundIncrement = LIT64( 0x0000000000000400 );
2528         roundMask = LIT64( 0x00000000000007FF );
2529     }
2530     else if ( roundingPrecision == 32 ) {
2531         roundIncrement = LIT64( 0x0000008000000000 );
2532         roundMask = LIT64( 0x000000FFFFFFFFFF );
2533     }
2534     else {
2535         goto precision80;
2536     }
2537     zSig0 |= ( zSig1 != 0 );
2538     switch (roundingMode) {
2539     case float_round_nearest_even:
2540     case float_round_ties_away:
2541         break;
2542     case float_round_to_zero:
2543         roundIncrement = 0;
2544         break;
2545     case float_round_up:
2546         roundIncrement = zSign ? 0 : roundMask;
2547         break;
2548     case float_round_down:
2549         roundIncrement = zSign ? roundMask : 0;
2550         break;
2551     default:
2552         abort();
2553     }
2554     roundBits = zSig0 & roundMask;
2555     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2556         if (    ( 0x7FFE < zExp )
2557              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2558            ) {
2559             goto overflow;
2560         }
2561         if ( zExp <= 0 ) {
2562             if (status->flush_to_zero) {
2563                 float_raise(float_flag_output_denormal, status);
2564                 return packFloatx80(zSign, 0, 0);
2565             }
2566             isTiny =
2567                    (status->float_detect_tininess
2568                     == float_tininess_before_rounding)
2569                 || ( zExp < 0 )
2570                 || ( zSig0 <= zSig0 + roundIncrement );
2571             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2572             zExp = 0;
2573             roundBits = zSig0 & roundMask;
2574             if (isTiny && roundBits) {
2575                 float_raise(float_flag_underflow, status);
2576             }
2577             if (roundBits) {
2578                 status->float_exception_flags |= float_flag_inexact;
2579             }
2580             zSig0 += roundIncrement;
2581             if ( (int64_t) zSig0 < 0 ) zExp = 1;
2582             roundIncrement = roundMask + 1;
2583             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2584                 roundMask |= roundIncrement;
2585             }
2586             zSig0 &= ~ roundMask;
2587             return packFloatx80( zSign, zExp, zSig0 );
2588         }
2589     }
2590     if (roundBits) {
2591         status->float_exception_flags |= float_flag_inexact;
2592     }
2593     zSig0 += roundIncrement;
2594     if ( zSig0 < roundIncrement ) {
2595         ++zExp;
2596         zSig0 = LIT64( 0x8000000000000000 );
2597     }
2598     roundIncrement = roundMask + 1;
2599     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2600         roundMask |= roundIncrement;
2601     }
2602     zSig0 &= ~ roundMask;
2603     if ( zSig0 == 0 ) zExp = 0;
2604     return packFloatx80( zSign, zExp, zSig0 );
2605  precision80:
2606     switch (roundingMode) {
2607     case float_round_nearest_even:
2608     case float_round_ties_away:
2609         increment = ((int64_t)zSig1 < 0);
2610         break;
2611     case float_round_to_zero:
2612         increment = 0;
2613         break;
2614     case float_round_up:
2615         increment = !zSign && zSig1;
2616         break;
2617     case float_round_down:
2618         increment = zSign && zSig1;
2619         break;
2620     default:
2621         abort();
2622     }
2623     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2624         if (    ( 0x7FFE < zExp )
2625              || (    ( zExp == 0x7FFE )
2626                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2627                   && increment
2628                 )
2629            ) {
2630             roundMask = 0;
2631  overflow:
2632             float_raise(float_flag_overflow | float_flag_inexact, status);
2633             if (    ( roundingMode == float_round_to_zero )
2634                  || ( zSign && ( roundingMode == float_round_up ) )
2635                  || ( ! zSign && ( roundingMode == float_round_down ) )
2636                ) {
2637                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2638             }
2639             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2640         }
2641         if ( zExp <= 0 ) {
2642             isTiny =
2643                    (status->float_detect_tininess
2644                     == float_tininess_before_rounding)
2645                 || ( zExp < 0 )
2646                 || ! increment
2647                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2648             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2649             zExp = 0;
2650             if (isTiny && zSig1) {
2651                 float_raise(float_flag_underflow, status);
2652             }
2653             if (zSig1) {
2654                 status->float_exception_flags |= float_flag_inexact;
2655             }
2656             switch (roundingMode) {
2657             case float_round_nearest_even:
2658             case float_round_ties_away:
2659                 increment = ((int64_t)zSig1 < 0);
2660                 break;
2661             case float_round_to_zero:
2662                 increment = 0;
2663                 break;
2664             case float_round_up:
2665                 increment = !zSign && zSig1;
2666                 break;
2667             case float_round_down:
2668                 increment = zSign && zSig1;
2669                 break;
2670             default:
2671                 abort();
2672             }
2673             if ( increment ) {
2674                 ++zSig0;
2675                 zSig0 &=
2676                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2677                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2678             }
2679             return packFloatx80( zSign, zExp, zSig0 );
2680         }
2681     }
2682     if (zSig1) {
2683         status->float_exception_flags |= float_flag_inexact;
2684     }
2685     if ( increment ) {
2686         ++zSig0;
2687         if ( zSig0 == 0 ) {
2688             ++zExp;
2689             zSig0 = LIT64( 0x8000000000000000 );
2690         }
2691         else {
2692             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2693         }
2694     }
2695     else {
2696         if ( zSig0 == 0 ) zExp = 0;
2697     }
2698     return packFloatx80( zSign, zExp, zSig0 );
2699 
2700 }
2701 
2702 /*----------------------------------------------------------------------------
2703 | Takes an abstract floating-point value having sign `zSign', exponent
2704 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2705 | and returns the proper extended double-precision floating-point value
2706 | corresponding to the abstract input.  This routine is just like
2707 | `roundAndPackFloatx80' except that the input significand does not have to be
2708 | normalized.
2709 *----------------------------------------------------------------------------*/
2710 
2711 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2712                                        flag zSign, int32_t zExp,
2713                                        uint64_t zSig0, uint64_t zSig1,
2714                                        float_status *status)
2715 {
2716     int8_t shiftCount;
2717 
2718     if ( zSig0 == 0 ) {
2719         zSig0 = zSig1;
2720         zSig1 = 0;
2721         zExp -= 64;
2722     }
2723     shiftCount = countLeadingZeros64( zSig0 );
2724     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2725     zExp -= shiftCount;
2726     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2727                                 zSig0, zSig1, status);
2728 
2729 }
2730 
2731 /*----------------------------------------------------------------------------
2732 | Returns the least-significant 64 fraction bits of the quadruple-precision
2733 | floating-point value `a'.
2734 *----------------------------------------------------------------------------*/
2735 
2736 static inline uint64_t extractFloat128Frac1( float128 a )
2737 {
2738 
2739     return a.low;
2740 
2741 }
2742 
2743 /*----------------------------------------------------------------------------
2744 | Returns the most-significant 48 fraction bits of the quadruple-precision
2745 | floating-point value `a'.
2746 *----------------------------------------------------------------------------*/
2747 
2748 static inline uint64_t extractFloat128Frac0( float128 a )
2749 {
2750 
2751     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2752 
2753 }
2754 
2755 /*----------------------------------------------------------------------------
2756 | Returns the exponent bits of the quadruple-precision floating-point value
2757 | `a'.
2758 *----------------------------------------------------------------------------*/
2759 
2760 static inline int32_t extractFloat128Exp( float128 a )
2761 {
2762 
2763     return ( a.high>>48 ) & 0x7FFF;
2764 
2765 }
2766 
2767 /*----------------------------------------------------------------------------
2768 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2769 *----------------------------------------------------------------------------*/
2770 
2771 static inline flag extractFloat128Sign( float128 a )
2772 {
2773 
2774     return a.high>>63;
2775 
2776 }
2777 
2778 /*----------------------------------------------------------------------------
2779 | Normalizes the subnormal quadruple-precision floating-point value
2780 | represented by the denormalized significand formed by the concatenation of
2781 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
2782 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
2783 | significand are stored at the location pointed to by `zSig0Ptr', and the
2784 | least significant 64 bits of the normalized significand are stored at the
2785 | location pointed to by `zSig1Ptr'.
2786 *----------------------------------------------------------------------------*/
2787 
2788 static void
2789  normalizeFloat128Subnormal(
2790      uint64_t aSig0,
2791      uint64_t aSig1,
2792      int32_t *zExpPtr,
2793      uint64_t *zSig0Ptr,
2794      uint64_t *zSig1Ptr
2795  )
2796 {
2797     int8_t shiftCount;
2798 
2799     if ( aSig0 == 0 ) {
2800         shiftCount = countLeadingZeros64( aSig1 ) - 15;
2801         if ( shiftCount < 0 ) {
2802             *zSig0Ptr = aSig1>>( - shiftCount );
2803             *zSig1Ptr = aSig1<<( shiftCount & 63 );
2804         }
2805         else {
2806             *zSig0Ptr = aSig1<<shiftCount;
2807             *zSig1Ptr = 0;
2808         }
2809         *zExpPtr = - shiftCount - 63;
2810     }
2811     else {
2812         shiftCount = countLeadingZeros64( aSig0 ) - 15;
2813         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
2814         *zExpPtr = 1 - shiftCount;
2815     }
2816 
2817 }
2818 
2819 /*----------------------------------------------------------------------------
2820 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2821 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2822 | floating-point value, returning the result.  After being shifted into the
2823 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2824 | added together to form the most significant 32 bits of the result.  This
2825 | means that any integer portion of `zSig0' will be added into the exponent.
2826 | Since a properly normalized significand will have an integer portion equal
2827 | to 1, the `zExp' input should be 1 less than the desired result exponent
2828 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2829 | significand.
2830 *----------------------------------------------------------------------------*/
2831 
2832 static inline float128
2833  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
2834 {
2835     float128 z;
2836 
2837     z.low = zSig1;
2838     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
2839     return z;
2840 
2841 }
2842 
2843 /*----------------------------------------------------------------------------
2844 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2845 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2846 | and `zSig2', and returns the proper quadruple-precision floating-point value
2847 | corresponding to the abstract input.  Ordinarily, the abstract value is
2848 | simply rounded and packed into the quadruple-precision format, with the
2849 | inexact exception raised if the abstract input cannot be represented
2850 | exactly.  However, if the abstract value is too large, the overflow and
2851 | inexact exceptions are raised and an infinity or maximal finite value is
2852 | returned.  If the abstract value is too small, the input value is rounded to
2853 | a subnormal number, and the underflow and inexact exceptions are raised if
2854 | the abstract input cannot be represented exactly as a subnormal quadruple-
2855 | precision floating-point number.
2856 |     The input significand must be normalized or smaller.  If the input
2857 | significand is not normalized, `zExp' must be 0; in that case, the result
2858 | returned is a subnormal number, and it must not require rounding.  In the
2859 | usual case that the input significand is normalized, `zExp' must be 1 less
2860 | than the ``true'' floating-point exponent.  The handling of underflow and
2861 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2862 *----------------------------------------------------------------------------*/
2863 
2864 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
2865                                      uint64_t zSig0, uint64_t zSig1,
2866                                      uint64_t zSig2, float_status *status)
2867 {
2868     int8_t roundingMode;
2869     flag roundNearestEven, increment, isTiny;
2870 
2871     roundingMode = status->float_rounding_mode;
2872     roundNearestEven = ( roundingMode == float_round_nearest_even );
2873     switch (roundingMode) {
2874     case float_round_nearest_even:
2875     case float_round_ties_away:
2876         increment = ((int64_t)zSig2 < 0);
2877         break;
2878     case float_round_to_zero:
2879         increment = 0;
2880         break;
2881     case float_round_up:
2882         increment = !zSign && zSig2;
2883         break;
2884     case float_round_down:
2885         increment = zSign && zSig2;
2886         break;
2887     case float_round_to_odd:
2888         increment = !(zSig1 & 0x1) && zSig2;
2889         break;
2890     default:
2891         abort();
2892     }
2893     if ( 0x7FFD <= (uint32_t) zExp ) {
2894         if (    ( 0x7FFD < zExp )
2895              || (    ( zExp == 0x7FFD )
2896                   && eq128(
2897                          LIT64( 0x0001FFFFFFFFFFFF ),
2898                          LIT64( 0xFFFFFFFFFFFFFFFF ),
2899                          zSig0,
2900                          zSig1
2901                      )
2902                   && increment
2903                 )
2904            ) {
2905             float_raise(float_flag_overflow | float_flag_inexact, status);
2906             if (    ( roundingMode == float_round_to_zero )
2907                  || ( zSign && ( roundingMode == float_round_up ) )
2908                  || ( ! zSign && ( roundingMode == float_round_down ) )
2909                  || (roundingMode == float_round_to_odd)
2910                ) {
2911                 return
2912                     packFloat128(
2913                         zSign,
2914                         0x7FFE,
2915                         LIT64( 0x0000FFFFFFFFFFFF ),
2916                         LIT64( 0xFFFFFFFFFFFFFFFF )
2917                     );
2918             }
2919             return packFloat128( zSign, 0x7FFF, 0, 0 );
2920         }
2921         if ( zExp < 0 ) {
2922             if (status->flush_to_zero) {
2923                 float_raise(float_flag_output_denormal, status);
2924                 return packFloat128(zSign, 0, 0, 0);
2925             }
2926             isTiny =
2927                    (status->float_detect_tininess
2928                     == float_tininess_before_rounding)
2929                 || ( zExp < -1 )
2930                 || ! increment
2931                 || lt128(
2932                        zSig0,
2933                        zSig1,
2934                        LIT64( 0x0001FFFFFFFFFFFF ),
2935                        LIT64( 0xFFFFFFFFFFFFFFFF )
2936                    );
2937             shift128ExtraRightJamming(
2938                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
2939             zExp = 0;
2940             if (isTiny && zSig2) {
2941                 float_raise(float_flag_underflow, status);
2942             }
2943             switch (roundingMode) {
2944             case float_round_nearest_even:
2945             case float_round_ties_away:
2946                 increment = ((int64_t)zSig2 < 0);
2947                 break;
2948             case float_round_to_zero:
2949                 increment = 0;
2950                 break;
2951             case float_round_up:
2952                 increment = !zSign && zSig2;
2953                 break;
2954             case float_round_down:
2955                 increment = zSign && zSig2;
2956                 break;
2957             case float_round_to_odd:
2958                 increment = !(zSig1 & 0x1) && zSig2;
2959                 break;
2960             default:
2961                 abort();
2962             }
2963         }
2964     }
2965     if (zSig2) {
2966         status->float_exception_flags |= float_flag_inexact;
2967     }
2968     if ( increment ) {
2969         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
2970         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
2971     }
2972     else {
2973         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
2974     }
2975     return packFloat128( zSign, zExp, zSig0, zSig1 );
2976 
2977 }
2978 
2979 /*----------------------------------------------------------------------------
2980 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2981 | and significand formed by the concatenation of `zSig0' and `zSig1', and
2982 | returns the proper quadruple-precision floating-point value corresponding
2983 | to the abstract input.  This routine is just like `roundAndPackFloat128'
2984 | except that the input significand has fewer bits and does not have to be
2985 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
2986 | point exponent.
2987 *----------------------------------------------------------------------------*/
2988 
2989 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
2990                                               uint64_t zSig0, uint64_t zSig1,
2991                                               float_status *status)
2992 {
2993     int8_t shiftCount;
2994     uint64_t zSig2;
2995 
2996     if ( zSig0 == 0 ) {
2997         zSig0 = zSig1;
2998         zSig1 = 0;
2999         zExp -= 64;
3000     }
3001     shiftCount = countLeadingZeros64( zSig0 ) - 15;
3002     if ( 0 <= shiftCount ) {
3003         zSig2 = 0;
3004         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3005     }
3006     else {
3007         shift128ExtraRightJamming(
3008             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3009     }
3010     zExp -= shiftCount;
3011     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3012 
3013 }
3014 
3015 
3016 /*----------------------------------------------------------------------------
3017 | Returns the result of converting the 32-bit two's complement integer `a'
3018 | to the extended double-precision floating-point format.  The conversion
3019 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3020 | Arithmetic.
3021 *----------------------------------------------------------------------------*/
3022 
3023 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3024 {
3025     flag zSign;
3026     uint32_t absA;
3027     int8_t shiftCount;
3028     uint64_t zSig;
3029 
3030     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3031     zSign = ( a < 0 );
3032     absA = zSign ? - a : a;
3033     shiftCount = countLeadingZeros32( absA ) + 32;
3034     zSig = absA;
3035     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3036 
3037 }
3038 
3039 /*----------------------------------------------------------------------------
3040 | Returns the result of converting the 32-bit two's complement integer `a' to
3041 | the quadruple-precision floating-point format.  The conversion is performed
3042 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3043 *----------------------------------------------------------------------------*/
3044 
3045 float128 int32_to_float128(int32_t a, float_status *status)
3046 {
3047     flag zSign;
3048     uint32_t absA;
3049     int8_t shiftCount;
3050     uint64_t zSig0;
3051 
3052     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3053     zSign = ( a < 0 );
3054     absA = zSign ? - a : a;
3055     shiftCount = countLeadingZeros32( absA ) + 17;
3056     zSig0 = absA;
3057     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3058 
3059 }
3060 
3061 /*----------------------------------------------------------------------------
3062 | Returns the result of converting the 64-bit two's complement integer `a'
3063 | to the extended double-precision floating-point format.  The conversion
3064 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3065 | Arithmetic.
3066 *----------------------------------------------------------------------------*/
3067 
3068 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3069 {
3070     flag zSign;
3071     uint64_t absA;
3072     int8_t shiftCount;
3073 
3074     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3075     zSign = ( a < 0 );
3076     absA = zSign ? - a : a;
3077     shiftCount = countLeadingZeros64( absA );
3078     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3079 
3080 }
3081 
3082 /*----------------------------------------------------------------------------
3083 | Returns the result of converting the 64-bit two's complement integer `a' to
3084 | the quadruple-precision floating-point format.  The conversion is performed
3085 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3086 *----------------------------------------------------------------------------*/
3087 
3088 float128 int64_to_float128(int64_t a, float_status *status)
3089 {
3090     flag zSign;
3091     uint64_t absA;
3092     int8_t shiftCount;
3093     int32_t zExp;
3094     uint64_t zSig0, zSig1;
3095 
3096     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3097     zSign = ( a < 0 );
3098     absA = zSign ? - a : a;
3099     shiftCount = countLeadingZeros64( absA ) + 49;
3100     zExp = 0x406E - shiftCount;
3101     if ( 64 <= shiftCount ) {
3102         zSig1 = 0;
3103         zSig0 = absA;
3104         shiftCount -= 64;
3105     }
3106     else {
3107         zSig1 = absA;
3108         zSig0 = 0;
3109     }
3110     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3111     return packFloat128( zSign, zExp, zSig0, zSig1 );
3112 
3113 }
3114 
3115 /*----------------------------------------------------------------------------
3116 | Returns the result of converting the 64-bit unsigned integer `a'
3117 | to the quadruple-precision floating-point format.  The conversion is performed
3118 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3119 *----------------------------------------------------------------------------*/
3120 
3121 float128 uint64_to_float128(uint64_t a, float_status *status)
3122 {
3123     if (a == 0) {
3124         return float128_zero;
3125     }
3126     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
3127 }
3128 
3129 
3130 
3131 
3132 /*----------------------------------------------------------------------------
3133 | Returns the result of converting the single-precision floating-point value
3134 | `a' to the double-precision floating-point format.  The conversion is
3135 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3136 | Arithmetic.
3137 *----------------------------------------------------------------------------*/
3138 
3139 float64 float32_to_float64(float32 a, float_status *status)
3140 {
3141     flag aSign;
3142     int aExp;
3143     uint32_t aSig;
3144     a = float32_squash_input_denormal(a, status);
3145 
3146     aSig = extractFloat32Frac( a );
3147     aExp = extractFloat32Exp( a );
3148     aSign = extractFloat32Sign( a );
3149     if ( aExp == 0xFF ) {
3150         if (aSig) {
3151             return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
3152         }
3153         return packFloat64( aSign, 0x7FF, 0 );
3154     }
3155     if ( aExp == 0 ) {
3156         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
3157         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3158         --aExp;
3159     }
3160     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
3161 
3162 }
3163 
3164 /*----------------------------------------------------------------------------
3165 | Returns the result of converting the single-precision floating-point value
3166 | `a' to the extended double-precision floating-point format.  The conversion
3167 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3168 | Arithmetic.
3169 *----------------------------------------------------------------------------*/
3170 
3171 floatx80 float32_to_floatx80(float32 a, float_status *status)
3172 {
3173     flag aSign;
3174     int aExp;
3175     uint32_t aSig;
3176 
3177     a = float32_squash_input_denormal(a, status);
3178     aSig = extractFloat32Frac( a );
3179     aExp = extractFloat32Exp( a );
3180     aSign = extractFloat32Sign( a );
3181     if ( aExp == 0xFF ) {
3182         if (aSig) {
3183             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3184         }
3185         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3186     }
3187     if ( aExp == 0 ) {
3188         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3189         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3190     }
3191     aSig |= 0x00800000;
3192     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3193 
3194 }
3195 
3196 /*----------------------------------------------------------------------------
3197 | Returns the result of converting the single-precision floating-point value
3198 | `a' to the double-precision floating-point format.  The conversion is
3199 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3200 | Arithmetic.
3201 *----------------------------------------------------------------------------*/
3202 
3203 float128 float32_to_float128(float32 a, float_status *status)
3204 {
3205     flag aSign;
3206     int aExp;
3207     uint32_t aSig;
3208 
3209     a = float32_squash_input_denormal(a, status);
3210     aSig = extractFloat32Frac( a );
3211     aExp = extractFloat32Exp( a );
3212     aSign = extractFloat32Sign( a );
3213     if ( aExp == 0xFF ) {
3214         if (aSig) {
3215             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3216         }
3217         return packFloat128( aSign, 0x7FFF, 0, 0 );
3218     }
3219     if ( aExp == 0 ) {
3220         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3221         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3222         --aExp;
3223     }
3224     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3225 
3226 }
3227 
3228 /*----------------------------------------------------------------------------
3229 | Returns the remainder of the single-precision floating-point value `a'
3230 | with respect to the corresponding value `b'.  The operation is performed
3231 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3232 *----------------------------------------------------------------------------*/
3233 
3234 float32 float32_rem(float32 a, float32 b, float_status *status)
3235 {
3236     flag aSign, zSign;
3237     int aExp, bExp, expDiff;
3238     uint32_t aSig, bSig;
3239     uint32_t q;
3240     uint64_t aSig64, bSig64, q64;
3241     uint32_t alternateASig;
3242     int32_t sigMean;
3243     a = float32_squash_input_denormal(a, status);
3244     b = float32_squash_input_denormal(b, status);
3245 
3246     aSig = extractFloat32Frac( a );
3247     aExp = extractFloat32Exp( a );
3248     aSign = extractFloat32Sign( a );
3249     bSig = extractFloat32Frac( b );
3250     bExp = extractFloat32Exp( b );
3251     if ( aExp == 0xFF ) {
3252         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3253             return propagateFloat32NaN(a, b, status);
3254         }
3255         float_raise(float_flag_invalid, status);
3256         return float32_default_nan(status);
3257     }
3258     if ( bExp == 0xFF ) {
3259         if (bSig) {
3260             return propagateFloat32NaN(a, b, status);
3261         }
3262         return a;
3263     }
3264     if ( bExp == 0 ) {
3265         if ( bSig == 0 ) {
3266             float_raise(float_flag_invalid, status);
3267             return float32_default_nan(status);
3268         }
3269         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3270     }
3271     if ( aExp == 0 ) {
3272         if ( aSig == 0 ) return a;
3273         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3274     }
3275     expDiff = aExp - bExp;
3276     aSig |= 0x00800000;
3277     bSig |= 0x00800000;
3278     if ( expDiff < 32 ) {
3279         aSig <<= 8;
3280         bSig <<= 8;
3281         if ( expDiff < 0 ) {
3282             if ( expDiff < -1 ) return a;
3283             aSig >>= 1;
3284         }
3285         q = ( bSig <= aSig );
3286         if ( q ) aSig -= bSig;
3287         if ( 0 < expDiff ) {
3288             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3289             q >>= 32 - expDiff;
3290             bSig >>= 2;
3291             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3292         }
3293         else {
3294             aSig >>= 2;
3295             bSig >>= 2;
3296         }
3297     }
3298     else {
3299         if ( bSig <= aSig ) aSig -= bSig;
3300         aSig64 = ( (uint64_t) aSig )<<40;
3301         bSig64 = ( (uint64_t) bSig )<<40;
3302         expDiff -= 64;
3303         while ( 0 < expDiff ) {
3304             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3305             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3306             aSig64 = - ( ( bSig * q64 )<<38 );
3307             expDiff -= 62;
3308         }
3309         expDiff += 64;
3310         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3311         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3312         q = q64>>( 64 - expDiff );
3313         bSig <<= 6;
3314         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3315     }
3316     do {
3317         alternateASig = aSig;
3318         ++q;
3319         aSig -= bSig;
3320     } while ( 0 <= (int32_t) aSig );
3321     sigMean = aSig + alternateASig;
3322     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3323         aSig = alternateASig;
3324     }
3325     zSign = ( (int32_t) aSig < 0 );
3326     if ( zSign ) aSig = - aSig;
3327     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3328 }
3329 
3330 
3331 
3332 /*----------------------------------------------------------------------------
3333 | Returns the binary exponential of the single-precision floating-point value
3334 | `a'. The operation is performed according to the IEC/IEEE Standard for
3335 | Binary Floating-Point Arithmetic.
3336 |
3337 | Uses the following identities:
3338 |
3339 | 1. -------------------------------------------------------------------------
3340 |      x    x*ln(2)
3341 |     2  = e
3342 |
3343 | 2. -------------------------------------------------------------------------
3344 |                      2     3     4     5           n
3345 |      x        x     x     x     x     x           x
3346 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3347 |               1!    2!    3!    4!    5!          n!
3348 *----------------------------------------------------------------------------*/
3349 
3350 static const float64 float32_exp2_coefficients[15] =
3351 {
3352     const_float64( 0x3ff0000000000000ll ), /*  1 */
3353     const_float64( 0x3fe0000000000000ll ), /*  2 */
3354     const_float64( 0x3fc5555555555555ll ), /*  3 */
3355     const_float64( 0x3fa5555555555555ll ), /*  4 */
3356     const_float64( 0x3f81111111111111ll ), /*  5 */
3357     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
3358     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
3359     const_float64( 0x3efa01a01a01a01all ), /*  8 */
3360     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
3361     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3362     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3363     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3364     const_float64( 0x3de6124613a86d09ll ), /* 13 */
3365     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3366     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3367 };
3368 
3369 float32 float32_exp2(float32 a, float_status *status)
3370 {
3371     flag aSign;
3372     int aExp;
3373     uint32_t aSig;
3374     float64 r, x, xn;
3375     int i;
3376     a = float32_squash_input_denormal(a, status);
3377 
3378     aSig = extractFloat32Frac( a );
3379     aExp = extractFloat32Exp( a );
3380     aSign = extractFloat32Sign( a );
3381 
3382     if ( aExp == 0xFF) {
3383         if (aSig) {
3384             return propagateFloat32NaN(a, float32_zero, status);
3385         }
3386         return (aSign) ? float32_zero : a;
3387     }
3388     if (aExp == 0) {
3389         if (aSig == 0) return float32_one;
3390     }
3391 
3392     float_raise(float_flag_inexact, status);
3393 
3394     /* ******************************* */
3395     /* using float64 for approximation */
3396     /* ******************************* */
3397     x = float32_to_float64(a, status);
3398     x = float64_mul(x, float64_ln2, status);
3399 
3400     xn = x;
3401     r = float64_one;
3402     for (i = 0 ; i < 15 ; i++) {
3403         float64 f;
3404 
3405         f = float64_mul(xn, float32_exp2_coefficients[i], status);
3406         r = float64_add(r, f, status);
3407 
3408         xn = float64_mul(xn, x, status);
3409     }
3410 
3411     return float64_to_float32(r, status);
3412 }
3413 
3414 /*----------------------------------------------------------------------------
3415 | Returns the binary log of the single-precision floating-point value `a'.
3416 | The operation is performed according to the IEC/IEEE Standard for Binary
3417 | Floating-Point Arithmetic.
3418 *----------------------------------------------------------------------------*/
3419 float32 float32_log2(float32 a, float_status *status)
3420 {
3421     flag aSign, zSign;
3422     int aExp;
3423     uint32_t aSig, zSig, i;
3424 
3425     a = float32_squash_input_denormal(a, status);
3426     aSig = extractFloat32Frac( a );
3427     aExp = extractFloat32Exp( a );
3428     aSign = extractFloat32Sign( a );
3429 
3430     if ( aExp == 0 ) {
3431         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3432         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3433     }
3434     if ( aSign ) {
3435         float_raise(float_flag_invalid, status);
3436         return float32_default_nan(status);
3437     }
3438     if ( aExp == 0xFF ) {
3439         if (aSig) {
3440             return propagateFloat32NaN(a, float32_zero, status);
3441         }
3442         return a;
3443     }
3444 
3445     aExp -= 0x7F;
3446     aSig |= 0x00800000;
3447     zSign = aExp < 0;
3448     zSig = aExp << 23;
3449 
3450     for (i = 1 << 22; i > 0; i >>= 1) {
3451         aSig = ( (uint64_t)aSig * aSig ) >> 23;
3452         if ( aSig & 0x01000000 ) {
3453             aSig >>= 1;
3454             zSig |= i;
3455         }
3456     }
3457 
3458     if ( zSign )
3459         zSig = -zSig;
3460 
3461     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3462 }
3463 
3464 /*----------------------------------------------------------------------------
3465 | Returns 1 if the single-precision floating-point value `a' is equal to
3466 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3467 | raised if either operand is a NaN.  Otherwise, the comparison is performed
3468 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3469 *----------------------------------------------------------------------------*/
3470 
3471 int float32_eq(float32 a, float32 b, float_status *status)
3472 {
3473     uint32_t av, bv;
3474     a = float32_squash_input_denormal(a, status);
3475     b = float32_squash_input_denormal(b, status);
3476 
3477     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3478          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3479        ) {
3480         float_raise(float_flag_invalid, status);
3481         return 0;
3482     }
3483     av = float32_val(a);
3484     bv = float32_val(b);
3485     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3486 }
3487 
3488 /*----------------------------------------------------------------------------
3489 | Returns 1 if the single-precision floating-point value `a' is less than
3490 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
3491 | exception is raised if either operand is a NaN.  The comparison is performed
3492 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3493 *----------------------------------------------------------------------------*/
3494 
3495 int float32_le(float32 a, float32 b, float_status *status)
3496 {
3497     flag aSign, bSign;
3498     uint32_t av, bv;
3499     a = float32_squash_input_denormal(a, status);
3500     b = float32_squash_input_denormal(b, status);
3501 
3502     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3503          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3504        ) {
3505         float_raise(float_flag_invalid, status);
3506         return 0;
3507     }
3508     aSign = extractFloat32Sign( a );
3509     bSign = extractFloat32Sign( b );
3510     av = float32_val(a);
3511     bv = float32_val(b);
3512     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3513     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3514 
3515 }
3516 
3517 /*----------------------------------------------------------------------------
3518 | Returns 1 if the single-precision floating-point value `a' is less than
3519 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3520 | raised if either operand is a NaN.  The comparison is performed according
3521 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3522 *----------------------------------------------------------------------------*/
3523 
3524 int float32_lt(float32 a, float32 b, float_status *status)
3525 {
3526     flag aSign, bSign;
3527     uint32_t av, bv;
3528     a = float32_squash_input_denormal(a, status);
3529     b = float32_squash_input_denormal(b, status);
3530 
3531     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3532          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3533        ) {
3534         float_raise(float_flag_invalid, status);
3535         return 0;
3536     }
3537     aSign = extractFloat32Sign( a );
3538     bSign = extractFloat32Sign( b );
3539     av = float32_val(a);
3540     bv = float32_val(b);
3541     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3542     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3543 
3544 }
3545 
3546 /*----------------------------------------------------------------------------
3547 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3548 | be compared, and 0 otherwise.  The invalid exception is raised if either
3549 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
3550 | Standard for Binary Floating-Point Arithmetic.
3551 *----------------------------------------------------------------------------*/
3552 
3553 int float32_unordered(float32 a, float32 b, float_status *status)
3554 {
3555     a = float32_squash_input_denormal(a, status);
3556     b = float32_squash_input_denormal(b, status);
3557 
3558     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3559          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3560        ) {
3561         float_raise(float_flag_invalid, status);
3562         return 1;
3563     }
3564     return 0;
3565 }
3566 
3567 /*----------------------------------------------------------------------------
3568 | Returns 1 if the single-precision floating-point value `a' is equal to
3569 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3570 | exception.  The comparison is performed according to the IEC/IEEE Standard
3571 | for Binary Floating-Point Arithmetic.
3572 *----------------------------------------------------------------------------*/
3573 
3574 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3575 {
3576     a = float32_squash_input_denormal(a, status);
3577     b = float32_squash_input_denormal(b, status);
3578 
3579     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3580          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3581        ) {
3582         if (float32_is_signaling_nan(a, status)
3583          || float32_is_signaling_nan(b, status)) {
3584             float_raise(float_flag_invalid, status);
3585         }
3586         return 0;
3587     }
3588     return ( float32_val(a) == float32_val(b) ) ||
3589             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3590 }
3591 
3592 /*----------------------------------------------------------------------------
3593 | Returns 1 if the single-precision floating-point value `a' is less than or
3594 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3595 | cause an exception.  Otherwise, the comparison is performed according to the
3596 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3597 *----------------------------------------------------------------------------*/
3598 
3599 int float32_le_quiet(float32 a, float32 b, float_status *status)
3600 {
3601     flag aSign, bSign;
3602     uint32_t av, bv;
3603     a = float32_squash_input_denormal(a, status);
3604     b = float32_squash_input_denormal(b, status);
3605 
3606     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3607          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3608        ) {
3609         if (float32_is_signaling_nan(a, status)
3610          || float32_is_signaling_nan(b, status)) {
3611             float_raise(float_flag_invalid, status);
3612         }
3613         return 0;
3614     }
3615     aSign = extractFloat32Sign( a );
3616     bSign = extractFloat32Sign( b );
3617     av = float32_val(a);
3618     bv = float32_val(b);
3619     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3620     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3621 
3622 }
3623 
3624 /*----------------------------------------------------------------------------
3625 | Returns 1 if the single-precision floating-point value `a' is less than
3626 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3627 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3628 | Standard for Binary Floating-Point Arithmetic.
3629 *----------------------------------------------------------------------------*/
3630 
3631 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3632 {
3633     flag aSign, bSign;
3634     uint32_t av, bv;
3635     a = float32_squash_input_denormal(a, status);
3636     b = float32_squash_input_denormal(b, status);
3637 
3638     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3639          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3640        ) {
3641         if (float32_is_signaling_nan(a, status)
3642          || float32_is_signaling_nan(b, status)) {
3643             float_raise(float_flag_invalid, status);
3644         }
3645         return 0;
3646     }
3647     aSign = extractFloat32Sign( a );
3648     bSign = extractFloat32Sign( b );
3649     av = float32_val(a);
3650     bv = float32_val(b);
3651     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3652     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3653 
3654 }
3655 
3656 /*----------------------------------------------------------------------------
3657 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3658 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3659 | comparison is performed according to the IEC/IEEE Standard for Binary
3660 | Floating-Point Arithmetic.
3661 *----------------------------------------------------------------------------*/
3662 
3663 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3664 {
3665     a = float32_squash_input_denormal(a, status);
3666     b = float32_squash_input_denormal(b, status);
3667 
3668     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3669          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3670        ) {
3671         if (float32_is_signaling_nan(a, status)
3672          || float32_is_signaling_nan(b, status)) {
3673             float_raise(float_flag_invalid, status);
3674         }
3675         return 1;
3676     }
3677     return 0;
3678 }
3679 
3680 
3681 /*----------------------------------------------------------------------------
3682 | Returns the result of converting the double-precision floating-point value
3683 | `a' to the single-precision floating-point format.  The conversion is
3684 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3685 | Arithmetic.
3686 *----------------------------------------------------------------------------*/
3687 
3688 float32 float64_to_float32(float64 a, float_status *status)
3689 {
3690     flag aSign;
3691     int aExp;
3692     uint64_t aSig;
3693     uint32_t zSig;
3694     a = float64_squash_input_denormal(a, status);
3695 
3696     aSig = extractFloat64Frac( a );
3697     aExp = extractFloat64Exp( a );
3698     aSign = extractFloat64Sign( a );
3699     if ( aExp == 0x7FF ) {
3700         if (aSig) {
3701             return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3702         }
3703         return packFloat32( aSign, 0xFF, 0 );
3704     }
3705     shift64RightJamming( aSig, 22, &aSig );
3706     zSig = aSig;
3707     if ( aExp || zSig ) {
3708         zSig |= 0x40000000;
3709         aExp -= 0x381;
3710     }
3711     return roundAndPackFloat32(aSign, aExp, zSig, status);
3712 
3713 }
3714 
3715 
3716 /*----------------------------------------------------------------------------
3717 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3718 | half-precision floating-point value, returning the result.  After being
3719 | shifted into the proper positions, the three fields are simply added
3720 | together to form the result.  This means that any integer portion of `zSig'
3721 | will be added into the exponent.  Since a properly normalized significand
3722 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3723 | than the desired result exponent whenever `zSig' is a complete, normalized
3724 | significand.
3725 *----------------------------------------------------------------------------*/
3726 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3727 {
3728     return make_float16(
3729         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3730 }
3731 
3732 /*----------------------------------------------------------------------------
3733 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3734 | and significand `zSig', and returns the proper half-precision floating-
3735 | point value corresponding to the abstract input.  Ordinarily, the abstract
3736 | value is simply rounded and packed into the half-precision format, with
3737 | the inexact exception raised if the abstract input cannot be represented
3738 | exactly.  However, if the abstract value is too large, the overflow and
3739 | inexact exceptions are raised and an infinity or maximal finite value is
3740 | returned.  If the abstract value is too small, the input value is rounded to
3741 | a subnormal number, and the underflow and inexact exceptions are raised if
3742 | the abstract input cannot be represented exactly as a subnormal half-
3743 | precision floating-point number.
3744 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3745 | ARM-style "alternative representation", which omits the NaN and Inf
3746 | encodings in order to raise the maximum representable exponent by one.
3747 |     The input significand `zSig' has its binary point between bits 22
3748 | and 23, which is 13 bits to the left of the usual location.  This shifted
3749 | significand must be normalized or smaller.  If `zSig' is not normalized,
3750 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3751 | and it must not require rounding.  In the usual case that `zSig' is
3752 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3753 | Note the slightly odd position of the binary point in zSig compared with the
3754 | other roundAndPackFloat functions. This should probably be fixed if we
3755 | need to implement more float16 routines than just conversion.
3756 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3757 | Binary Floating-Point Arithmetic.
3758 *----------------------------------------------------------------------------*/
3759 
3760 static float16 roundAndPackFloat16(flag zSign, int zExp,
3761                                    uint32_t zSig, flag ieee,
3762                                    float_status *status)
3763 {
3764     int maxexp = ieee ? 29 : 30;
3765     uint32_t mask;
3766     uint32_t increment;
3767     bool rounding_bumps_exp;
3768     bool is_tiny = false;
3769 
3770     /* Calculate the mask of bits of the mantissa which are not
3771      * representable in half-precision and will be lost.
3772      */
3773     if (zExp < 1) {
3774         /* Will be denormal in halfprec */
3775         mask = 0x00ffffff;
3776         if (zExp >= -11) {
3777             mask >>= 11 + zExp;
3778         }
3779     } else {
3780         /* Normal number in halfprec */
3781         mask = 0x00001fff;
3782     }
3783 
3784     switch (status->float_rounding_mode) {
3785     case float_round_nearest_even:
3786         increment = (mask + 1) >> 1;
3787         if ((zSig & mask) == increment) {
3788             increment = zSig & (increment << 1);
3789         }
3790         break;
3791     case float_round_ties_away:
3792         increment = (mask + 1) >> 1;
3793         break;
3794     case float_round_up:
3795         increment = zSign ? 0 : mask;
3796         break;
3797     case float_round_down:
3798         increment = zSign ? mask : 0;
3799         break;
3800     default: /* round_to_zero */
3801         increment = 0;
3802         break;
3803     }
3804 
3805     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3806 
3807     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3808         if (ieee) {
3809             float_raise(float_flag_overflow | float_flag_inexact, status);
3810             return packFloat16(zSign, 0x1f, 0);
3811         } else {
3812             float_raise(float_flag_invalid, status);
3813             return packFloat16(zSign, 0x1f, 0x3ff);
3814         }
3815     }
3816 
3817     if (zExp < 0) {
3818         /* Note that flush-to-zero does not affect half-precision results */
3819         is_tiny =
3820             (status->float_detect_tininess == float_tininess_before_rounding)
3821             || (zExp < -1)
3822             || (!rounding_bumps_exp);
3823     }
3824     if (zSig & mask) {
3825         float_raise(float_flag_inexact, status);
3826         if (is_tiny) {
3827             float_raise(float_flag_underflow, status);
3828         }
3829     }
3830 
3831     zSig += increment;
3832     if (rounding_bumps_exp) {
3833         zSig >>= 1;
3834         zExp++;
3835     }
3836 
3837     if (zExp < -10) {
3838         return packFloat16(zSign, 0, 0);
3839     }
3840     if (zExp < 0) {
3841         zSig >>= -zExp;
3842         zExp = 0;
3843     }
3844     return packFloat16(zSign, zExp, zSig >> 13);
3845 }
3846 
3847 /*----------------------------------------------------------------------------
3848 | If `a' is denormal and we are in flush-to-zero mode then set the
3849 | input-denormal exception and return zero. Otherwise just return the value.
3850 *----------------------------------------------------------------------------*/
3851 float16 float16_squash_input_denormal(float16 a, float_status *status)
3852 {
3853     if (status->flush_inputs_to_zero) {
3854         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3855             float_raise(float_flag_input_denormal, status);
3856             return make_float16(float16_val(a) & 0x8000);
3857         }
3858     }
3859     return a;
3860 }
3861 
3862 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
3863                                       uint32_t *zSigPtr)
3864 {
3865     int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3866     *zSigPtr = aSig << shiftCount;
3867     *zExpPtr = 1 - shiftCount;
3868 }
3869 
3870 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3871    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3872 
3873 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3874 {
3875     flag aSign;
3876     int aExp;
3877     uint32_t aSig;
3878 
3879     aSign = extractFloat16Sign(a);
3880     aExp = extractFloat16Exp(a);
3881     aSig = extractFloat16Frac(a);
3882 
3883     if (aExp == 0x1f && ieee) {
3884         if (aSig) {
3885             return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3886         }
3887         return packFloat32(aSign, 0xff, 0);
3888     }
3889     if (aExp == 0) {
3890         if (aSig == 0) {
3891             return packFloat32(aSign, 0, 0);
3892         }
3893 
3894         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3895         aExp--;
3896     }
3897     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3898 }
3899 
3900 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3901 {
3902     flag aSign;
3903     int aExp;
3904     uint32_t aSig;
3905 
3906     a = float32_squash_input_denormal(a, status);
3907 
3908     aSig = extractFloat32Frac( a );
3909     aExp = extractFloat32Exp( a );
3910     aSign = extractFloat32Sign( a );
3911     if ( aExp == 0xFF ) {
3912         if (aSig) {
3913             /* Input is a NaN */
3914             if (!ieee) {
3915                 float_raise(float_flag_invalid, status);
3916                 return packFloat16(aSign, 0, 0);
3917             }
3918             return commonNaNToFloat16(
3919                 float32ToCommonNaN(a, status), status);
3920         }
3921         /* Infinity */
3922         if (!ieee) {
3923             float_raise(float_flag_invalid, status);
3924             return packFloat16(aSign, 0x1f, 0x3ff);
3925         }
3926         return packFloat16(aSign, 0x1f, 0);
3927     }
3928     if (aExp == 0 && aSig == 0) {
3929         return packFloat16(aSign, 0, 0);
3930     }
3931     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3932      * even if the input is denormal; however this is harmless because
3933      * the largest possible single-precision denormal is still smaller
3934      * than the smallest representable half-precision denormal, and so we
3935      * will end up ignoring aSig and returning via the "always return zero"
3936      * codepath.
3937      */
3938     aSig |= 0x00800000;
3939     aExp -= 0x71;
3940 
3941     return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3942 }
3943 
3944 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3945 {
3946     flag aSign;
3947     int aExp;
3948     uint32_t aSig;
3949 
3950     aSign = extractFloat16Sign(a);
3951     aExp = extractFloat16Exp(a);
3952     aSig = extractFloat16Frac(a);
3953 
3954     if (aExp == 0x1f && ieee) {
3955         if (aSig) {
3956             return commonNaNToFloat64(
3957                 float16ToCommonNaN(a, status), status);
3958         }
3959         return packFloat64(aSign, 0x7ff, 0);
3960     }
3961     if (aExp == 0) {
3962         if (aSig == 0) {
3963             return packFloat64(aSign, 0, 0);
3964         }
3965 
3966         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3967         aExp--;
3968     }
3969     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3970 }
3971 
3972 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3973 {
3974     flag aSign;
3975     int aExp;
3976     uint64_t aSig;
3977     uint32_t zSig;
3978 
3979     a = float64_squash_input_denormal(a, status);
3980 
3981     aSig = extractFloat64Frac(a);
3982     aExp = extractFloat64Exp(a);
3983     aSign = extractFloat64Sign(a);
3984     if (aExp == 0x7FF) {
3985         if (aSig) {
3986             /* Input is a NaN */
3987             if (!ieee) {
3988                 float_raise(float_flag_invalid, status);
3989                 return packFloat16(aSign, 0, 0);
3990             }
3991             return commonNaNToFloat16(
3992                 float64ToCommonNaN(a, status), status);
3993         }
3994         /* Infinity */
3995         if (!ieee) {
3996             float_raise(float_flag_invalid, status);
3997             return packFloat16(aSign, 0x1f, 0x3ff);
3998         }
3999         return packFloat16(aSign, 0x1f, 0);
4000     }
4001     shift64RightJamming(aSig, 29, &aSig);
4002     zSig = aSig;
4003     if (aExp == 0 && zSig == 0) {
4004         return packFloat16(aSign, 0, 0);
4005     }
4006     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4007      * even if the input is denormal; however this is harmless because
4008      * the largest possible single-precision denormal is still smaller
4009      * than the smallest representable half-precision denormal, and so we
4010      * will end up ignoring aSig and returning via the "always return zero"
4011      * codepath.
4012      */
4013     zSig |= 0x00800000;
4014     aExp -= 0x3F1;
4015 
4016     return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
4017 }
4018 
4019 /*----------------------------------------------------------------------------
4020 | Returns the result of converting the double-precision floating-point value
4021 | `a' to the extended double-precision floating-point format.  The conversion
4022 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4023 | Arithmetic.
4024 *----------------------------------------------------------------------------*/
4025 
4026 floatx80 float64_to_floatx80(float64 a, float_status *status)
4027 {
4028     flag aSign;
4029     int aExp;
4030     uint64_t aSig;
4031 
4032     a = float64_squash_input_denormal(a, status);
4033     aSig = extractFloat64Frac( a );
4034     aExp = extractFloat64Exp( a );
4035     aSign = extractFloat64Sign( a );
4036     if ( aExp == 0x7FF ) {
4037         if (aSig) {
4038             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4039         }
4040         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4041     }
4042     if ( aExp == 0 ) {
4043         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4044         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4045     }
4046     return
4047         packFloatx80(
4048             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4049 
4050 }
4051 
4052 /*----------------------------------------------------------------------------
4053 | Returns the result of converting the double-precision floating-point value
4054 | `a' to the quadruple-precision floating-point format.  The conversion is
4055 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4056 | Arithmetic.
4057 *----------------------------------------------------------------------------*/
4058 
4059 float128 float64_to_float128(float64 a, float_status *status)
4060 {
4061     flag aSign;
4062     int aExp;
4063     uint64_t aSig, zSig0, zSig1;
4064 
4065     a = float64_squash_input_denormal(a, status);
4066     aSig = extractFloat64Frac( a );
4067     aExp = extractFloat64Exp( a );
4068     aSign = extractFloat64Sign( a );
4069     if ( aExp == 0x7FF ) {
4070         if (aSig) {
4071             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4072         }
4073         return packFloat128( aSign, 0x7FFF, 0, 0 );
4074     }
4075     if ( aExp == 0 ) {
4076         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4077         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4078         --aExp;
4079     }
4080     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4081     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4082 
4083 }
4084 
4085 
4086 /*----------------------------------------------------------------------------
4087 | Returns the remainder of the double-precision floating-point value `a'
4088 | with respect to the corresponding value `b'.  The operation is performed
4089 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4090 *----------------------------------------------------------------------------*/
4091 
4092 float64 float64_rem(float64 a, float64 b, float_status *status)
4093 {
4094     flag aSign, zSign;
4095     int aExp, bExp, expDiff;
4096     uint64_t aSig, bSig;
4097     uint64_t q, alternateASig;
4098     int64_t sigMean;
4099 
4100     a = float64_squash_input_denormal(a, status);
4101     b = float64_squash_input_denormal(b, status);
4102     aSig = extractFloat64Frac( a );
4103     aExp = extractFloat64Exp( a );
4104     aSign = extractFloat64Sign( a );
4105     bSig = extractFloat64Frac( b );
4106     bExp = extractFloat64Exp( b );
4107     if ( aExp == 0x7FF ) {
4108         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4109             return propagateFloat64NaN(a, b, status);
4110         }
4111         float_raise(float_flag_invalid, status);
4112         return float64_default_nan(status);
4113     }
4114     if ( bExp == 0x7FF ) {
4115         if (bSig) {
4116             return propagateFloat64NaN(a, b, status);
4117         }
4118         return a;
4119     }
4120     if ( bExp == 0 ) {
4121         if ( bSig == 0 ) {
4122             float_raise(float_flag_invalid, status);
4123             return float64_default_nan(status);
4124         }
4125         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4126     }
4127     if ( aExp == 0 ) {
4128         if ( aSig == 0 ) return a;
4129         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4130     }
4131     expDiff = aExp - bExp;
4132     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4133     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4134     if ( expDiff < 0 ) {
4135         if ( expDiff < -1 ) return a;
4136         aSig >>= 1;
4137     }
4138     q = ( bSig <= aSig );
4139     if ( q ) aSig -= bSig;
4140     expDiff -= 64;
4141     while ( 0 < expDiff ) {
4142         q = estimateDiv128To64( aSig, 0, bSig );
4143         q = ( 2 < q ) ? q - 2 : 0;
4144         aSig = - ( ( bSig>>2 ) * q );
4145         expDiff -= 62;
4146     }
4147     expDiff += 64;
4148     if ( 0 < expDiff ) {
4149         q = estimateDiv128To64( aSig, 0, bSig );
4150         q = ( 2 < q ) ? q - 2 : 0;
4151         q >>= 64 - expDiff;
4152         bSig >>= 2;
4153         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4154     }
4155     else {
4156         aSig >>= 2;
4157         bSig >>= 2;
4158     }
4159     do {
4160         alternateASig = aSig;
4161         ++q;
4162         aSig -= bSig;
4163     } while ( 0 <= (int64_t) aSig );
4164     sigMean = aSig + alternateASig;
4165     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4166         aSig = alternateASig;
4167     }
4168     zSign = ( (int64_t) aSig < 0 );
4169     if ( zSign ) aSig = - aSig;
4170     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4171 
4172 }
4173 
4174 /*----------------------------------------------------------------------------
4175 | Returns the binary log of the double-precision floating-point value `a'.
4176 | The operation is performed according to the IEC/IEEE Standard for Binary
4177 | Floating-Point Arithmetic.
4178 *----------------------------------------------------------------------------*/
4179 float64 float64_log2(float64 a, float_status *status)
4180 {
4181     flag aSign, zSign;
4182     int aExp;
4183     uint64_t aSig, aSig0, aSig1, zSig, i;
4184     a = float64_squash_input_denormal(a, status);
4185 
4186     aSig = extractFloat64Frac( a );
4187     aExp = extractFloat64Exp( a );
4188     aSign = extractFloat64Sign( a );
4189 
4190     if ( aExp == 0 ) {
4191         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4192         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4193     }
4194     if ( aSign ) {
4195         float_raise(float_flag_invalid, status);
4196         return float64_default_nan(status);
4197     }
4198     if ( aExp == 0x7FF ) {
4199         if (aSig) {
4200             return propagateFloat64NaN(a, float64_zero, status);
4201         }
4202         return a;
4203     }
4204 
4205     aExp -= 0x3FF;
4206     aSig |= LIT64( 0x0010000000000000 );
4207     zSign = aExp < 0;
4208     zSig = (uint64_t)aExp << 52;
4209     for (i = 1LL << 51; i > 0; i >>= 1) {
4210         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4211         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4212         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4213             aSig >>= 1;
4214             zSig |= i;
4215         }
4216     }
4217 
4218     if ( zSign )
4219         zSig = -zSig;
4220     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4221 }
4222 
4223 /*----------------------------------------------------------------------------
4224 | Returns 1 if the double-precision floating-point value `a' is equal to the
4225 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4226 | if either operand is a NaN.  Otherwise, the comparison is performed
4227 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4228 *----------------------------------------------------------------------------*/
4229 
4230 int float64_eq(float64 a, float64 b, float_status *status)
4231 {
4232     uint64_t av, bv;
4233     a = float64_squash_input_denormal(a, status);
4234     b = float64_squash_input_denormal(b, status);
4235 
4236     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4237          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4238        ) {
4239         float_raise(float_flag_invalid, status);
4240         return 0;
4241     }
4242     av = float64_val(a);
4243     bv = float64_val(b);
4244     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4245 
4246 }
4247 
4248 /*----------------------------------------------------------------------------
4249 | Returns 1 if the double-precision floating-point value `a' is less than or
4250 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4251 | exception is raised if either operand is a NaN.  The comparison is performed
4252 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4253 *----------------------------------------------------------------------------*/
4254 
4255 int float64_le(float64 a, float64 b, float_status *status)
4256 {
4257     flag aSign, bSign;
4258     uint64_t av, bv;
4259     a = float64_squash_input_denormal(a, status);
4260     b = float64_squash_input_denormal(b, status);
4261 
4262     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4263          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4264        ) {
4265         float_raise(float_flag_invalid, status);
4266         return 0;
4267     }
4268     aSign = extractFloat64Sign( a );
4269     bSign = extractFloat64Sign( b );
4270     av = float64_val(a);
4271     bv = float64_val(b);
4272     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4273     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4274 
4275 }
4276 
4277 /*----------------------------------------------------------------------------
4278 | Returns 1 if the double-precision floating-point value `a' is less than
4279 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4280 | raised if either operand is a NaN.  The comparison is performed according
4281 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4282 *----------------------------------------------------------------------------*/
4283 
4284 int float64_lt(float64 a, float64 b, float_status *status)
4285 {
4286     flag aSign, bSign;
4287     uint64_t av, bv;
4288 
4289     a = float64_squash_input_denormal(a, status);
4290     b = float64_squash_input_denormal(b, status);
4291     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4292          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4293        ) {
4294         float_raise(float_flag_invalid, status);
4295         return 0;
4296     }
4297     aSign = extractFloat64Sign( a );
4298     bSign = extractFloat64Sign( b );
4299     av = float64_val(a);
4300     bv = float64_val(b);
4301     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4302     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4303 
4304 }
4305 
4306 /*----------------------------------------------------------------------------
4307 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4308 | be compared, and 0 otherwise.  The invalid exception is raised if either
4309 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4310 | Standard for Binary Floating-Point Arithmetic.
4311 *----------------------------------------------------------------------------*/
4312 
4313 int float64_unordered(float64 a, float64 b, float_status *status)
4314 {
4315     a = float64_squash_input_denormal(a, status);
4316     b = float64_squash_input_denormal(b, status);
4317 
4318     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4319          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4320        ) {
4321         float_raise(float_flag_invalid, status);
4322         return 1;
4323     }
4324     return 0;
4325 }
4326 
4327 /*----------------------------------------------------------------------------
4328 | Returns 1 if the double-precision floating-point value `a' is equal to the
4329 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4330 | exception.The comparison is performed according to the IEC/IEEE Standard
4331 | for Binary Floating-Point Arithmetic.
4332 *----------------------------------------------------------------------------*/
4333 
4334 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4335 {
4336     uint64_t av, bv;
4337     a = float64_squash_input_denormal(a, status);
4338     b = float64_squash_input_denormal(b, status);
4339 
4340     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4341          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4342        ) {
4343         if (float64_is_signaling_nan(a, status)
4344          || float64_is_signaling_nan(b, status)) {
4345             float_raise(float_flag_invalid, status);
4346         }
4347         return 0;
4348     }
4349     av = float64_val(a);
4350     bv = float64_val(b);
4351     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4352 
4353 }
4354 
4355 /*----------------------------------------------------------------------------
4356 | Returns 1 if the double-precision floating-point value `a' is less than or
4357 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4358 | cause an exception.  Otherwise, the comparison is performed according to the
4359 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4360 *----------------------------------------------------------------------------*/
4361 
4362 int float64_le_quiet(float64 a, float64 b, float_status *status)
4363 {
4364     flag aSign, bSign;
4365     uint64_t av, bv;
4366     a = float64_squash_input_denormal(a, status);
4367     b = float64_squash_input_denormal(b, status);
4368 
4369     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4370          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4371        ) {
4372         if (float64_is_signaling_nan(a, status)
4373          || float64_is_signaling_nan(b, status)) {
4374             float_raise(float_flag_invalid, status);
4375         }
4376         return 0;
4377     }
4378     aSign = extractFloat64Sign( a );
4379     bSign = extractFloat64Sign( b );
4380     av = float64_val(a);
4381     bv = float64_val(b);
4382     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4383     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4384 
4385 }
4386 
4387 /*----------------------------------------------------------------------------
4388 | Returns 1 if the double-precision floating-point value `a' is less than
4389 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4390 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4391 | Standard for Binary Floating-Point Arithmetic.
4392 *----------------------------------------------------------------------------*/
4393 
4394 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4395 {
4396     flag aSign, bSign;
4397     uint64_t av, bv;
4398     a = float64_squash_input_denormal(a, status);
4399     b = float64_squash_input_denormal(b, status);
4400 
4401     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4402          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4403        ) {
4404         if (float64_is_signaling_nan(a, status)
4405          || float64_is_signaling_nan(b, status)) {
4406             float_raise(float_flag_invalid, status);
4407         }
4408         return 0;
4409     }
4410     aSign = extractFloat64Sign( a );
4411     bSign = extractFloat64Sign( b );
4412     av = float64_val(a);
4413     bv = float64_val(b);
4414     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4415     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4416 
4417 }
4418 
4419 /*----------------------------------------------------------------------------
4420 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4421 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4422 | comparison is performed according to the IEC/IEEE Standard for Binary
4423 | Floating-Point Arithmetic.
4424 *----------------------------------------------------------------------------*/
4425 
4426 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4427 {
4428     a = float64_squash_input_denormal(a, status);
4429     b = float64_squash_input_denormal(b, status);
4430 
4431     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4432          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4433        ) {
4434         if (float64_is_signaling_nan(a, status)
4435          || float64_is_signaling_nan(b, status)) {
4436             float_raise(float_flag_invalid, status);
4437         }
4438         return 1;
4439     }
4440     return 0;
4441 }
4442 
4443 /*----------------------------------------------------------------------------
4444 | Returns the result of converting the extended double-precision floating-
4445 | point value `a' to the 32-bit two's complement integer format.  The
4446 | conversion is performed according to the IEC/IEEE Standard for Binary
4447 | Floating-Point Arithmetic---which means in particular that the conversion
4448 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4449 | largest positive integer is returned.  Otherwise, if the conversion
4450 | overflows, the largest integer with the same sign as `a' is returned.
4451 *----------------------------------------------------------------------------*/
4452 
4453 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4454 {
4455     flag aSign;
4456     int32_t aExp, shiftCount;
4457     uint64_t aSig;
4458 
4459     if (floatx80_invalid_encoding(a)) {
4460         float_raise(float_flag_invalid, status);
4461         return 1 << 31;
4462     }
4463     aSig = extractFloatx80Frac( a );
4464     aExp = extractFloatx80Exp( a );
4465     aSign = extractFloatx80Sign( a );
4466     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4467     shiftCount = 0x4037 - aExp;
4468     if ( shiftCount <= 0 ) shiftCount = 1;
4469     shift64RightJamming( aSig, shiftCount, &aSig );
4470     return roundAndPackInt32(aSign, aSig, status);
4471 
4472 }
4473 
4474 /*----------------------------------------------------------------------------
4475 | Returns the result of converting the extended double-precision floating-
4476 | point value `a' to the 32-bit two's complement integer format.  The
4477 | conversion is performed according to the IEC/IEEE Standard for Binary
4478 | Floating-Point Arithmetic, except that the conversion is always rounded
4479 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4480 | Otherwise, if the conversion overflows, the largest integer with the same
4481 | sign as `a' is returned.
4482 *----------------------------------------------------------------------------*/
4483 
4484 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4485 {
4486     flag aSign;
4487     int32_t aExp, shiftCount;
4488     uint64_t aSig, savedASig;
4489     int32_t z;
4490 
4491     if (floatx80_invalid_encoding(a)) {
4492         float_raise(float_flag_invalid, status);
4493         return 1 << 31;
4494     }
4495     aSig = extractFloatx80Frac( a );
4496     aExp = extractFloatx80Exp( a );
4497     aSign = extractFloatx80Sign( a );
4498     if ( 0x401E < aExp ) {
4499         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4500         goto invalid;
4501     }
4502     else if ( aExp < 0x3FFF ) {
4503         if (aExp || aSig) {
4504             status->float_exception_flags |= float_flag_inexact;
4505         }
4506         return 0;
4507     }
4508     shiftCount = 0x403E - aExp;
4509     savedASig = aSig;
4510     aSig >>= shiftCount;
4511     z = aSig;
4512     if ( aSign ) z = - z;
4513     if ( ( z < 0 ) ^ aSign ) {
4514  invalid:
4515         float_raise(float_flag_invalid, status);
4516         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4517     }
4518     if ( ( aSig<<shiftCount ) != savedASig ) {
4519         status->float_exception_flags |= float_flag_inexact;
4520     }
4521     return z;
4522 
4523 }
4524 
4525 /*----------------------------------------------------------------------------
4526 | Returns the result of converting the extended double-precision floating-
4527 | point value `a' to the 64-bit two's complement integer format.  The
4528 | conversion is performed according to the IEC/IEEE Standard for Binary
4529 | Floating-Point Arithmetic---which means in particular that the conversion
4530 | is rounded according to the current rounding mode.  If `a' is a NaN,
4531 | the largest positive integer is returned.  Otherwise, if the conversion
4532 | overflows, the largest integer with the same sign as `a' is returned.
4533 *----------------------------------------------------------------------------*/
4534 
4535 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4536 {
4537     flag aSign;
4538     int32_t aExp, shiftCount;
4539     uint64_t aSig, aSigExtra;
4540 
4541     if (floatx80_invalid_encoding(a)) {
4542         float_raise(float_flag_invalid, status);
4543         return 1ULL << 63;
4544     }
4545     aSig = extractFloatx80Frac( a );
4546     aExp = extractFloatx80Exp( a );
4547     aSign = extractFloatx80Sign( a );
4548     shiftCount = 0x403E - aExp;
4549     if ( shiftCount <= 0 ) {
4550         if ( shiftCount ) {
4551             float_raise(float_flag_invalid, status);
4552             if (    ! aSign
4553                  || (    ( aExp == 0x7FFF )
4554                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
4555                ) {
4556                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4557             }
4558             return (int64_t) LIT64( 0x8000000000000000 );
4559         }
4560         aSigExtra = 0;
4561     }
4562     else {
4563         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4564     }
4565     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4566 
4567 }
4568 
4569 /*----------------------------------------------------------------------------
4570 | Returns the result of converting the extended double-precision floating-
4571 | point value `a' to the 64-bit two's complement integer format.  The
4572 | conversion is performed according to the IEC/IEEE Standard for Binary
4573 | Floating-Point Arithmetic, except that the conversion is always rounded
4574 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4575 | Otherwise, if the conversion overflows, the largest integer with the same
4576 | sign as `a' is returned.
4577 *----------------------------------------------------------------------------*/
4578 
4579 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4580 {
4581     flag aSign;
4582     int32_t aExp, shiftCount;
4583     uint64_t aSig;
4584     int64_t z;
4585 
4586     if (floatx80_invalid_encoding(a)) {
4587         float_raise(float_flag_invalid, status);
4588         return 1ULL << 63;
4589     }
4590     aSig = extractFloatx80Frac( a );
4591     aExp = extractFloatx80Exp( a );
4592     aSign = extractFloatx80Sign( a );
4593     shiftCount = aExp - 0x403E;
4594     if ( 0 <= shiftCount ) {
4595         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4596         if ( ( a.high != 0xC03E ) || aSig ) {
4597             float_raise(float_flag_invalid, status);
4598             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4599                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4600             }
4601         }
4602         return (int64_t) LIT64( 0x8000000000000000 );
4603     }
4604     else if ( aExp < 0x3FFF ) {
4605         if (aExp | aSig) {
4606             status->float_exception_flags |= float_flag_inexact;
4607         }
4608         return 0;
4609     }
4610     z = aSig>>( - shiftCount );
4611     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4612         status->float_exception_flags |= float_flag_inexact;
4613     }
4614     if ( aSign ) z = - z;
4615     return z;
4616 
4617 }
4618 
4619 /*----------------------------------------------------------------------------
4620 | Returns the result of converting the extended double-precision floating-
4621 | point value `a' to the single-precision floating-point format.  The
4622 | conversion is performed according to the IEC/IEEE Standard for Binary
4623 | Floating-Point Arithmetic.
4624 *----------------------------------------------------------------------------*/
4625 
4626 float32 floatx80_to_float32(floatx80 a, float_status *status)
4627 {
4628     flag aSign;
4629     int32_t aExp;
4630     uint64_t aSig;
4631 
4632     if (floatx80_invalid_encoding(a)) {
4633         float_raise(float_flag_invalid, status);
4634         return float32_default_nan(status);
4635     }
4636     aSig = extractFloatx80Frac( a );
4637     aExp = extractFloatx80Exp( a );
4638     aSign = extractFloatx80Sign( a );
4639     if ( aExp == 0x7FFF ) {
4640         if ( (uint64_t) ( aSig<<1 ) ) {
4641             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4642         }
4643         return packFloat32( aSign, 0xFF, 0 );
4644     }
4645     shift64RightJamming( aSig, 33, &aSig );
4646     if ( aExp || aSig ) aExp -= 0x3F81;
4647     return roundAndPackFloat32(aSign, aExp, aSig, status);
4648 
4649 }
4650 
4651 /*----------------------------------------------------------------------------
4652 | Returns the result of converting the extended double-precision floating-
4653 | point value `a' to the double-precision floating-point format.  The
4654 | conversion is performed according to the IEC/IEEE Standard for Binary
4655 | Floating-Point Arithmetic.
4656 *----------------------------------------------------------------------------*/
4657 
4658 float64 floatx80_to_float64(floatx80 a, float_status *status)
4659 {
4660     flag aSign;
4661     int32_t aExp;
4662     uint64_t aSig, zSig;
4663 
4664     if (floatx80_invalid_encoding(a)) {
4665         float_raise(float_flag_invalid, status);
4666         return float64_default_nan(status);
4667     }
4668     aSig = extractFloatx80Frac( a );
4669     aExp = extractFloatx80Exp( a );
4670     aSign = extractFloatx80Sign( a );
4671     if ( aExp == 0x7FFF ) {
4672         if ( (uint64_t) ( aSig<<1 ) ) {
4673             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4674         }
4675         return packFloat64( aSign, 0x7FF, 0 );
4676     }
4677     shift64RightJamming( aSig, 1, &zSig );
4678     if ( aExp || aSig ) aExp -= 0x3C01;
4679     return roundAndPackFloat64(aSign, aExp, zSig, status);
4680 
4681 }
4682 
4683 /*----------------------------------------------------------------------------
4684 | Returns the result of converting the extended double-precision floating-
4685 | point value `a' to the quadruple-precision floating-point format.  The
4686 | conversion is performed according to the IEC/IEEE Standard for Binary
4687 | Floating-Point Arithmetic.
4688 *----------------------------------------------------------------------------*/
4689 
4690 float128 floatx80_to_float128(floatx80 a, float_status *status)
4691 {
4692     flag aSign;
4693     int aExp;
4694     uint64_t aSig, zSig0, zSig1;
4695 
4696     if (floatx80_invalid_encoding(a)) {
4697         float_raise(float_flag_invalid, status);
4698         return float128_default_nan(status);
4699     }
4700     aSig = extractFloatx80Frac( a );
4701     aExp = extractFloatx80Exp( a );
4702     aSign = extractFloatx80Sign( a );
4703     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4704         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4705     }
4706     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4707     return packFloat128( aSign, aExp, zSig0, zSig1 );
4708 
4709 }
4710 
4711 /*----------------------------------------------------------------------------
4712 | Rounds the extended double-precision floating-point value `a'
4713 | to the precision provided by floatx80_rounding_precision and returns the
4714 | result as an extended double-precision floating-point value.
4715 | The operation is performed according to the IEC/IEEE Standard for Binary
4716 | Floating-Point Arithmetic.
4717 *----------------------------------------------------------------------------*/
4718 
4719 floatx80 floatx80_round(floatx80 a, float_status *status)
4720 {
4721     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4722                                 extractFloatx80Sign(a),
4723                                 extractFloatx80Exp(a),
4724                                 extractFloatx80Frac(a), 0, status);
4725 }
4726 
4727 /*----------------------------------------------------------------------------
4728 | Rounds the extended double-precision floating-point value `a' to an integer,
4729 | and returns the result as an extended quadruple-precision floating-point
4730 | value.  The operation is performed according to the IEC/IEEE Standard for
4731 | Binary Floating-Point Arithmetic.
4732 *----------------------------------------------------------------------------*/
4733 
4734 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4735 {
4736     flag aSign;
4737     int32_t aExp;
4738     uint64_t lastBitMask, roundBitsMask;
4739     floatx80 z;
4740 
4741     if (floatx80_invalid_encoding(a)) {
4742         float_raise(float_flag_invalid, status);
4743         return floatx80_default_nan(status);
4744     }
4745     aExp = extractFloatx80Exp( a );
4746     if ( 0x403E <= aExp ) {
4747         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4748             return propagateFloatx80NaN(a, a, status);
4749         }
4750         return a;
4751     }
4752     if ( aExp < 0x3FFF ) {
4753         if (    ( aExp == 0 )
4754              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4755             return a;
4756         }
4757         status->float_exception_flags |= float_flag_inexact;
4758         aSign = extractFloatx80Sign( a );
4759         switch (status->float_rounding_mode) {
4760          case float_round_nearest_even:
4761             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4762                ) {
4763                 return
4764                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4765             }
4766             break;
4767         case float_round_ties_away:
4768             if (aExp == 0x3FFE) {
4769                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4770             }
4771             break;
4772          case float_round_down:
4773             return
4774                   aSign ?
4775                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4776                 : packFloatx80( 0, 0, 0 );
4777          case float_round_up:
4778             return
4779                   aSign ? packFloatx80( 1, 0, 0 )
4780                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4781         }
4782         return packFloatx80( aSign, 0, 0 );
4783     }
4784     lastBitMask = 1;
4785     lastBitMask <<= 0x403E - aExp;
4786     roundBitsMask = lastBitMask - 1;
4787     z = a;
4788     switch (status->float_rounding_mode) {
4789     case float_round_nearest_even:
4790         z.low += lastBitMask>>1;
4791         if ((z.low & roundBitsMask) == 0) {
4792             z.low &= ~lastBitMask;
4793         }
4794         break;
4795     case float_round_ties_away:
4796         z.low += lastBitMask >> 1;
4797         break;
4798     case float_round_to_zero:
4799         break;
4800     case float_round_up:
4801         if (!extractFloatx80Sign(z)) {
4802             z.low += roundBitsMask;
4803         }
4804         break;
4805     case float_round_down:
4806         if (extractFloatx80Sign(z)) {
4807             z.low += roundBitsMask;
4808         }
4809         break;
4810     default:
4811         abort();
4812     }
4813     z.low &= ~ roundBitsMask;
4814     if ( z.low == 0 ) {
4815         ++z.high;
4816         z.low = LIT64( 0x8000000000000000 );
4817     }
4818     if (z.low != a.low) {
4819         status->float_exception_flags |= float_flag_inexact;
4820     }
4821     return z;
4822 
4823 }
4824 
4825 /*----------------------------------------------------------------------------
4826 | Returns the result of adding the absolute values of the extended double-
4827 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4828 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4829 | The addition is performed according to the IEC/IEEE Standard for Binary
4830 | Floating-Point Arithmetic.
4831 *----------------------------------------------------------------------------*/
4832 
4833 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4834                                 float_status *status)
4835 {
4836     int32_t aExp, bExp, zExp;
4837     uint64_t aSig, bSig, zSig0, zSig1;
4838     int32_t expDiff;
4839 
4840     aSig = extractFloatx80Frac( a );
4841     aExp = extractFloatx80Exp( a );
4842     bSig = extractFloatx80Frac( b );
4843     bExp = extractFloatx80Exp( b );
4844     expDiff = aExp - bExp;
4845     if ( 0 < expDiff ) {
4846         if ( aExp == 0x7FFF ) {
4847             if ((uint64_t)(aSig << 1)) {
4848                 return propagateFloatx80NaN(a, b, status);
4849             }
4850             return a;
4851         }
4852         if ( bExp == 0 ) --expDiff;
4853         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4854         zExp = aExp;
4855     }
4856     else if ( expDiff < 0 ) {
4857         if ( bExp == 0x7FFF ) {
4858             if ((uint64_t)(bSig << 1)) {
4859                 return propagateFloatx80NaN(a, b, status);
4860             }
4861             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4862         }
4863         if ( aExp == 0 ) ++expDiff;
4864         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4865         zExp = bExp;
4866     }
4867     else {
4868         if ( aExp == 0x7FFF ) {
4869             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4870                 return propagateFloatx80NaN(a, b, status);
4871             }
4872             return a;
4873         }
4874         zSig1 = 0;
4875         zSig0 = aSig + bSig;
4876         if ( aExp == 0 ) {
4877             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4878             goto roundAndPack;
4879         }
4880         zExp = aExp;
4881         goto shiftRight1;
4882     }
4883     zSig0 = aSig + bSig;
4884     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4885  shiftRight1:
4886     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4887     zSig0 |= LIT64( 0x8000000000000000 );
4888     ++zExp;
4889  roundAndPack:
4890     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4891                                 zSign, zExp, zSig0, zSig1, status);
4892 }
4893 
4894 /*----------------------------------------------------------------------------
4895 | Returns the result of subtracting the absolute values of the extended
4896 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4897 | difference is negated before being returned.  `zSign' is ignored if the
4898 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4899 | Standard for Binary Floating-Point Arithmetic.
4900 *----------------------------------------------------------------------------*/
4901 
4902 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4903                                 float_status *status)
4904 {
4905     int32_t aExp, bExp, zExp;
4906     uint64_t aSig, bSig, zSig0, zSig1;
4907     int32_t expDiff;
4908 
4909     aSig = extractFloatx80Frac( a );
4910     aExp = extractFloatx80Exp( a );
4911     bSig = extractFloatx80Frac( b );
4912     bExp = extractFloatx80Exp( b );
4913     expDiff = aExp - bExp;
4914     if ( 0 < expDiff ) goto aExpBigger;
4915     if ( expDiff < 0 ) goto bExpBigger;
4916     if ( aExp == 0x7FFF ) {
4917         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4918             return propagateFloatx80NaN(a, b, status);
4919         }
4920         float_raise(float_flag_invalid, status);
4921         return floatx80_default_nan(status);
4922     }
4923     if ( aExp == 0 ) {
4924         aExp = 1;
4925         bExp = 1;
4926     }
4927     zSig1 = 0;
4928     if ( bSig < aSig ) goto aBigger;
4929     if ( aSig < bSig ) goto bBigger;
4930     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
4931  bExpBigger:
4932     if ( bExp == 0x7FFF ) {
4933         if ((uint64_t)(bSig << 1)) {
4934             return propagateFloatx80NaN(a, b, status);
4935         }
4936         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4937     }
4938     if ( aExp == 0 ) ++expDiff;
4939     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4940  bBigger:
4941     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4942     zExp = bExp;
4943     zSign ^= 1;
4944     goto normalizeRoundAndPack;
4945  aExpBigger:
4946     if ( aExp == 0x7FFF ) {
4947         if ((uint64_t)(aSig << 1)) {
4948             return propagateFloatx80NaN(a, b, status);
4949         }
4950         return a;
4951     }
4952     if ( bExp == 0 ) --expDiff;
4953     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4954  aBigger:
4955     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4956     zExp = aExp;
4957  normalizeRoundAndPack:
4958     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
4959                                          zSign, zExp, zSig0, zSig1, status);
4960 }
4961 
4962 /*----------------------------------------------------------------------------
4963 | Returns the result of adding the extended double-precision floating-point
4964 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4965 | Standard for Binary Floating-Point Arithmetic.
4966 *----------------------------------------------------------------------------*/
4967 
4968 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
4969 {
4970     flag aSign, bSign;
4971 
4972     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4973         float_raise(float_flag_invalid, status);
4974         return floatx80_default_nan(status);
4975     }
4976     aSign = extractFloatx80Sign( a );
4977     bSign = extractFloatx80Sign( b );
4978     if ( aSign == bSign ) {
4979         return addFloatx80Sigs(a, b, aSign, status);
4980     }
4981     else {
4982         return subFloatx80Sigs(a, b, aSign, status);
4983     }
4984 
4985 }
4986 
4987 /*----------------------------------------------------------------------------
4988 | Returns the result of subtracting the extended double-precision floating-
4989 | point values `a' and `b'.  The operation is performed according to the
4990 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4991 *----------------------------------------------------------------------------*/
4992 
4993 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
4994 {
4995     flag aSign, bSign;
4996 
4997     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4998         float_raise(float_flag_invalid, status);
4999         return floatx80_default_nan(status);
5000     }
5001     aSign = extractFloatx80Sign( a );
5002     bSign = extractFloatx80Sign( b );
5003     if ( aSign == bSign ) {
5004         return subFloatx80Sigs(a, b, aSign, status);
5005     }
5006     else {
5007         return addFloatx80Sigs(a, b, aSign, status);
5008     }
5009 
5010 }
5011 
5012 /*----------------------------------------------------------------------------
5013 | Returns the result of multiplying the extended double-precision floating-
5014 | point values `a' and `b'.  The operation is performed according to the
5015 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5016 *----------------------------------------------------------------------------*/
5017 
5018 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5019 {
5020     flag aSign, bSign, zSign;
5021     int32_t aExp, bExp, zExp;
5022     uint64_t aSig, bSig, zSig0, zSig1;
5023 
5024     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5025         float_raise(float_flag_invalid, status);
5026         return floatx80_default_nan(status);
5027     }
5028     aSig = extractFloatx80Frac( a );
5029     aExp = extractFloatx80Exp( a );
5030     aSign = extractFloatx80Sign( a );
5031     bSig = extractFloatx80Frac( b );
5032     bExp = extractFloatx80Exp( b );
5033     bSign = extractFloatx80Sign( b );
5034     zSign = aSign ^ bSign;
5035     if ( aExp == 0x7FFF ) {
5036         if (    (uint64_t) ( aSig<<1 )
5037              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5038             return propagateFloatx80NaN(a, b, status);
5039         }
5040         if ( ( bExp | bSig ) == 0 ) goto invalid;
5041         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5042     }
5043     if ( bExp == 0x7FFF ) {
5044         if ((uint64_t)(bSig << 1)) {
5045             return propagateFloatx80NaN(a, b, status);
5046         }
5047         if ( ( aExp | aSig ) == 0 ) {
5048  invalid:
5049             float_raise(float_flag_invalid, status);
5050             return floatx80_default_nan(status);
5051         }
5052         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5053     }
5054     if ( aExp == 0 ) {
5055         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5056         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5057     }
5058     if ( bExp == 0 ) {
5059         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5060         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5061     }
5062     zExp = aExp + bExp - 0x3FFE;
5063     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5064     if ( 0 < (int64_t) zSig0 ) {
5065         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5066         --zExp;
5067     }
5068     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5069                                 zSign, zExp, zSig0, zSig1, status);
5070 }
5071 
5072 /*----------------------------------------------------------------------------
5073 | Returns the result of dividing the extended double-precision floating-point
5074 | value `a' by the corresponding value `b'.  The operation is performed
5075 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5076 *----------------------------------------------------------------------------*/
5077 
5078 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5079 {
5080     flag aSign, bSign, zSign;
5081     int32_t aExp, bExp, zExp;
5082     uint64_t aSig, bSig, zSig0, zSig1;
5083     uint64_t rem0, rem1, rem2, term0, term1, term2;
5084 
5085     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5086         float_raise(float_flag_invalid, status);
5087         return floatx80_default_nan(status);
5088     }
5089     aSig = extractFloatx80Frac( a );
5090     aExp = extractFloatx80Exp( a );
5091     aSign = extractFloatx80Sign( a );
5092     bSig = extractFloatx80Frac( b );
5093     bExp = extractFloatx80Exp( b );
5094     bSign = extractFloatx80Sign( b );
5095     zSign = aSign ^ bSign;
5096     if ( aExp == 0x7FFF ) {
5097         if ((uint64_t)(aSig << 1)) {
5098             return propagateFloatx80NaN(a, b, status);
5099         }
5100         if ( bExp == 0x7FFF ) {
5101             if ((uint64_t)(bSig << 1)) {
5102                 return propagateFloatx80NaN(a, b, status);
5103             }
5104             goto invalid;
5105         }
5106         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5107     }
5108     if ( bExp == 0x7FFF ) {
5109         if ((uint64_t)(bSig << 1)) {
5110             return propagateFloatx80NaN(a, b, status);
5111         }
5112         return packFloatx80( zSign, 0, 0 );
5113     }
5114     if ( bExp == 0 ) {
5115         if ( bSig == 0 ) {
5116             if ( ( aExp | aSig ) == 0 ) {
5117  invalid:
5118                 float_raise(float_flag_invalid, status);
5119                 return floatx80_default_nan(status);
5120             }
5121             float_raise(float_flag_divbyzero, status);
5122             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5123         }
5124         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5125     }
5126     if ( aExp == 0 ) {
5127         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5128         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5129     }
5130     zExp = aExp - bExp + 0x3FFE;
5131     rem1 = 0;
5132     if ( bSig <= aSig ) {
5133         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5134         ++zExp;
5135     }
5136     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5137     mul64To128( bSig, zSig0, &term0, &term1 );
5138     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5139     while ( (int64_t) rem0 < 0 ) {
5140         --zSig0;
5141         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5142     }
5143     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5144     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5145         mul64To128( bSig, zSig1, &term1, &term2 );
5146         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5147         while ( (int64_t) rem1 < 0 ) {
5148             --zSig1;
5149             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5150         }
5151         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5152     }
5153     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5154                                 zSign, zExp, zSig0, zSig1, status);
5155 }
5156 
5157 /*----------------------------------------------------------------------------
5158 | Returns the remainder of the extended double-precision floating-point value
5159 | `a' with respect to the corresponding value `b'.  The operation is performed
5160 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5161 *----------------------------------------------------------------------------*/
5162 
5163 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5164 {
5165     flag aSign, zSign;
5166     int32_t aExp, bExp, expDiff;
5167     uint64_t aSig0, aSig1, bSig;
5168     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5169 
5170     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5171         float_raise(float_flag_invalid, status);
5172         return floatx80_default_nan(status);
5173     }
5174     aSig0 = extractFloatx80Frac( a );
5175     aExp = extractFloatx80Exp( a );
5176     aSign = extractFloatx80Sign( a );
5177     bSig = extractFloatx80Frac( b );
5178     bExp = extractFloatx80Exp( b );
5179     if ( aExp == 0x7FFF ) {
5180         if (    (uint64_t) ( aSig0<<1 )
5181              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5182             return propagateFloatx80NaN(a, b, status);
5183         }
5184         goto invalid;
5185     }
5186     if ( bExp == 0x7FFF ) {
5187         if ((uint64_t)(bSig << 1)) {
5188             return propagateFloatx80NaN(a, b, status);
5189         }
5190         return a;
5191     }
5192     if ( bExp == 0 ) {
5193         if ( bSig == 0 ) {
5194  invalid:
5195             float_raise(float_flag_invalid, status);
5196             return floatx80_default_nan(status);
5197         }
5198         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5199     }
5200     if ( aExp == 0 ) {
5201         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5202         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5203     }
5204     bSig |= LIT64( 0x8000000000000000 );
5205     zSign = aSign;
5206     expDiff = aExp - bExp;
5207     aSig1 = 0;
5208     if ( expDiff < 0 ) {
5209         if ( expDiff < -1 ) return a;
5210         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5211         expDiff = 0;
5212     }
5213     q = ( bSig <= aSig0 );
5214     if ( q ) aSig0 -= bSig;
5215     expDiff -= 64;
5216     while ( 0 < expDiff ) {
5217         q = estimateDiv128To64( aSig0, aSig1, bSig );
5218         q = ( 2 < q ) ? q - 2 : 0;
5219         mul64To128( bSig, q, &term0, &term1 );
5220         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5221         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5222         expDiff -= 62;
5223     }
5224     expDiff += 64;
5225     if ( 0 < expDiff ) {
5226         q = estimateDiv128To64( aSig0, aSig1, bSig );
5227         q = ( 2 < q ) ? q - 2 : 0;
5228         q >>= 64 - expDiff;
5229         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5230         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5231         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5232         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5233             ++q;
5234             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5235         }
5236     }
5237     else {
5238         term1 = 0;
5239         term0 = bSig;
5240     }
5241     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5242     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5243          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5244               && ( q & 1 ) )
5245        ) {
5246         aSig0 = alternateASig0;
5247         aSig1 = alternateASig1;
5248         zSign = ! zSign;
5249     }
5250     return
5251         normalizeRoundAndPackFloatx80(
5252             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5253 
5254 }
5255 
5256 /*----------------------------------------------------------------------------
5257 | Returns the square root of the extended double-precision floating-point
5258 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5259 | for Binary Floating-Point Arithmetic.
5260 *----------------------------------------------------------------------------*/
5261 
5262 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5263 {
5264     flag aSign;
5265     int32_t aExp, zExp;
5266     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5267     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5268 
5269     if (floatx80_invalid_encoding(a)) {
5270         float_raise(float_flag_invalid, status);
5271         return floatx80_default_nan(status);
5272     }
5273     aSig0 = extractFloatx80Frac( a );
5274     aExp = extractFloatx80Exp( a );
5275     aSign = extractFloatx80Sign( a );
5276     if ( aExp == 0x7FFF ) {
5277         if ((uint64_t)(aSig0 << 1)) {
5278             return propagateFloatx80NaN(a, a, status);
5279         }
5280         if ( ! aSign ) return a;
5281         goto invalid;
5282     }
5283     if ( aSign ) {
5284         if ( ( aExp | aSig0 ) == 0 ) return a;
5285  invalid:
5286         float_raise(float_flag_invalid, status);
5287         return floatx80_default_nan(status);
5288     }
5289     if ( aExp == 0 ) {
5290         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5291         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5292     }
5293     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5294     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5295     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5296     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5297     doubleZSig0 = zSig0<<1;
5298     mul64To128( zSig0, zSig0, &term0, &term1 );
5299     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5300     while ( (int64_t) rem0 < 0 ) {
5301         --zSig0;
5302         doubleZSig0 -= 2;
5303         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5304     }
5305     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5306     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5307         if ( zSig1 == 0 ) zSig1 = 1;
5308         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5309         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5310         mul64To128( zSig1, zSig1, &term2, &term3 );
5311         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5312         while ( (int64_t) rem1 < 0 ) {
5313             --zSig1;
5314             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5315             term3 |= 1;
5316             term2 |= doubleZSig0;
5317             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5318         }
5319         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5320     }
5321     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5322     zSig0 |= doubleZSig0;
5323     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5324                                 0, zExp, zSig0, zSig1, status);
5325 }
5326 
5327 /*----------------------------------------------------------------------------
5328 | Returns 1 if the extended double-precision floating-point value `a' is equal
5329 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5330 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5331 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5332 *----------------------------------------------------------------------------*/
5333 
5334 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5335 {
5336 
5337     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5338         || (extractFloatx80Exp(a) == 0x7FFF
5339             && (uint64_t) (extractFloatx80Frac(a) << 1))
5340         || (extractFloatx80Exp(b) == 0x7FFF
5341             && (uint64_t) (extractFloatx80Frac(b) << 1))
5342        ) {
5343         float_raise(float_flag_invalid, status);
5344         return 0;
5345     }
5346     return
5347            ( a.low == b.low )
5348         && (    ( a.high == b.high )
5349              || (    ( a.low == 0 )
5350                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5351            );
5352 
5353 }
5354 
5355 /*----------------------------------------------------------------------------
5356 | Returns 1 if the extended double-precision floating-point value `a' is
5357 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5358 | invalid exception is raised if either operand is a NaN.  The comparison is
5359 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5360 | Arithmetic.
5361 *----------------------------------------------------------------------------*/
5362 
5363 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5364 {
5365     flag aSign, bSign;
5366 
5367     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5368         || (extractFloatx80Exp(a) == 0x7FFF
5369             && (uint64_t) (extractFloatx80Frac(a) << 1))
5370         || (extractFloatx80Exp(b) == 0x7FFF
5371             && (uint64_t) (extractFloatx80Frac(b) << 1))
5372        ) {
5373         float_raise(float_flag_invalid, status);
5374         return 0;
5375     }
5376     aSign = extractFloatx80Sign( a );
5377     bSign = extractFloatx80Sign( b );
5378     if ( aSign != bSign ) {
5379         return
5380                aSign
5381             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5382                  == 0 );
5383     }
5384     return
5385           aSign ? le128( b.high, b.low, a.high, a.low )
5386         : le128( a.high, a.low, b.high, b.low );
5387 
5388 }
5389 
5390 /*----------------------------------------------------------------------------
5391 | Returns 1 if the extended double-precision floating-point value `a' is
5392 | less than the corresponding value `b', and 0 otherwise.  The invalid
5393 | exception is raised if either operand is a NaN.  The comparison is performed
5394 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5395 *----------------------------------------------------------------------------*/
5396 
5397 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5398 {
5399     flag aSign, bSign;
5400 
5401     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5402         || (extractFloatx80Exp(a) == 0x7FFF
5403             && (uint64_t) (extractFloatx80Frac(a) << 1))
5404         || (extractFloatx80Exp(b) == 0x7FFF
5405             && (uint64_t) (extractFloatx80Frac(b) << 1))
5406        ) {
5407         float_raise(float_flag_invalid, status);
5408         return 0;
5409     }
5410     aSign = extractFloatx80Sign( a );
5411     bSign = extractFloatx80Sign( b );
5412     if ( aSign != bSign ) {
5413         return
5414                aSign
5415             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5416                  != 0 );
5417     }
5418     return
5419           aSign ? lt128( b.high, b.low, a.high, a.low )
5420         : lt128( a.high, a.low, b.high, b.low );
5421 
5422 }
5423 
5424 /*----------------------------------------------------------------------------
5425 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5426 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5427 | either operand is a NaN.   The comparison is performed according to the
5428 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5429 *----------------------------------------------------------------------------*/
5430 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5431 {
5432     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5433         || (extractFloatx80Exp(a) == 0x7FFF
5434             && (uint64_t) (extractFloatx80Frac(a) << 1))
5435         || (extractFloatx80Exp(b) == 0x7FFF
5436             && (uint64_t) (extractFloatx80Frac(b) << 1))
5437        ) {
5438         float_raise(float_flag_invalid, status);
5439         return 1;
5440     }
5441     return 0;
5442 }
5443 
5444 /*----------------------------------------------------------------------------
5445 | Returns 1 if the extended double-precision floating-point value `a' is
5446 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5447 | cause an exception.  The comparison is performed according to the IEC/IEEE
5448 | Standard for Binary Floating-Point Arithmetic.
5449 *----------------------------------------------------------------------------*/
5450 
5451 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5452 {
5453 
5454     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5455         float_raise(float_flag_invalid, status);
5456         return 0;
5457     }
5458     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5459               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5460          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5461               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5462        ) {
5463         if (floatx80_is_signaling_nan(a, status)
5464          || floatx80_is_signaling_nan(b, status)) {
5465             float_raise(float_flag_invalid, status);
5466         }
5467         return 0;
5468     }
5469     return
5470            ( a.low == b.low )
5471         && (    ( a.high == b.high )
5472              || (    ( a.low == 0 )
5473                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5474            );
5475 
5476 }
5477 
5478 /*----------------------------------------------------------------------------
5479 | Returns 1 if the extended double-precision floating-point value `a' is less
5480 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5481 | do not cause an exception.  Otherwise, the comparison is performed according
5482 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5483 *----------------------------------------------------------------------------*/
5484 
5485 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5486 {
5487     flag aSign, bSign;
5488 
5489     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5490         float_raise(float_flag_invalid, status);
5491         return 0;
5492     }
5493     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5494               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5495          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5496               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5497        ) {
5498         if (floatx80_is_signaling_nan(a, status)
5499          || floatx80_is_signaling_nan(b, status)) {
5500             float_raise(float_flag_invalid, status);
5501         }
5502         return 0;
5503     }
5504     aSign = extractFloatx80Sign( a );
5505     bSign = extractFloatx80Sign( b );
5506     if ( aSign != bSign ) {
5507         return
5508                aSign
5509             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5510                  == 0 );
5511     }
5512     return
5513           aSign ? le128( b.high, b.low, a.high, a.low )
5514         : le128( a.high, a.low, b.high, b.low );
5515 
5516 }
5517 
5518 /*----------------------------------------------------------------------------
5519 | Returns 1 if the extended double-precision floating-point value `a' is less
5520 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5521 | an exception.  Otherwise, the comparison is performed according to the
5522 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5523 *----------------------------------------------------------------------------*/
5524 
5525 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5526 {
5527     flag aSign, bSign;
5528 
5529     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5530         float_raise(float_flag_invalid, status);
5531         return 0;
5532     }
5533     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5534               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5535          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5536               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5537        ) {
5538         if (floatx80_is_signaling_nan(a, status)
5539          || floatx80_is_signaling_nan(b, status)) {
5540             float_raise(float_flag_invalid, status);
5541         }
5542         return 0;
5543     }
5544     aSign = extractFloatx80Sign( a );
5545     bSign = extractFloatx80Sign( b );
5546     if ( aSign != bSign ) {
5547         return
5548                aSign
5549             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5550                  != 0 );
5551     }
5552     return
5553           aSign ? lt128( b.high, b.low, a.high, a.low )
5554         : lt128( a.high, a.low, b.high, b.low );
5555 
5556 }
5557 
5558 /*----------------------------------------------------------------------------
5559 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5560 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5561 | The comparison is performed according to the IEC/IEEE Standard for Binary
5562 | Floating-Point Arithmetic.
5563 *----------------------------------------------------------------------------*/
5564 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5565 {
5566     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5567         float_raise(float_flag_invalid, status);
5568         return 1;
5569     }
5570     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5571               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5572          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5573               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5574        ) {
5575         if (floatx80_is_signaling_nan(a, status)
5576          || floatx80_is_signaling_nan(b, status)) {
5577             float_raise(float_flag_invalid, status);
5578         }
5579         return 1;
5580     }
5581     return 0;
5582 }
5583 
5584 /*----------------------------------------------------------------------------
5585 | Returns the result of converting the quadruple-precision floating-point
5586 | value `a' to the 32-bit two's complement integer format.  The conversion
5587 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5588 | Arithmetic---which means in particular that the conversion is rounded
5589 | according to the current rounding mode.  If `a' is a NaN, the largest
5590 | positive integer is returned.  Otherwise, if the conversion overflows, the
5591 | largest integer with the same sign as `a' is returned.
5592 *----------------------------------------------------------------------------*/
5593 
5594 int32_t float128_to_int32(float128 a, float_status *status)
5595 {
5596     flag aSign;
5597     int32_t aExp, shiftCount;
5598     uint64_t aSig0, aSig1;
5599 
5600     aSig1 = extractFloat128Frac1( a );
5601     aSig0 = extractFloat128Frac0( a );
5602     aExp = extractFloat128Exp( a );
5603     aSign = extractFloat128Sign( a );
5604     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5605     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5606     aSig0 |= ( aSig1 != 0 );
5607     shiftCount = 0x4028 - aExp;
5608     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5609     return roundAndPackInt32(aSign, aSig0, status);
5610 
5611 }
5612 
5613 /*----------------------------------------------------------------------------
5614 | Returns the result of converting the quadruple-precision floating-point
5615 | value `a' to the 32-bit two's complement integer format.  The conversion
5616 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5617 | Arithmetic, except that the conversion is always rounded toward zero.  If
5618 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5619 | conversion overflows, the largest integer with the same sign as `a' is
5620 | returned.
5621 *----------------------------------------------------------------------------*/
5622 
5623 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5624 {
5625     flag aSign;
5626     int32_t aExp, shiftCount;
5627     uint64_t aSig0, aSig1, savedASig;
5628     int32_t z;
5629 
5630     aSig1 = extractFloat128Frac1( a );
5631     aSig0 = extractFloat128Frac0( a );
5632     aExp = extractFloat128Exp( a );
5633     aSign = extractFloat128Sign( a );
5634     aSig0 |= ( aSig1 != 0 );
5635     if ( 0x401E < aExp ) {
5636         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5637         goto invalid;
5638     }
5639     else if ( aExp < 0x3FFF ) {
5640         if (aExp || aSig0) {
5641             status->float_exception_flags |= float_flag_inexact;
5642         }
5643         return 0;
5644     }
5645     aSig0 |= LIT64( 0x0001000000000000 );
5646     shiftCount = 0x402F - aExp;
5647     savedASig = aSig0;
5648     aSig0 >>= shiftCount;
5649     z = aSig0;
5650     if ( aSign ) z = - z;
5651     if ( ( z < 0 ) ^ aSign ) {
5652  invalid:
5653         float_raise(float_flag_invalid, status);
5654         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5655     }
5656     if ( ( aSig0<<shiftCount ) != savedASig ) {
5657         status->float_exception_flags |= float_flag_inexact;
5658     }
5659     return z;
5660 
5661 }
5662 
5663 /*----------------------------------------------------------------------------
5664 | Returns the result of converting the quadruple-precision floating-point
5665 | value `a' to the 64-bit two's complement integer format.  The conversion
5666 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5667 | Arithmetic---which means in particular that the conversion is rounded
5668 | according to the current rounding mode.  If `a' is a NaN, the largest
5669 | positive integer is returned.  Otherwise, if the conversion overflows, the
5670 | largest integer with the same sign as `a' is returned.
5671 *----------------------------------------------------------------------------*/
5672 
5673 int64_t float128_to_int64(float128 a, float_status *status)
5674 {
5675     flag aSign;
5676     int32_t aExp, shiftCount;
5677     uint64_t aSig0, aSig1;
5678 
5679     aSig1 = extractFloat128Frac1( a );
5680     aSig0 = extractFloat128Frac0( a );
5681     aExp = extractFloat128Exp( a );
5682     aSign = extractFloat128Sign( a );
5683     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5684     shiftCount = 0x402F - aExp;
5685     if ( shiftCount <= 0 ) {
5686         if ( 0x403E < aExp ) {
5687             float_raise(float_flag_invalid, status);
5688             if (    ! aSign
5689                  || (    ( aExp == 0x7FFF )
5690                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5691                     )
5692                ) {
5693                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5694             }
5695             return (int64_t) LIT64( 0x8000000000000000 );
5696         }
5697         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5698     }
5699     else {
5700         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5701     }
5702     return roundAndPackInt64(aSign, aSig0, aSig1, status);
5703 
5704 }
5705 
5706 /*----------------------------------------------------------------------------
5707 | Returns the result of converting the quadruple-precision floating-point
5708 | value `a' to the 64-bit two's complement integer format.  The conversion
5709 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5710 | Arithmetic, except that the conversion is always rounded toward zero.
5711 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5712 | the conversion overflows, the largest integer with the same sign as `a' is
5713 | returned.
5714 *----------------------------------------------------------------------------*/
5715 
5716 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5717 {
5718     flag aSign;
5719     int32_t aExp, shiftCount;
5720     uint64_t aSig0, aSig1;
5721     int64_t z;
5722 
5723     aSig1 = extractFloat128Frac1( a );
5724     aSig0 = extractFloat128Frac0( a );
5725     aExp = extractFloat128Exp( a );
5726     aSign = extractFloat128Sign( a );
5727     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5728     shiftCount = aExp - 0x402F;
5729     if ( 0 < shiftCount ) {
5730         if ( 0x403E <= aExp ) {
5731             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5732             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5733                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5734                 if (aSig1) {
5735                     status->float_exception_flags |= float_flag_inexact;
5736                 }
5737             }
5738             else {
5739                 float_raise(float_flag_invalid, status);
5740                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5741                     return LIT64( 0x7FFFFFFFFFFFFFFF );
5742                 }
5743             }
5744             return (int64_t) LIT64( 0x8000000000000000 );
5745         }
5746         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5747         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5748             status->float_exception_flags |= float_flag_inexact;
5749         }
5750     }
5751     else {
5752         if ( aExp < 0x3FFF ) {
5753             if ( aExp | aSig0 | aSig1 ) {
5754                 status->float_exception_flags |= float_flag_inexact;
5755             }
5756             return 0;
5757         }
5758         z = aSig0>>( - shiftCount );
5759         if (    aSig1
5760              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5761             status->float_exception_flags |= float_flag_inexact;
5762         }
5763     }
5764     if ( aSign ) z = - z;
5765     return z;
5766 
5767 }
5768 
5769 /*----------------------------------------------------------------------------
5770 | Returns the result of converting the quadruple-precision floating-point value
5771 | `a' to the 64-bit unsigned integer format.  The conversion is
5772 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5773 | Arithmetic---which means in particular that the conversion is rounded
5774 | according to the current rounding mode.  If `a' is a NaN, the largest
5775 | positive integer is returned.  If the conversion overflows, the
5776 | largest unsigned integer is returned.  If 'a' is negative, the value is
5777 | rounded and zero is returned; negative values that do not round to zero
5778 | will raise the inexact exception.
5779 *----------------------------------------------------------------------------*/
5780 
5781 uint64_t float128_to_uint64(float128 a, float_status *status)
5782 {
5783     flag aSign;
5784     int aExp;
5785     int shiftCount;
5786     uint64_t aSig0, aSig1;
5787 
5788     aSig0 = extractFloat128Frac0(a);
5789     aSig1 = extractFloat128Frac1(a);
5790     aExp = extractFloat128Exp(a);
5791     aSign = extractFloat128Sign(a);
5792     if (aSign && (aExp > 0x3FFE)) {
5793         float_raise(float_flag_invalid, status);
5794         if (float128_is_any_nan(a)) {
5795             return LIT64(0xFFFFFFFFFFFFFFFF);
5796         } else {
5797             return 0;
5798         }
5799     }
5800     if (aExp) {
5801         aSig0 |= LIT64(0x0001000000000000);
5802     }
5803     shiftCount = 0x402F - aExp;
5804     if (shiftCount <= 0) {
5805         if (0x403E < aExp) {
5806             float_raise(float_flag_invalid, status);
5807             return LIT64(0xFFFFFFFFFFFFFFFF);
5808         }
5809         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5810     } else {
5811         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5812     }
5813     return roundAndPackUint64(aSign, aSig0, aSig1, status);
5814 }
5815 
5816 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5817 {
5818     uint64_t v;
5819     signed char current_rounding_mode = status->float_rounding_mode;
5820 
5821     set_float_rounding_mode(float_round_to_zero, status);
5822     v = float128_to_uint64(a, status);
5823     set_float_rounding_mode(current_rounding_mode, status);
5824 
5825     return v;
5826 }
5827 
5828 /*----------------------------------------------------------------------------
5829 | Returns the result of converting the quadruple-precision floating-point
5830 | value `a' to the 32-bit unsigned integer format.  The conversion
5831 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5832 | Arithmetic except that the conversion is always rounded toward zero.
5833 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
5834 | if the conversion overflows, the largest unsigned integer is returned.
5835 | If 'a' is negative, the value is rounded and zero is returned; negative
5836 | values that do not round to zero will raise the inexact exception.
5837 *----------------------------------------------------------------------------*/
5838 
5839 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5840 {
5841     uint64_t v;
5842     uint32_t res;
5843     int old_exc_flags = get_float_exception_flags(status);
5844 
5845     v = float128_to_uint64_round_to_zero(a, status);
5846     if (v > 0xffffffff) {
5847         res = 0xffffffff;
5848     } else {
5849         return v;
5850     }
5851     set_float_exception_flags(old_exc_flags, status);
5852     float_raise(float_flag_invalid, status);
5853     return res;
5854 }
5855 
5856 /*----------------------------------------------------------------------------
5857 | Returns the result of converting the quadruple-precision floating-point
5858 | value `a' to the single-precision floating-point format.  The conversion
5859 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5860 | Arithmetic.
5861 *----------------------------------------------------------------------------*/
5862 
5863 float32 float128_to_float32(float128 a, float_status *status)
5864 {
5865     flag aSign;
5866     int32_t aExp;
5867     uint64_t aSig0, aSig1;
5868     uint32_t zSig;
5869 
5870     aSig1 = extractFloat128Frac1( a );
5871     aSig0 = extractFloat128Frac0( a );
5872     aExp = extractFloat128Exp( a );
5873     aSign = extractFloat128Sign( a );
5874     if ( aExp == 0x7FFF ) {
5875         if ( aSig0 | aSig1 ) {
5876             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5877         }
5878         return packFloat32( aSign, 0xFF, 0 );
5879     }
5880     aSig0 |= ( aSig1 != 0 );
5881     shift64RightJamming( aSig0, 18, &aSig0 );
5882     zSig = aSig0;
5883     if ( aExp || zSig ) {
5884         zSig |= 0x40000000;
5885         aExp -= 0x3F81;
5886     }
5887     return roundAndPackFloat32(aSign, aExp, zSig, status);
5888 
5889 }
5890 
5891 /*----------------------------------------------------------------------------
5892 | Returns the result of converting the quadruple-precision floating-point
5893 | value `a' to the double-precision floating-point format.  The conversion
5894 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5895 | Arithmetic.
5896 *----------------------------------------------------------------------------*/
5897 
5898 float64 float128_to_float64(float128 a, float_status *status)
5899 {
5900     flag aSign;
5901     int32_t aExp;
5902     uint64_t aSig0, aSig1;
5903 
5904     aSig1 = extractFloat128Frac1( a );
5905     aSig0 = extractFloat128Frac0( a );
5906     aExp = extractFloat128Exp( a );
5907     aSign = extractFloat128Sign( a );
5908     if ( aExp == 0x7FFF ) {
5909         if ( aSig0 | aSig1 ) {
5910             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5911         }
5912         return packFloat64( aSign, 0x7FF, 0 );
5913     }
5914     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5915     aSig0 |= ( aSig1 != 0 );
5916     if ( aExp || aSig0 ) {
5917         aSig0 |= LIT64( 0x4000000000000000 );
5918         aExp -= 0x3C01;
5919     }
5920     return roundAndPackFloat64(aSign, aExp, aSig0, status);
5921 
5922 }
5923 
5924 /*----------------------------------------------------------------------------
5925 | Returns the result of converting the quadruple-precision floating-point
5926 | value `a' to the extended double-precision floating-point format.  The
5927 | conversion is performed according to the IEC/IEEE Standard for Binary
5928 | Floating-Point Arithmetic.
5929 *----------------------------------------------------------------------------*/
5930 
5931 floatx80 float128_to_floatx80(float128 a, float_status *status)
5932 {
5933     flag aSign;
5934     int32_t aExp;
5935     uint64_t aSig0, aSig1;
5936 
5937     aSig1 = extractFloat128Frac1( a );
5938     aSig0 = extractFloat128Frac0( a );
5939     aExp = extractFloat128Exp( a );
5940     aSign = extractFloat128Sign( a );
5941     if ( aExp == 0x7FFF ) {
5942         if ( aSig0 | aSig1 ) {
5943             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
5944         }
5945         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5946     }
5947     if ( aExp == 0 ) {
5948         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5949         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5950     }
5951     else {
5952         aSig0 |= LIT64( 0x0001000000000000 );
5953     }
5954     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5955     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
5956 
5957 }
5958 
5959 /*----------------------------------------------------------------------------
5960 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5961 | returns the result as a quadruple-precision floating-point value.  The
5962 | operation is performed according to the IEC/IEEE Standard for Binary
5963 | Floating-Point Arithmetic.
5964 *----------------------------------------------------------------------------*/
5965 
5966 float128 float128_round_to_int(float128 a, float_status *status)
5967 {
5968     flag aSign;
5969     int32_t aExp;
5970     uint64_t lastBitMask, roundBitsMask;
5971     float128 z;
5972 
5973     aExp = extractFloat128Exp( a );
5974     if ( 0x402F <= aExp ) {
5975         if ( 0x406F <= aExp ) {
5976             if (    ( aExp == 0x7FFF )
5977                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5978                ) {
5979                 return propagateFloat128NaN(a, a, status);
5980             }
5981             return a;
5982         }
5983         lastBitMask = 1;
5984         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5985         roundBitsMask = lastBitMask - 1;
5986         z = a;
5987         switch (status->float_rounding_mode) {
5988         case float_round_nearest_even:
5989             if ( lastBitMask ) {
5990                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5991                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5992             }
5993             else {
5994                 if ( (int64_t) z.low < 0 ) {
5995                     ++z.high;
5996                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5997                 }
5998             }
5999             break;
6000         case float_round_ties_away:
6001             if (lastBitMask) {
6002                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6003             } else {
6004                 if ((int64_t) z.low < 0) {
6005                     ++z.high;
6006                 }
6007             }
6008             break;
6009         case float_round_to_zero:
6010             break;
6011         case float_round_up:
6012             if (!extractFloat128Sign(z)) {
6013                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6014             }
6015             break;
6016         case float_round_down:
6017             if (extractFloat128Sign(z)) {
6018                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6019             }
6020             break;
6021         default:
6022             abort();
6023         }
6024         z.low &= ~ roundBitsMask;
6025     }
6026     else {
6027         if ( aExp < 0x3FFF ) {
6028             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6029             status->float_exception_flags |= float_flag_inexact;
6030             aSign = extractFloat128Sign( a );
6031             switch (status->float_rounding_mode) {
6032              case float_round_nearest_even:
6033                 if (    ( aExp == 0x3FFE )
6034                      && (   extractFloat128Frac0( a )
6035                           | extractFloat128Frac1( a ) )
6036                    ) {
6037                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6038                 }
6039                 break;
6040             case float_round_ties_away:
6041                 if (aExp == 0x3FFE) {
6042                     return packFloat128(aSign, 0x3FFF, 0, 0);
6043                 }
6044                 break;
6045              case float_round_down:
6046                 return
6047                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6048                     : packFloat128( 0, 0, 0, 0 );
6049              case float_round_up:
6050                 return
6051                       aSign ? packFloat128( 1, 0, 0, 0 )
6052                     : packFloat128( 0, 0x3FFF, 0, 0 );
6053             }
6054             return packFloat128( aSign, 0, 0, 0 );
6055         }
6056         lastBitMask = 1;
6057         lastBitMask <<= 0x402F - aExp;
6058         roundBitsMask = lastBitMask - 1;
6059         z.low = 0;
6060         z.high = a.high;
6061         switch (status->float_rounding_mode) {
6062         case float_round_nearest_even:
6063             z.high += lastBitMask>>1;
6064             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6065                 z.high &= ~ lastBitMask;
6066             }
6067             break;
6068         case float_round_ties_away:
6069             z.high += lastBitMask>>1;
6070             break;
6071         case float_round_to_zero:
6072             break;
6073         case float_round_up:
6074             if (!extractFloat128Sign(z)) {
6075                 z.high |= ( a.low != 0 );
6076                 z.high += roundBitsMask;
6077             }
6078             break;
6079         case float_round_down:
6080             if (extractFloat128Sign(z)) {
6081                 z.high |= (a.low != 0);
6082                 z.high += roundBitsMask;
6083             }
6084             break;
6085         default:
6086             abort();
6087         }
6088         z.high &= ~ roundBitsMask;
6089     }
6090     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6091         status->float_exception_flags |= float_flag_inexact;
6092     }
6093     return z;
6094 
6095 }
6096 
6097 /*----------------------------------------------------------------------------
6098 | Returns the result of adding the absolute values of the quadruple-precision
6099 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6100 | before being returned.  `zSign' is ignored if the result is a NaN.
6101 | The addition is performed according to the IEC/IEEE Standard for Binary
6102 | Floating-Point Arithmetic.
6103 *----------------------------------------------------------------------------*/
6104 
6105 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6106                                 float_status *status)
6107 {
6108     int32_t aExp, bExp, zExp;
6109     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6110     int32_t expDiff;
6111 
6112     aSig1 = extractFloat128Frac1( a );
6113     aSig0 = extractFloat128Frac0( a );
6114     aExp = extractFloat128Exp( a );
6115     bSig1 = extractFloat128Frac1( b );
6116     bSig0 = extractFloat128Frac0( b );
6117     bExp = extractFloat128Exp( b );
6118     expDiff = aExp - bExp;
6119     if ( 0 < expDiff ) {
6120         if ( aExp == 0x7FFF ) {
6121             if (aSig0 | aSig1) {
6122                 return propagateFloat128NaN(a, b, status);
6123             }
6124             return a;
6125         }
6126         if ( bExp == 0 ) {
6127             --expDiff;
6128         }
6129         else {
6130             bSig0 |= LIT64( 0x0001000000000000 );
6131         }
6132         shift128ExtraRightJamming(
6133             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6134         zExp = aExp;
6135     }
6136     else if ( expDiff < 0 ) {
6137         if ( bExp == 0x7FFF ) {
6138             if (bSig0 | bSig1) {
6139                 return propagateFloat128NaN(a, b, status);
6140             }
6141             return packFloat128( zSign, 0x7FFF, 0, 0 );
6142         }
6143         if ( aExp == 0 ) {
6144             ++expDiff;
6145         }
6146         else {
6147             aSig0 |= LIT64( 0x0001000000000000 );
6148         }
6149         shift128ExtraRightJamming(
6150             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6151         zExp = bExp;
6152     }
6153     else {
6154         if ( aExp == 0x7FFF ) {
6155             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6156                 return propagateFloat128NaN(a, b, status);
6157             }
6158             return a;
6159         }
6160         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6161         if ( aExp == 0 ) {
6162             if (status->flush_to_zero) {
6163                 if (zSig0 | zSig1) {
6164                     float_raise(float_flag_output_denormal, status);
6165                 }
6166                 return packFloat128(zSign, 0, 0, 0);
6167             }
6168             return packFloat128( zSign, 0, zSig0, zSig1 );
6169         }
6170         zSig2 = 0;
6171         zSig0 |= LIT64( 0x0002000000000000 );
6172         zExp = aExp;
6173         goto shiftRight1;
6174     }
6175     aSig0 |= LIT64( 0x0001000000000000 );
6176     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6177     --zExp;
6178     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6179     ++zExp;
6180  shiftRight1:
6181     shift128ExtraRightJamming(
6182         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6183  roundAndPack:
6184     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6185 
6186 }
6187 
6188 /*----------------------------------------------------------------------------
6189 | Returns the result of subtracting the absolute values of the quadruple-
6190 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6191 | difference is negated before being returned.  `zSign' is ignored if the
6192 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6193 | Standard for Binary Floating-Point Arithmetic.
6194 *----------------------------------------------------------------------------*/
6195 
6196 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6197                                 float_status *status)
6198 {
6199     int32_t aExp, bExp, zExp;
6200     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6201     int32_t expDiff;
6202 
6203     aSig1 = extractFloat128Frac1( a );
6204     aSig0 = extractFloat128Frac0( a );
6205     aExp = extractFloat128Exp( a );
6206     bSig1 = extractFloat128Frac1( b );
6207     bSig0 = extractFloat128Frac0( b );
6208     bExp = extractFloat128Exp( b );
6209     expDiff = aExp - bExp;
6210     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6211     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6212     if ( 0 < expDiff ) goto aExpBigger;
6213     if ( expDiff < 0 ) goto bExpBigger;
6214     if ( aExp == 0x7FFF ) {
6215         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6216             return propagateFloat128NaN(a, b, status);
6217         }
6218         float_raise(float_flag_invalid, status);
6219         return float128_default_nan(status);
6220     }
6221     if ( aExp == 0 ) {
6222         aExp = 1;
6223         bExp = 1;
6224     }
6225     if ( bSig0 < aSig0 ) goto aBigger;
6226     if ( aSig0 < bSig0 ) goto bBigger;
6227     if ( bSig1 < aSig1 ) goto aBigger;
6228     if ( aSig1 < bSig1 ) goto bBigger;
6229     return packFloat128(status->float_rounding_mode == float_round_down,
6230                         0, 0, 0);
6231  bExpBigger:
6232     if ( bExp == 0x7FFF ) {
6233         if (bSig0 | bSig1) {
6234             return propagateFloat128NaN(a, b, status);
6235         }
6236         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6237     }
6238     if ( aExp == 0 ) {
6239         ++expDiff;
6240     }
6241     else {
6242         aSig0 |= LIT64( 0x4000000000000000 );
6243     }
6244     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6245     bSig0 |= LIT64( 0x4000000000000000 );
6246  bBigger:
6247     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6248     zExp = bExp;
6249     zSign ^= 1;
6250     goto normalizeRoundAndPack;
6251  aExpBigger:
6252     if ( aExp == 0x7FFF ) {
6253         if (aSig0 | aSig1) {
6254             return propagateFloat128NaN(a, b, status);
6255         }
6256         return a;
6257     }
6258     if ( bExp == 0 ) {
6259         --expDiff;
6260     }
6261     else {
6262         bSig0 |= LIT64( 0x4000000000000000 );
6263     }
6264     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6265     aSig0 |= LIT64( 0x4000000000000000 );
6266  aBigger:
6267     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6268     zExp = aExp;
6269  normalizeRoundAndPack:
6270     --zExp;
6271     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6272                                          status);
6273 
6274 }
6275 
6276 /*----------------------------------------------------------------------------
6277 | Returns the result of adding the quadruple-precision floating-point values
6278 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6279 | for Binary Floating-Point Arithmetic.
6280 *----------------------------------------------------------------------------*/
6281 
6282 float128 float128_add(float128 a, float128 b, float_status *status)
6283 {
6284     flag aSign, bSign;
6285 
6286     aSign = extractFloat128Sign( a );
6287     bSign = extractFloat128Sign( b );
6288     if ( aSign == bSign ) {
6289         return addFloat128Sigs(a, b, aSign, status);
6290     }
6291     else {
6292         return subFloat128Sigs(a, b, aSign, status);
6293     }
6294 
6295 }
6296 
6297 /*----------------------------------------------------------------------------
6298 | Returns the result of subtracting the quadruple-precision floating-point
6299 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6300 | Standard for Binary Floating-Point Arithmetic.
6301 *----------------------------------------------------------------------------*/
6302 
6303 float128 float128_sub(float128 a, float128 b, float_status *status)
6304 {
6305     flag aSign, bSign;
6306 
6307     aSign = extractFloat128Sign( a );
6308     bSign = extractFloat128Sign( b );
6309     if ( aSign == bSign ) {
6310         return subFloat128Sigs(a, b, aSign, status);
6311     }
6312     else {
6313         return addFloat128Sigs(a, b, aSign, status);
6314     }
6315 
6316 }
6317 
6318 /*----------------------------------------------------------------------------
6319 | Returns the result of multiplying the quadruple-precision floating-point
6320 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6321 | Standard for Binary Floating-Point Arithmetic.
6322 *----------------------------------------------------------------------------*/
6323 
6324 float128 float128_mul(float128 a, float128 b, float_status *status)
6325 {
6326     flag aSign, bSign, zSign;
6327     int32_t aExp, bExp, zExp;
6328     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6329 
6330     aSig1 = extractFloat128Frac1( a );
6331     aSig0 = extractFloat128Frac0( a );
6332     aExp = extractFloat128Exp( a );
6333     aSign = extractFloat128Sign( a );
6334     bSig1 = extractFloat128Frac1( b );
6335     bSig0 = extractFloat128Frac0( b );
6336     bExp = extractFloat128Exp( b );
6337     bSign = extractFloat128Sign( b );
6338     zSign = aSign ^ bSign;
6339     if ( aExp == 0x7FFF ) {
6340         if (    ( aSig0 | aSig1 )
6341              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6342             return propagateFloat128NaN(a, b, status);
6343         }
6344         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6345         return packFloat128( zSign, 0x7FFF, 0, 0 );
6346     }
6347     if ( bExp == 0x7FFF ) {
6348         if (bSig0 | bSig1) {
6349             return propagateFloat128NaN(a, b, status);
6350         }
6351         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6352  invalid:
6353             float_raise(float_flag_invalid, status);
6354             return float128_default_nan(status);
6355         }
6356         return packFloat128( zSign, 0x7FFF, 0, 0 );
6357     }
6358     if ( aExp == 0 ) {
6359         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6360         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6361     }
6362     if ( bExp == 0 ) {
6363         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6364         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6365     }
6366     zExp = aExp + bExp - 0x4000;
6367     aSig0 |= LIT64( 0x0001000000000000 );
6368     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6369     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6370     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6371     zSig2 |= ( zSig3 != 0 );
6372     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6373         shift128ExtraRightJamming(
6374             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6375         ++zExp;
6376     }
6377     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6378 
6379 }
6380 
6381 /*----------------------------------------------------------------------------
6382 | Returns the result of dividing the quadruple-precision floating-point value
6383 | `a' by the corresponding value `b'.  The operation is performed according to
6384 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6385 *----------------------------------------------------------------------------*/
6386 
6387 float128 float128_div(float128 a, float128 b, float_status *status)
6388 {
6389     flag aSign, bSign, zSign;
6390     int32_t aExp, bExp, zExp;
6391     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6392     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6393 
6394     aSig1 = extractFloat128Frac1( a );
6395     aSig0 = extractFloat128Frac0( a );
6396     aExp = extractFloat128Exp( a );
6397     aSign = extractFloat128Sign( a );
6398     bSig1 = extractFloat128Frac1( b );
6399     bSig0 = extractFloat128Frac0( b );
6400     bExp = extractFloat128Exp( b );
6401     bSign = extractFloat128Sign( b );
6402     zSign = aSign ^ bSign;
6403     if ( aExp == 0x7FFF ) {
6404         if (aSig0 | aSig1) {
6405             return propagateFloat128NaN(a, b, status);
6406         }
6407         if ( bExp == 0x7FFF ) {
6408             if (bSig0 | bSig1) {
6409                 return propagateFloat128NaN(a, b, status);
6410             }
6411             goto invalid;
6412         }
6413         return packFloat128( zSign, 0x7FFF, 0, 0 );
6414     }
6415     if ( bExp == 0x7FFF ) {
6416         if (bSig0 | bSig1) {
6417             return propagateFloat128NaN(a, b, status);
6418         }
6419         return packFloat128( zSign, 0, 0, 0 );
6420     }
6421     if ( bExp == 0 ) {
6422         if ( ( bSig0 | bSig1 ) == 0 ) {
6423             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6424  invalid:
6425                 float_raise(float_flag_invalid, status);
6426                 return float128_default_nan(status);
6427             }
6428             float_raise(float_flag_divbyzero, status);
6429             return packFloat128( zSign, 0x7FFF, 0, 0 );
6430         }
6431         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6432     }
6433     if ( aExp == 0 ) {
6434         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6435         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6436     }
6437     zExp = aExp - bExp + 0x3FFD;
6438     shortShift128Left(
6439         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6440     shortShift128Left(
6441         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6442     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6443         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6444         ++zExp;
6445     }
6446     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6447     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6448     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6449     while ( (int64_t) rem0 < 0 ) {
6450         --zSig0;
6451         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6452     }
6453     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6454     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6455         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6456         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6457         while ( (int64_t) rem1 < 0 ) {
6458             --zSig1;
6459             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6460         }
6461         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6462     }
6463     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6464     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6465 
6466 }
6467 
6468 /*----------------------------------------------------------------------------
6469 | Returns the remainder of the quadruple-precision floating-point value `a'
6470 | with respect to the corresponding value `b'.  The operation is performed
6471 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6472 *----------------------------------------------------------------------------*/
6473 
6474 float128 float128_rem(float128 a, float128 b, float_status *status)
6475 {
6476     flag aSign, zSign;
6477     int32_t aExp, bExp, expDiff;
6478     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6479     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6480     int64_t sigMean0;
6481 
6482     aSig1 = extractFloat128Frac1( a );
6483     aSig0 = extractFloat128Frac0( a );
6484     aExp = extractFloat128Exp( a );
6485     aSign = extractFloat128Sign( a );
6486     bSig1 = extractFloat128Frac1( b );
6487     bSig0 = extractFloat128Frac0( b );
6488     bExp = extractFloat128Exp( b );
6489     if ( aExp == 0x7FFF ) {
6490         if (    ( aSig0 | aSig1 )
6491              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6492             return propagateFloat128NaN(a, b, status);
6493         }
6494         goto invalid;
6495     }
6496     if ( bExp == 0x7FFF ) {
6497         if (bSig0 | bSig1) {
6498             return propagateFloat128NaN(a, b, status);
6499         }
6500         return a;
6501     }
6502     if ( bExp == 0 ) {
6503         if ( ( bSig0 | bSig1 ) == 0 ) {
6504  invalid:
6505             float_raise(float_flag_invalid, status);
6506             return float128_default_nan(status);
6507         }
6508         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6509     }
6510     if ( aExp == 0 ) {
6511         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6512         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6513     }
6514     expDiff = aExp - bExp;
6515     if ( expDiff < -1 ) return a;
6516     shortShift128Left(
6517         aSig0 | LIT64( 0x0001000000000000 ),
6518         aSig1,
6519         15 - ( expDiff < 0 ),
6520         &aSig0,
6521         &aSig1
6522     );
6523     shortShift128Left(
6524         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6525     q = le128( bSig0, bSig1, aSig0, aSig1 );
6526     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6527     expDiff -= 64;
6528     while ( 0 < expDiff ) {
6529         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6530         q = ( 4 < q ) ? q - 4 : 0;
6531         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6532         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6533         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6534         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6535         expDiff -= 61;
6536     }
6537     if ( -64 < expDiff ) {
6538         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6539         q = ( 4 < q ) ? q - 4 : 0;
6540         q >>= - expDiff;
6541         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6542         expDiff += 52;
6543         if ( expDiff < 0 ) {
6544             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6545         }
6546         else {
6547             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6548         }
6549         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6550         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6551     }
6552     else {
6553         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6554         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6555     }
6556     do {
6557         alternateASig0 = aSig0;
6558         alternateASig1 = aSig1;
6559         ++q;
6560         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6561     } while ( 0 <= (int64_t) aSig0 );
6562     add128(
6563         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6564     if (    ( sigMean0 < 0 )
6565          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6566         aSig0 = alternateASig0;
6567         aSig1 = alternateASig1;
6568     }
6569     zSign = ( (int64_t) aSig0 < 0 );
6570     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6571     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6572                                          status);
6573 }
6574 
6575 /*----------------------------------------------------------------------------
6576 | Returns the square root of the quadruple-precision floating-point value `a'.
6577 | The operation is performed according to the IEC/IEEE Standard for Binary
6578 | Floating-Point Arithmetic.
6579 *----------------------------------------------------------------------------*/
6580 
6581 float128 float128_sqrt(float128 a, float_status *status)
6582 {
6583     flag aSign;
6584     int32_t aExp, zExp;
6585     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6586     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6587 
6588     aSig1 = extractFloat128Frac1( a );
6589     aSig0 = extractFloat128Frac0( a );
6590     aExp = extractFloat128Exp( a );
6591     aSign = extractFloat128Sign( a );
6592     if ( aExp == 0x7FFF ) {
6593         if (aSig0 | aSig1) {
6594             return propagateFloat128NaN(a, a, status);
6595         }
6596         if ( ! aSign ) return a;
6597         goto invalid;
6598     }
6599     if ( aSign ) {
6600         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6601  invalid:
6602         float_raise(float_flag_invalid, status);
6603         return float128_default_nan(status);
6604     }
6605     if ( aExp == 0 ) {
6606         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6607         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6608     }
6609     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6610     aSig0 |= LIT64( 0x0001000000000000 );
6611     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6612     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6613     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6614     doubleZSig0 = zSig0<<1;
6615     mul64To128( zSig0, zSig0, &term0, &term1 );
6616     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6617     while ( (int64_t) rem0 < 0 ) {
6618         --zSig0;
6619         doubleZSig0 -= 2;
6620         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6621     }
6622     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6623     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6624         if ( zSig1 == 0 ) zSig1 = 1;
6625         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6626         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6627         mul64To128( zSig1, zSig1, &term2, &term3 );
6628         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6629         while ( (int64_t) rem1 < 0 ) {
6630             --zSig1;
6631             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6632             term3 |= 1;
6633             term2 |= doubleZSig0;
6634             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6635         }
6636         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6637     }
6638     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6639     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6640 
6641 }
6642 
6643 /*----------------------------------------------------------------------------
6644 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6645 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6646 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6647 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6648 *----------------------------------------------------------------------------*/
6649 
6650 int float128_eq(float128 a, float128 b, float_status *status)
6651 {
6652 
6653     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6654               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6655          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6656               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6657        ) {
6658         float_raise(float_flag_invalid, status);
6659         return 0;
6660     }
6661     return
6662            ( a.low == b.low )
6663         && (    ( a.high == b.high )
6664              || (    ( a.low == 0 )
6665                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6666            );
6667 
6668 }
6669 
6670 /*----------------------------------------------------------------------------
6671 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6672 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6673 | exception is raised if either operand is a NaN.  The comparison is performed
6674 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6675 *----------------------------------------------------------------------------*/
6676 
6677 int float128_le(float128 a, float128 b, float_status *status)
6678 {
6679     flag aSign, bSign;
6680 
6681     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6682               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6683          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6684               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6685        ) {
6686         float_raise(float_flag_invalid, status);
6687         return 0;
6688     }
6689     aSign = extractFloat128Sign( a );
6690     bSign = extractFloat128Sign( b );
6691     if ( aSign != bSign ) {
6692         return
6693                aSign
6694             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6695                  == 0 );
6696     }
6697     return
6698           aSign ? le128( b.high, b.low, a.high, a.low )
6699         : le128( a.high, a.low, b.high, b.low );
6700 
6701 }
6702 
6703 /*----------------------------------------------------------------------------
6704 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6705 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6706 | raised if either operand is a NaN.  The comparison is performed according
6707 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6708 *----------------------------------------------------------------------------*/
6709 
6710 int float128_lt(float128 a, float128 b, float_status *status)
6711 {
6712     flag aSign, bSign;
6713 
6714     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6715               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6716          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6717               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6718        ) {
6719         float_raise(float_flag_invalid, status);
6720         return 0;
6721     }
6722     aSign = extractFloat128Sign( a );
6723     bSign = extractFloat128Sign( b );
6724     if ( aSign != bSign ) {
6725         return
6726                aSign
6727             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6728                  != 0 );
6729     }
6730     return
6731           aSign ? lt128( b.high, b.low, a.high, a.low )
6732         : lt128( a.high, a.low, b.high, b.low );
6733 
6734 }
6735 
6736 /*----------------------------------------------------------------------------
6737 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6738 | be compared, and 0 otherwise.  The invalid exception is raised if either
6739 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6740 | Standard for Binary Floating-Point Arithmetic.
6741 *----------------------------------------------------------------------------*/
6742 
6743 int float128_unordered(float128 a, float128 b, float_status *status)
6744 {
6745     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6746               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6747          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6748               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6749        ) {
6750         float_raise(float_flag_invalid, status);
6751         return 1;
6752     }
6753     return 0;
6754 }
6755 
6756 /*----------------------------------------------------------------------------
6757 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6758 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6759 | exception.  The comparison is performed according to the IEC/IEEE Standard
6760 | for Binary Floating-Point Arithmetic.
6761 *----------------------------------------------------------------------------*/
6762 
6763 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6764 {
6765 
6766     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6767               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6768          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6769               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6770        ) {
6771         if (float128_is_signaling_nan(a, status)
6772          || float128_is_signaling_nan(b, status)) {
6773             float_raise(float_flag_invalid, status);
6774         }
6775         return 0;
6776     }
6777     return
6778            ( a.low == b.low )
6779         && (    ( a.high == b.high )
6780              || (    ( a.low == 0 )
6781                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6782            );
6783 
6784 }
6785 
6786 /*----------------------------------------------------------------------------
6787 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6788 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6789 | cause an exception.  Otherwise, the comparison is performed according to the
6790 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6791 *----------------------------------------------------------------------------*/
6792 
6793 int float128_le_quiet(float128 a, float128 b, float_status *status)
6794 {
6795     flag aSign, bSign;
6796 
6797     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6798               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6799          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6800               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6801        ) {
6802         if (float128_is_signaling_nan(a, status)
6803          || float128_is_signaling_nan(b, status)) {
6804             float_raise(float_flag_invalid, status);
6805         }
6806         return 0;
6807     }
6808     aSign = extractFloat128Sign( a );
6809     bSign = extractFloat128Sign( b );
6810     if ( aSign != bSign ) {
6811         return
6812                aSign
6813             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6814                  == 0 );
6815     }
6816     return
6817           aSign ? le128( b.high, b.low, a.high, a.low )
6818         : le128( a.high, a.low, b.high, b.low );
6819 
6820 }
6821 
6822 /*----------------------------------------------------------------------------
6823 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6824 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6825 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6826 | Standard for Binary Floating-Point Arithmetic.
6827 *----------------------------------------------------------------------------*/
6828 
6829 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6830 {
6831     flag aSign, bSign;
6832 
6833     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6834               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6835          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6836               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6837        ) {
6838         if (float128_is_signaling_nan(a, status)
6839          || float128_is_signaling_nan(b, status)) {
6840             float_raise(float_flag_invalid, status);
6841         }
6842         return 0;
6843     }
6844     aSign = extractFloat128Sign( a );
6845     bSign = extractFloat128Sign( b );
6846     if ( aSign != bSign ) {
6847         return
6848                aSign
6849             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6850                  != 0 );
6851     }
6852     return
6853           aSign ? lt128( b.high, b.low, a.high, a.low )
6854         : lt128( a.high, a.low, b.high, b.low );
6855 
6856 }
6857 
6858 /*----------------------------------------------------------------------------
6859 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6860 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6861 | comparison is performed according to the IEC/IEEE Standard for Binary
6862 | Floating-Point Arithmetic.
6863 *----------------------------------------------------------------------------*/
6864 
6865 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6866 {
6867     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6868               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6869          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6870               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6871        ) {
6872         if (float128_is_signaling_nan(a, status)
6873          || float128_is_signaling_nan(b, status)) {
6874             float_raise(float_flag_invalid, status);
6875         }
6876         return 1;
6877     }
6878     return 0;
6879 }
6880 
6881 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6882                                             int is_quiet, float_status *status)
6883 {
6884     flag aSign, bSign;
6885 
6886     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6887         float_raise(float_flag_invalid, status);
6888         return float_relation_unordered;
6889     }
6890     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6891           ( extractFloatx80Frac( a )<<1 ) ) ||
6892         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6893           ( extractFloatx80Frac( b )<<1 ) )) {
6894         if (!is_quiet ||
6895             floatx80_is_signaling_nan(a, status) ||
6896             floatx80_is_signaling_nan(b, status)) {
6897             float_raise(float_flag_invalid, status);
6898         }
6899         return float_relation_unordered;
6900     }
6901     aSign = extractFloatx80Sign( a );
6902     bSign = extractFloatx80Sign( b );
6903     if ( aSign != bSign ) {
6904 
6905         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6906              ( ( a.low | b.low ) == 0 ) ) {
6907             /* zero case */
6908             return float_relation_equal;
6909         } else {
6910             return 1 - (2 * aSign);
6911         }
6912     } else {
6913         if (a.low == b.low && a.high == b.high) {
6914             return float_relation_equal;
6915         } else {
6916             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6917         }
6918     }
6919 }
6920 
6921 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6922 {
6923     return floatx80_compare_internal(a, b, 0, status);
6924 }
6925 
6926 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6927 {
6928     return floatx80_compare_internal(a, b, 1, status);
6929 }
6930 
6931 static inline int float128_compare_internal(float128 a, float128 b,
6932                                             int is_quiet, float_status *status)
6933 {
6934     flag aSign, bSign;
6935 
6936     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6937           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6938         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6939           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6940         if (!is_quiet ||
6941             float128_is_signaling_nan(a, status) ||
6942             float128_is_signaling_nan(b, status)) {
6943             float_raise(float_flag_invalid, status);
6944         }
6945         return float_relation_unordered;
6946     }
6947     aSign = extractFloat128Sign( a );
6948     bSign = extractFloat128Sign( b );
6949     if ( aSign != bSign ) {
6950         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6951             /* zero case */
6952             return float_relation_equal;
6953         } else {
6954             return 1 - (2 * aSign);
6955         }
6956     } else {
6957         if (a.low == b.low && a.high == b.high) {
6958             return float_relation_equal;
6959         } else {
6960             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6961         }
6962     }
6963 }
6964 
6965 int float128_compare(float128 a, float128 b, float_status *status)
6966 {
6967     return float128_compare_internal(a, b, 0, status);
6968 }
6969 
6970 int float128_compare_quiet(float128 a, float128 b, float_status *status)
6971 {
6972     return float128_compare_internal(a, b, 1, status);
6973 }
6974 
6975 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
6976 {
6977     flag aSign;
6978     int32_t aExp;
6979     uint64_t aSig;
6980 
6981     if (floatx80_invalid_encoding(a)) {
6982         float_raise(float_flag_invalid, status);
6983         return floatx80_default_nan(status);
6984     }
6985     aSig = extractFloatx80Frac( a );
6986     aExp = extractFloatx80Exp( a );
6987     aSign = extractFloatx80Sign( a );
6988 
6989     if ( aExp == 0x7FFF ) {
6990         if ( aSig<<1 ) {
6991             return propagateFloatx80NaN(a, a, status);
6992         }
6993         return a;
6994     }
6995 
6996     if (aExp == 0) {
6997         if (aSig == 0) {
6998             return a;
6999         }
7000         aExp++;
7001     }
7002 
7003     if (n > 0x10000) {
7004         n = 0x10000;
7005     } else if (n < -0x10000) {
7006         n = -0x10000;
7007     }
7008 
7009     aExp += n;
7010     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7011                                          aSign, aExp, aSig, 0, status);
7012 }
7013 
7014 float128 float128_scalbn(float128 a, int n, float_status *status)
7015 {
7016     flag aSign;
7017     int32_t aExp;
7018     uint64_t aSig0, aSig1;
7019 
7020     aSig1 = extractFloat128Frac1( a );
7021     aSig0 = extractFloat128Frac0( a );
7022     aExp = extractFloat128Exp( a );
7023     aSign = extractFloat128Sign( a );
7024     if ( aExp == 0x7FFF ) {
7025         if ( aSig0 | aSig1 ) {
7026             return propagateFloat128NaN(a, a, status);
7027         }
7028         return a;
7029     }
7030     if (aExp != 0) {
7031         aSig0 |= LIT64( 0x0001000000000000 );
7032     } else if (aSig0 == 0 && aSig1 == 0) {
7033         return a;
7034     } else {
7035         aExp++;
7036     }
7037 
7038     if (n > 0x10000) {
7039         n = 0x10000;
7040     } else if (n < -0x10000) {
7041         n = -0x10000;
7042     }
7043 
7044     aExp += n - 1;
7045     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7046                                          , status);
7047 
7048 }
7049