xref: /qemu/fpu/softfloat.c (revision 0c4c90929143a530730e2879204a55a30bf63758)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
88 
89 /* We only need stdlib for abort() */
90 
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations.  (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "softfloat-macros.h"
97 
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine:  (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output.  These details are target-
104 | specific.
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
107 
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
111 
112 static inline uint32_t extractFloat16Frac(float16 a)
113 {
114     return float16_val(a) & 0x3ff;
115 }
116 
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
120 
121 static inline int extractFloat16Exp(float16 a)
122 {
123     return (float16_val(a) >> 10) & 0x1f;
124 }
125 
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
129 
130 static inline flag extractFloat16Sign(float16 a)
131 {
132     return float16_val(a)>>15;
133 }
134 
135 /*----------------------------------------------------------------------------
136 | Returns the fraction bits of the single-precision floating-point value `a'.
137 *----------------------------------------------------------------------------*/
138 
139 static inline uint32_t extractFloat32Frac(float32 a)
140 {
141     return float32_val(a) & 0x007FFFFF;
142 }
143 
144 /*----------------------------------------------------------------------------
145 | Returns the exponent bits of the single-precision floating-point value `a'.
146 *----------------------------------------------------------------------------*/
147 
148 static inline int extractFloat32Exp(float32 a)
149 {
150     return (float32_val(a) >> 23) & 0xFF;
151 }
152 
153 /*----------------------------------------------------------------------------
154 | Returns the sign bit of the single-precision floating-point value `a'.
155 *----------------------------------------------------------------------------*/
156 
157 static inline flag extractFloat32Sign(float32 a)
158 {
159     return float32_val(a) >> 31;
160 }
161 
162 /*----------------------------------------------------------------------------
163 | Returns the fraction bits of the double-precision floating-point value `a'.
164 *----------------------------------------------------------------------------*/
165 
166 static inline uint64_t extractFloat64Frac(float64 a)
167 {
168     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
169 }
170 
171 /*----------------------------------------------------------------------------
172 | Returns the exponent bits of the double-precision floating-point value `a'.
173 *----------------------------------------------------------------------------*/
174 
175 static inline int extractFloat64Exp(float64 a)
176 {
177     return (float64_val(a) >> 52) & 0x7FF;
178 }
179 
180 /*----------------------------------------------------------------------------
181 | Returns the sign bit of the double-precision floating-point value `a'.
182 *----------------------------------------------------------------------------*/
183 
184 static inline flag extractFloat64Sign(float64 a)
185 {
186     return float64_val(a) >> 63;
187 }
188 
189 /*
190  * Classify a floating point number. Everything above float_class_qnan
191  * is a NaN so cls >= float_class_qnan is any NaN.
192  */
193 
194 typedef enum __attribute__ ((__packed__)) {
195     float_class_unclassified,
196     float_class_zero,
197     float_class_normal,
198     float_class_inf,
199     float_class_qnan,  /* all NaNs from here */
200     float_class_snan,
201     float_class_dnan,
202     float_class_msnan, /* maybe silenced */
203 } FloatClass;
204 
205 /*
206  * Structure holding all of the decomposed parts of a float. The
207  * exponent is unbiased and the fraction is normalized. All
208  * calculations are done with a 64 bit fraction and then rounded as
209  * appropriate for the final format.
210  *
211  * Thanks to the packed FloatClass a decent compiler should be able to
212  * fit the whole structure into registers and avoid using the stack
213  * for parameter passing.
214  */
215 
216 typedef struct {
217     uint64_t frac;
218     int32_t  exp;
219     FloatClass cls;
220     bool sign;
221 } FloatParts;
222 
223 #define DECOMPOSED_BINARY_POINT    (64 - 2)
224 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
225 #define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
226 
227 /* Structure holding all of the relevant parameters for a format.
228  *   exp_size: the size of the exponent field
229  *   exp_bias: the offset applied to the exponent field
230  *   exp_max: the maximum normalised exponent
231  *   frac_size: the size of the fraction field
232  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
233  * The following are computed based the size of fraction
234  *   frac_lsb: least significant bit of fraction
235  *   fram_lsbm1: the bit bellow the least significant bit (for rounding)
236  *   round_mask/roundeven_mask: masks used for rounding
237  */
238 typedef struct {
239     int exp_size;
240     int exp_bias;
241     int exp_max;
242     int frac_size;
243     int frac_shift;
244     uint64_t frac_lsb;
245     uint64_t frac_lsbm1;
246     uint64_t round_mask;
247     uint64_t roundeven_mask;
248 } FloatFmt;
249 
250 /* Expand fields based on the size of exponent and fraction */
251 #define FLOAT_PARAMS(E, F)                                           \
252     .exp_size       = E,                                             \
253     .exp_bias       = ((1 << E) - 1) >> 1,                           \
254     .exp_max        = (1 << E) - 1,                                  \
255     .frac_size      = F,                                             \
256     .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
257     .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
258     .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
259     .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
260     .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
261 
262 static const FloatFmt float16_params = {
263     FLOAT_PARAMS(5, 10)
264 };
265 
266 static const FloatFmt float32_params = {
267     FLOAT_PARAMS(8, 23)
268 };
269 
270 static const FloatFmt float64_params = {
271     FLOAT_PARAMS(11, 52)
272 };
273 
274 /* Unpack a float to parts, but do not canonicalize.  */
275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
276 {
277     const int sign_pos = fmt.frac_size + fmt.exp_size;
278 
279     return (FloatParts) {
280         .cls = float_class_unclassified,
281         .sign = extract64(raw, sign_pos, 1),
282         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
283         .frac = extract64(raw, 0, fmt.frac_size),
284     };
285 }
286 
287 static inline FloatParts float16_unpack_raw(float16 f)
288 {
289     return unpack_raw(float16_params, f);
290 }
291 
292 static inline FloatParts float32_unpack_raw(float32 f)
293 {
294     return unpack_raw(float32_params, f);
295 }
296 
297 static inline FloatParts float64_unpack_raw(float64 f)
298 {
299     return unpack_raw(float64_params, f);
300 }
301 
302 /* Pack a float from parts, but do not canonicalize.  */
303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
304 {
305     const int sign_pos = fmt.frac_size + fmt.exp_size;
306     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
307     return deposit64(ret, sign_pos, 1, p.sign);
308 }
309 
310 static inline float16 float16_pack_raw(FloatParts p)
311 {
312     return make_float16(pack_raw(float16_params, p));
313 }
314 
315 static inline float32 float32_pack_raw(FloatParts p)
316 {
317     return make_float32(pack_raw(float32_params, p));
318 }
319 
320 static inline float64 float64_pack_raw(FloatParts p)
321 {
322     return make_float64(pack_raw(float64_params, p));
323 }
324 
325 /* Canonicalize EXP and FRAC, setting CLS.  */
326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
327                                float_status *status)
328 {
329     if (part.exp == parm->exp_max) {
330         if (part.frac == 0) {
331             part.cls = float_class_inf;
332         } else {
333 #ifdef NO_SIGNALING_NANS
334             part.cls = float_class_qnan;
335 #else
336             int64_t msb = part.frac << (parm->frac_shift + 2);
337             if ((msb < 0) == status->snan_bit_is_one) {
338                 part.cls = float_class_snan;
339             } else {
340                 part.cls = float_class_qnan;
341             }
342 #endif
343         }
344     } else if (part.exp == 0) {
345         if (likely(part.frac == 0)) {
346             part.cls = float_class_zero;
347         } else if (status->flush_inputs_to_zero) {
348             float_raise(float_flag_input_denormal, status);
349             part.cls = float_class_zero;
350             part.frac = 0;
351         } else {
352             int shift = clz64(part.frac) - 1;
353             part.cls = float_class_normal;
354             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
355             part.frac <<= shift;
356         }
357     } else {
358         part.cls = float_class_normal;
359         part.exp -= parm->exp_bias;
360         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
361     }
362     return part;
363 }
364 
365 /* Round and uncanonicalize a floating-point number by parts. There
366  * are FRAC_SHIFT bits that may require rounding at the bottom of the
367  * fraction; these bits will be removed. The exponent will be biased
368  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
369  */
370 
371 static FloatParts round_canonical(FloatParts p, float_status *s,
372                                   const FloatFmt *parm)
373 {
374     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
375     const uint64_t round_mask = parm->round_mask;
376     const uint64_t roundeven_mask = parm->roundeven_mask;
377     const int exp_max = parm->exp_max;
378     const int frac_shift = parm->frac_shift;
379     uint64_t frac, inc;
380     int exp, flags = 0;
381     bool overflow_norm;
382 
383     frac = p.frac;
384     exp = p.exp;
385 
386     switch (p.cls) {
387     case float_class_normal:
388         switch (s->float_rounding_mode) {
389         case float_round_nearest_even:
390             overflow_norm = false;
391             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
392             break;
393         case float_round_ties_away:
394             overflow_norm = false;
395             inc = frac_lsbm1;
396             break;
397         case float_round_to_zero:
398             overflow_norm = true;
399             inc = 0;
400             break;
401         case float_round_up:
402             inc = p.sign ? 0 : round_mask;
403             overflow_norm = p.sign;
404             break;
405         case float_round_down:
406             inc = p.sign ? round_mask : 0;
407             overflow_norm = !p.sign;
408             break;
409         default:
410             g_assert_not_reached();
411         }
412 
413         exp += parm->exp_bias;
414         if (likely(exp > 0)) {
415             if (frac & round_mask) {
416                 flags |= float_flag_inexact;
417                 frac += inc;
418                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
419                     frac >>= 1;
420                     exp++;
421                 }
422             }
423             frac >>= frac_shift;
424 
425             if (unlikely(exp >= exp_max)) {
426                 flags |= float_flag_overflow | float_flag_inexact;
427                 if (overflow_norm) {
428                     exp = exp_max - 1;
429                     frac = -1;
430                 } else {
431                     p.cls = float_class_inf;
432                     goto do_inf;
433                 }
434             }
435         } else if (s->flush_to_zero) {
436             flags |= float_flag_output_denormal;
437             p.cls = float_class_zero;
438             goto do_zero;
439         } else {
440             bool is_tiny = (s->float_detect_tininess
441                             == float_tininess_before_rounding)
442                         || (exp < 0)
443                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
444 
445             shift64RightJamming(frac, 1 - exp, &frac);
446             if (frac & round_mask) {
447                 /* Need to recompute round-to-even.  */
448                 if (s->float_rounding_mode == float_round_nearest_even) {
449                     inc = ((frac & roundeven_mask) != frac_lsbm1
450                            ? frac_lsbm1 : 0);
451                 }
452                 flags |= float_flag_inexact;
453                 frac += inc;
454             }
455 
456             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
457             frac >>= frac_shift;
458 
459             if (is_tiny && (flags & float_flag_inexact)) {
460                 flags |= float_flag_underflow;
461             }
462             if (exp == 0 && frac == 0) {
463                 p.cls = float_class_zero;
464             }
465         }
466         break;
467 
468     case float_class_zero:
469     do_zero:
470         exp = 0;
471         frac = 0;
472         break;
473 
474     case float_class_inf:
475     do_inf:
476         exp = exp_max;
477         frac = 0;
478         break;
479 
480     case float_class_qnan:
481     case float_class_snan:
482         exp = exp_max;
483         break;
484 
485     default:
486         g_assert_not_reached();
487     }
488 
489     float_raise(flags, s);
490     p.exp = exp;
491     p.frac = frac;
492     return p;
493 }
494 
495 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
496 {
497     return canonicalize(float16_unpack_raw(f), &float16_params, s);
498 }
499 
500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
501 {
502     switch (p.cls) {
503     case float_class_dnan:
504         return float16_default_nan(s);
505     case float_class_msnan:
506         return float16_maybe_silence_nan(float16_pack_raw(p), s);
507     default:
508         p = round_canonical(p, s, &float16_params);
509         return float16_pack_raw(p);
510     }
511 }
512 
513 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
514 {
515     return canonicalize(float32_unpack_raw(f), &float32_params, s);
516 }
517 
518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
519 {
520     switch (p.cls) {
521     case float_class_dnan:
522         return float32_default_nan(s);
523     case float_class_msnan:
524         return float32_maybe_silence_nan(float32_pack_raw(p), s);
525     default:
526         p = round_canonical(p, s, &float32_params);
527         return float32_pack_raw(p);
528     }
529 }
530 
531 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
532 {
533     return canonicalize(float64_unpack_raw(f), &float64_params, s);
534 }
535 
536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
537 {
538     switch (p.cls) {
539     case float_class_dnan:
540         return float64_default_nan(s);
541     case float_class_msnan:
542         return float64_maybe_silence_nan(float64_pack_raw(p), s);
543     default:
544         p = round_canonical(p, s, &float64_params);
545         return float64_pack_raw(p);
546     }
547 }
548 
549 /* Simple helpers for checking if what NaN we have */
550 static bool is_nan(FloatClass c)
551 {
552     return unlikely(c >= float_class_qnan);
553 }
554 static bool is_snan(FloatClass c)
555 {
556     return c == float_class_snan;
557 }
558 static bool is_qnan(FloatClass c)
559 {
560     return c == float_class_qnan;
561 }
562 
563 static FloatParts return_nan(FloatParts a, float_status *s)
564 {
565     switch (a.cls) {
566     case float_class_snan:
567         s->float_exception_flags |= float_flag_invalid;
568         a.cls = float_class_msnan;
569         /* fall through */
570     case float_class_qnan:
571         if (s->default_nan_mode) {
572             a.cls = float_class_dnan;
573         }
574         break;
575 
576     default:
577         g_assert_not_reached();
578     }
579     return a;
580 }
581 
582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
583 {
584     if (is_snan(a.cls) || is_snan(b.cls)) {
585         s->float_exception_flags |= float_flag_invalid;
586     }
587 
588     if (s->default_nan_mode) {
589         a.cls = float_class_dnan;
590     } else {
591         if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
592                     is_qnan(b.cls), is_snan(b.cls),
593                     a.frac > b.frac ||
594                     (a.frac == b.frac && a.sign < b.sign))) {
595             a = b;
596         }
597         a.cls = float_class_msnan;
598     }
599     return a;
600 }
601 
602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
603                                   bool inf_zero, float_status *s)
604 {
605     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
606         s->float_exception_flags |= float_flag_invalid;
607     }
608 
609     if (s->default_nan_mode) {
610         a.cls = float_class_dnan;
611     } else {
612         switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
613                               is_qnan(b.cls), is_snan(b.cls),
614                               is_qnan(c.cls), is_snan(c.cls),
615                               inf_zero, s)) {
616         case 0:
617             break;
618         case 1:
619             a = b;
620             break;
621         case 2:
622             a = c;
623             break;
624         case 3:
625             a.cls = float_class_dnan;
626             return a;
627         default:
628             g_assert_not_reached();
629         }
630 
631         a.cls = float_class_msnan;
632     }
633     return a;
634 }
635 
636 /*
637  * Returns the result of adding or subtracting the values of the
638  * floating-point values `a' and `b'. The operation is performed
639  * according to the IEC/IEEE Standard for Binary Floating-Point
640  * Arithmetic.
641  */
642 
643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
644                                 float_status *s)
645 {
646     bool a_sign = a.sign;
647     bool b_sign = b.sign ^ subtract;
648 
649     if (a_sign != b_sign) {
650         /* Subtraction */
651 
652         if (a.cls == float_class_normal && b.cls == float_class_normal) {
653             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
654                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
655                 a.frac = a.frac - b.frac;
656             } else {
657                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
658                 a.frac = b.frac - a.frac;
659                 a.exp = b.exp;
660                 a_sign ^= 1;
661             }
662 
663             if (a.frac == 0) {
664                 a.cls = float_class_zero;
665                 a.sign = s->float_rounding_mode == float_round_down;
666             } else {
667                 int shift = clz64(a.frac) - 1;
668                 a.frac = a.frac << shift;
669                 a.exp = a.exp - shift;
670                 a.sign = a_sign;
671             }
672             return a;
673         }
674         if (is_nan(a.cls) || is_nan(b.cls)) {
675             return pick_nan(a, b, s);
676         }
677         if (a.cls == float_class_inf) {
678             if (b.cls == float_class_inf) {
679                 float_raise(float_flag_invalid, s);
680                 a.cls = float_class_dnan;
681             }
682             return a;
683         }
684         if (a.cls == float_class_zero && b.cls == float_class_zero) {
685             a.sign = s->float_rounding_mode == float_round_down;
686             return a;
687         }
688         if (a.cls == float_class_zero || b.cls == float_class_inf) {
689             b.sign = a_sign ^ 1;
690             return b;
691         }
692         if (b.cls == float_class_zero) {
693             return a;
694         }
695     } else {
696         /* Addition */
697         if (a.cls == float_class_normal && b.cls == float_class_normal) {
698             if (a.exp > b.exp) {
699                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
700             } else if (a.exp < b.exp) {
701                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
702                 a.exp = b.exp;
703             }
704             a.frac += b.frac;
705             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
706                 a.frac >>= 1;
707                 a.exp += 1;
708             }
709             return a;
710         }
711         if (is_nan(a.cls) || is_nan(b.cls)) {
712             return pick_nan(a, b, s);
713         }
714         if (a.cls == float_class_inf || b.cls == float_class_zero) {
715             return a;
716         }
717         if (b.cls == float_class_inf || a.cls == float_class_zero) {
718             b.sign = b_sign;
719             return b;
720         }
721     }
722     g_assert_not_reached();
723 }
724 
725 /*
726  * Returns the result of adding or subtracting the floating-point
727  * values `a' and `b'. The operation is performed according to the
728  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
729  */
730 
731 float16  __attribute__((flatten)) float16_add(float16 a, float16 b,
732                                               float_status *status)
733 {
734     FloatParts pa = float16_unpack_canonical(a, status);
735     FloatParts pb = float16_unpack_canonical(b, status);
736     FloatParts pr = addsub_floats(pa, pb, false, status);
737 
738     return float16_round_pack_canonical(pr, status);
739 }
740 
741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
742                                              float_status *status)
743 {
744     FloatParts pa = float32_unpack_canonical(a, status);
745     FloatParts pb = float32_unpack_canonical(b, status);
746     FloatParts pr = addsub_floats(pa, pb, false, status);
747 
748     return float32_round_pack_canonical(pr, status);
749 }
750 
751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
752                                              float_status *status)
753 {
754     FloatParts pa = float64_unpack_canonical(a, status);
755     FloatParts pb = float64_unpack_canonical(b, status);
756     FloatParts pr = addsub_floats(pa, pb, false, status);
757 
758     return float64_round_pack_canonical(pr, status);
759 }
760 
761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
762                                              float_status *status)
763 {
764     FloatParts pa = float16_unpack_canonical(a, status);
765     FloatParts pb = float16_unpack_canonical(b, status);
766     FloatParts pr = addsub_floats(pa, pb, true, status);
767 
768     return float16_round_pack_canonical(pr, status);
769 }
770 
771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
772                                              float_status *status)
773 {
774     FloatParts pa = float32_unpack_canonical(a, status);
775     FloatParts pb = float32_unpack_canonical(b, status);
776     FloatParts pr = addsub_floats(pa, pb, true, status);
777 
778     return float32_round_pack_canonical(pr, status);
779 }
780 
781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
782                                              float_status *status)
783 {
784     FloatParts pa = float64_unpack_canonical(a, status);
785     FloatParts pb = float64_unpack_canonical(b, status);
786     FloatParts pr = addsub_floats(pa, pb, true, status);
787 
788     return float64_round_pack_canonical(pr, status);
789 }
790 
791 /*
792  * Returns the result of multiplying the floating-point values `a' and
793  * `b'. The operation is performed according to the IEC/IEEE Standard
794  * for Binary Floating-Point Arithmetic.
795  */
796 
797 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
798 {
799     bool sign = a.sign ^ b.sign;
800 
801     if (a.cls == float_class_normal && b.cls == float_class_normal) {
802         uint64_t hi, lo;
803         int exp = a.exp + b.exp;
804 
805         mul64To128(a.frac, b.frac, &hi, &lo);
806         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
807         if (lo & DECOMPOSED_OVERFLOW_BIT) {
808             shift64RightJamming(lo, 1, &lo);
809             exp += 1;
810         }
811 
812         /* Re-use a */
813         a.exp = exp;
814         a.sign = sign;
815         a.frac = lo;
816         return a;
817     }
818     /* handle all the NaN cases */
819     if (is_nan(a.cls) || is_nan(b.cls)) {
820         return pick_nan(a, b, s);
821     }
822     /* Inf * Zero == NaN */
823     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
824         (a.cls == float_class_zero && b.cls == float_class_inf)) {
825         s->float_exception_flags |= float_flag_invalid;
826         a.cls = float_class_dnan;
827         a.sign = sign;
828         return a;
829     }
830     /* Multiply by 0 or Inf */
831     if (a.cls == float_class_inf || a.cls == float_class_zero) {
832         a.sign = sign;
833         return a;
834     }
835     if (b.cls == float_class_inf || b.cls == float_class_zero) {
836         b.sign = sign;
837         return b;
838     }
839     g_assert_not_reached();
840 }
841 
842 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
843                                              float_status *status)
844 {
845     FloatParts pa = float16_unpack_canonical(a, status);
846     FloatParts pb = float16_unpack_canonical(b, status);
847     FloatParts pr = mul_floats(pa, pb, status);
848 
849     return float16_round_pack_canonical(pr, status);
850 }
851 
852 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
853                                              float_status *status)
854 {
855     FloatParts pa = float32_unpack_canonical(a, status);
856     FloatParts pb = float32_unpack_canonical(b, status);
857     FloatParts pr = mul_floats(pa, pb, status);
858 
859     return float32_round_pack_canonical(pr, status);
860 }
861 
862 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
863                                              float_status *status)
864 {
865     FloatParts pa = float64_unpack_canonical(a, status);
866     FloatParts pb = float64_unpack_canonical(b, status);
867     FloatParts pr = mul_floats(pa, pb, status);
868 
869     return float64_round_pack_canonical(pr, status);
870 }
871 
872 /*
873  * Returns the result of multiplying the floating-point values `a' and
874  * `b' then adding 'c', with no intermediate rounding step after the
875  * multiplication. The operation is performed according to the
876  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
877  * The flags argument allows the caller to select negation of the
878  * addend, the intermediate product, or the final result. (The
879  * difference between this and having the caller do a separate
880  * negation is that negating externally will flip the sign bit on
881  * NaNs.)
882  */
883 
884 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
885                                 int flags, float_status *s)
886 {
887     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
888                     ((1 << float_class_inf) | (1 << float_class_zero));
889     bool p_sign;
890     bool sign_flip = flags & float_muladd_negate_result;
891     FloatClass p_class;
892     uint64_t hi, lo;
893     int p_exp;
894 
895     /* It is implementation-defined whether the cases of (0,inf,qnan)
896      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
897      * they return if they do), so we have to hand this information
898      * off to the target-specific pick-a-NaN routine.
899      */
900     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
901         return pick_nan_muladd(a, b, c, inf_zero, s);
902     }
903 
904     if (inf_zero) {
905         s->float_exception_flags |= float_flag_invalid;
906         a.cls = float_class_dnan;
907         return a;
908     }
909 
910     if (flags & float_muladd_negate_c) {
911         c.sign ^= 1;
912     }
913 
914     p_sign = a.sign ^ b.sign;
915 
916     if (flags & float_muladd_negate_product) {
917         p_sign ^= 1;
918     }
919 
920     if (a.cls == float_class_inf || b.cls == float_class_inf) {
921         p_class = float_class_inf;
922     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
923         p_class = float_class_zero;
924     } else {
925         p_class = float_class_normal;
926     }
927 
928     if (c.cls == float_class_inf) {
929         if (p_class == float_class_inf && p_sign != c.sign) {
930             s->float_exception_flags |= float_flag_invalid;
931             a.cls = float_class_dnan;
932         } else {
933             a.cls = float_class_inf;
934             a.sign = c.sign ^ sign_flip;
935         }
936         return a;
937     }
938 
939     if (p_class == float_class_inf) {
940         a.cls = float_class_inf;
941         a.sign = p_sign ^ sign_flip;
942         return a;
943     }
944 
945     if (p_class == float_class_zero) {
946         if (c.cls == float_class_zero) {
947             if (p_sign != c.sign) {
948                 p_sign = s->float_rounding_mode == float_round_down;
949             }
950             c.sign = p_sign;
951         } else if (flags & float_muladd_halve_result) {
952             c.exp -= 1;
953         }
954         c.sign ^= sign_flip;
955         return c;
956     }
957 
958     /* a & b should be normals now... */
959     assert(a.cls == float_class_normal &&
960            b.cls == float_class_normal);
961 
962     p_exp = a.exp + b.exp;
963 
964     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
965      * result.
966      */
967     mul64To128(a.frac, b.frac, &hi, &lo);
968     /* binary point now at bit 124 */
969 
970     /* check for overflow */
971     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
972         shift128RightJamming(hi, lo, 1, &hi, &lo);
973         p_exp += 1;
974     }
975 
976     /* + add/sub */
977     if (c.cls == float_class_zero) {
978         /* move binary point back to 62 */
979         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
980     } else {
981         int exp_diff = p_exp - c.exp;
982         if (p_sign == c.sign) {
983             /* Addition */
984             if (exp_diff <= 0) {
985                 shift128RightJamming(hi, lo,
986                                      DECOMPOSED_BINARY_POINT - exp_diff,
987                                      &hi, &lo);
988                 lo += c.frac;
989                 p_exp = c.exp;
990             } else {
991                 uint64_t c_hi, c_lo;
992                 /* shift c to the same binary point as the product (124) */
993                 c_hi = c.frac >> 2;
994                 c_lo = 0;
995                 shift128RightJamming(c_hi, c_lo,
996                                      exp_diff,
997                                      &c_hi, &c_lo);
998                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
999                 /* move binary point back to 62 */
1000                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1001             }
1002 
1003             if (lo & DECOMPOSED_OVERFLOW_BIT) {
1004                 shift64RightJamming(lo, 1, &lo);
1005                 p_exp += 1;
1006             }
1007 
1008         } else {
1009             /* Subtraction */
1010             uint64_t c_hi, c_lo;
1011             /* make C binary point match product at bit 124 */
1012             c_hi = c.frac >> 2;
1013             c_lo = 0;
1014 
1015             if (exp_diff <= 0) {
1016                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1017                 if (exp_diff == 0
1018                     &&
1019                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1020                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1021                 } else {
1022                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1023                     p_sign ^= 1;
1024                     p_exp = c.exp;
1025                 }
1026             } else {
1027                 shift128RightJamming(c_hi, c_lo,
1028                                      exp_diff,
1029                                      &c_hi, &c_lo);
1030                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1031             }
1032 
1033             if (hi == 0 && lo == 0) {
1034                 a.cls = float_class_zero;
1035                 a.sign = s->float_rounding_mode == float_round_down;
1036                 a.sign ^= sign_flip;
1037                 return a;
1038             } else {
1039                 int shift;
1040                 if (hi != 0) {
1041                     shift = clz64(hi);
1042                 } else {
1043                     shift = clz64(lo) + 64;
1044                 }
1045                 /* Normalizing to a binary point of 124 is the
1046                    correct adjust for the exponent.  However since we're
1047                    shifting, we might as well put the binary point back
1048                    at 62 where we really want it.  Therefore shift as
1049                    if we're leaving 1 bit at the top of the word, but
1050                    adjust the exponent as if we're leaving 3 bits.  */
1051                 shift -= 1;
1052                 if (shift >= 64) {
1053                     lo = lo << (shift - 64);
1054                 } else {
1055                     hi = (hi << shift) | (lo >> (64 - shift));
1056                     lo = hi | ((lo << shift) != 0);
1057                 }
1058                 p_exp -= shift - 2;
1059             }
1060         }
1061     }
1062 
1063     if (flags & float_muladd_halve_result) {
1064         p_exp -= 1;
1065     }
1066 
1067     /* finally prepare our result */
1068     a.cls = float_class_normal;
1069     a.sign = p_sign ^ sign_flip;
1070     a.exp = p_exp;
1071     a.frac = lo;
1072 
1073     return a;
1074 }
1075 
1076 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1077                                                 int flags, float_status *status)
1078 {
1079     FloatParts pa = float16_unpack_canonical(a, status);
1080     FloatParts pb = float16_unpack_canonical(b, status);
1081     FloatParts pc = float16_unpack_canonical(c, status);
1082     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1083 
1084     return float16_round_pack_canonical(pr, status);
1085 }
1086 
1087 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1088                                                 int flags, float_status *status)
1089 {
1090     FloatParts pa = float32_unpack_canonical(a, status);
1091     FloatParts pb = float32_unpack_canonical(b, status);
1092     FloatParts pc = float32_unpack_canonical(c, status);
1093     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1094 
1095     return float32_round_pack_canonical(pr, status);
1096 }
1097 
1098 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1099                                                 int flags, float_status *status)
1100 {
1101     FloatParts pa = float64_unpack_canonical(a, status);
1102     FloatParts pb = float64_unpack_canonical(b, status);
1103     FloatParts pc = float64_unpack_canonical(c, status);
1104     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1105 
1106     return float64_round_pack_canonical(pr, status);
1107 }
1108 
1109 /*
1110  * Returns the result of dividing the floating-point value `a' by the
1111  * corresponding value `b'. The operation is performed according to
1112  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1113  */
1114 
1115 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1116 {
1117     bool sign = a.sign ^ b.sign;
1118 
1119     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1120         uint64_t temp_lo, temp_hi;
1121         int exp = a.exp - b.exp;
1122         if (a.frac < b.frac) {
1123             exp -= 1;
1124             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1125                               &temp_hi, &temp_lo);
1126         } else {
1127             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1128                               &temp_hi, &temp_lo);
1129         }
1130         /* LSB of quot is set if inexact which roundandpack will use
1131          * to set flags. Yet again we re-use a for the result */
1132         a.frac = div128To64(temp_lo, temp_hi, b.frac);
1133         a.sign = sign;
1134         a.exp = exp;
1135         return a;
1136     }
1137     /* handle all the NaN cases */
1138     if (is_nan(a.cls) || is_nan(b.cls)) {
1139         return pick_nan(a, b, s);
1140     }
1141     /* 0/0 or Inf/Inf */
1142     if (a.cls == b.cls
1143         &&
1144         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1145         s->float_exception_flags |= float_flag_invalid;
1146         a.cls = float_class_dnan;
1147         return a;
1148     }
1149     /* Div 0 => Inf */
1150     if (b.cls == float_class_zero) {
1151         s->float_exception_flags |= float_flag_divbyzero;
1152         a.cls = float_class_inf;
1153         a.sign = sign;
1154         return a;
1155     }
1156     /* Inf / x or 0 / x */
1157     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1158         a.sign = sign;
1159         return a;
1160     }
1161     /* Div by Inf */
1162     if (b.cls == float_class_inf) {
1163         a.cls = float_class_zero;
1164         a.sign = sign;
1165         return a;
1166     }
1167     g_assert_not_reached();
1168 }
1169 
1170 float16 float16_div(float16 a, float16 b, float_status *status)
1171 {
1172     FloatParts pa = float16_unpack_canonical(a, status);
1173     FloatParts pb = float16_unpack_canonical(b, status);
1174     FloatParts pr = div_floats(pa, pb, status);
1175 
1176     return float16_round_pack_canonical(pr, status);
1177 }
1178 
1179 float32 float32_div(float32 a, float32 b, float_status *status)
1180 {
1181     FloatParts pa = float32_unpack_canonical(a, status);
1182     FloatParts pb = float32_unpack_canonical(b, status);
1183     FloatParts pr = div_floats(pa, pb, status);
1184 
1185     return float32_round_pack_canonical(pr, status);
1186 }
1187 
1188 float64 float64_div(float64 a, float64 b, float_status *status)
1189 {
1190     FloatParts pa = float64_unpack_canonical(a, status);
1191     FloatParts pb = float64_unpack_canonical(b, status);
1192     FloatParts pr = div_floats(pa, pb, status);
1193 
1194     return float64_round_pack_canonical(pr, status);
1195 }
1196 
1197 /*
1198  * Rounds the floating-point value `a' to an integer, and returns the
1199  * result as a floating-point value. The operation is performed
1200  * according to the IEC/IEEE Standard for Binary Floating-Point
1201  * Arithmetic.
1202  */
1203 
1204 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1205 {
1206     if (is_nan(a.cls)) {
1207         return return_nan(a, s);
1208     }
1209 
1210     switch (a.cls) {
1211     case float_class_zero:
1212     case float_class_inf:
1213     case float_class_qnan:
1214         /* already "integral" */
1215         break;
1216     case float_class_normal:
1217         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1218             /* already integral */
1219             break;
1220         }
1221         if (a.exp < 0) {
1222             bool one;
1223             /* all fractional */
1224             s->float_exception_flags |= float_flag_inexact;
1225             switch (rounding_mode) {
1226             case float_round_nearest_even:
1227                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1228                 break;
1229             case float_round_ties_away:
1230                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1231                 break;
1232             case float_round_to_zero:
1233                 one = false;
1234                 break;
1235             case float_round_up:
1236                 one = !a.sign;
1237                 break;
1238             case float_round_down:
1239                 one = a.sign;
1240                 break;
1241             default:
1242                 g_assert_not_reached();
1243             }
1244 
1245             if (one) {
1246                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1247                 a.exp = 0;
1248             } else {
1249                 a.cls = float_class_zero;
1250             }
1251         } else {
1252             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1253             uint64_t frac_lsbm1 = frac_lsb >> 1;
1254             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1255             uint64_t rnd_mask = rnd_even_mask >> 1;
1256             uint64_t inc;
1257 
1258             switch (rounding_mode) {
1259             case float_round_nearest_even:
1260                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1261                 break;
1262             case float_round_ties_away:
1263                 inc = frac_lsbm1;
1264                 break;
1265             case float_round_to_zero:
1266                 inc = 0;
1267                 break;
1268             case float_round_up:
1269                 inc = a.sign ? 0 : rnd_mask;
1270                 break;
1271             case float_round_down:
1272                 inc = a.sign ? rnd_mask : 0;
1273                 break;
1274             default:
1275                 g_assert_not_reached();
1276             }
1277 
1278             if (a.frac & rnd_mask) {
1279                 s->float_exception_flags |= float_flag_inexact;
1280                 a.frac += inc;
1281                 a.frac &= ~rnd_mask;
1282                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1283                     a.frac >>= 1;
1284                     a.exp++;
1285                 }
1286             }
1287         }
1288         break;
1289     default:
1290         g_assert_not_reached();
1291     }
1292     return a;
1293 }
1294 
1295 float16 float16_round_to_int(float16 a, float_status *s)
1296 {
1297     FloatParts pa = float16_unpack_canonical(a, s);
1298     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1299     return float16_round_pack_canonical(pr, s);
1300 }
1301 
1302 float32 float32_round_to_int(float32 a, float_status *s)
1303 {
1304     FloatParts pa = float32_unpack_canonical(a, s);
1305     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1306     return float32_round_pack_canonical(pr, s);
1307 }
1308 
1309 float64 float64_round_to_int(float64 a, float_status *s)
1310 {
1311     FloatParts pa = float64_unpack_canonical(a, s);
1312     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1313     return float64_round_pack_canonical(pr, s);
1314 }
1315 
1316 float64 float64_trunc_to_int(float64 a, float_status *s)
1317 {
1318     FloatParts pa = float64_unpack_canonical(a, s);
1319     FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1320     return float64_round_pack_canonical(pr, s);
1321 }
1322 
1323 /*
1324  * Returns the result of converting the floating-point value `a' to
1325  * the two's complement integer format. The conversion is performed
1326  * according to the IEC/IEEE Standard for Binary Floating-Point
1327  * Arithmetic---which means in particular that the conversion is
1328  * rounded according to the current rounding mode. If `a' is a NaN,
1329  * the largest positive integer is returned. Otherwise, if the
1330  * conversion overflows, the largest integer with the same sign as `a'
1331  * is returned.
1332 */
1333 
1334 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1335                                      int64_t min, int64_t max,
1336                                      float_status *s)
1337 {
1338     uint64_t r;
1339     int orig_flags = get_float_exception_flags(s);
1340     FloatParts p = round_to_int(in, rmode, s);
1341 
1342     switch (p.cls) {
1343     case float_class_snan:
1344     case float_class_qnan:
1345         return max;
1346     case float_class_inf:
1347         return p.sign ? min : max;
1348     case float_class_zero:
1349         return 0;
1350     case float_class_normal:
1351         if (p.exp < DECOMPOSED_BINARY_POINT) {
1352             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1353         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1354             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1355         } else {
1356             r = UINT64_MAX;
1357         }
1358         if (p.sign) {
1359             if (r < -(uint64_t) min) {
1360                 return -r;
1361             } else {
1362                 s->float_exception_flags = orig_flags | float_flag_invalid;
1363                 return min;
1364             }
1365         } else {
1366             if (r < max) {
1367                 return r;
1368             } else {
1369                 s->float_exception_flags = orig_flags | float_flag_invalid;
1370                 return max;
1371             }
1372         }
1373     default:
1374         g_assert_not_reached();
1375     }
1376 }
1377 
1378 #define FLOAT_TO_INT(fsz, isz)                                          \
1379 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a,         \
1380                                                 float_status *s)        \
1381 {                                                                       \
1382     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1383     return round_to_int_and_pack(p, s->float_rounding_mode,             \
1384                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1385                                  s);                                    \
1386 }                                                                       \
1387                                                                         \
1388 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero       \
1389  (float ## fsz a, float_status *s)                                      \
1390 {                                                                       \
1391     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1392     return round_to_int_and_pack(p, float_round_to_zero,                \
1393                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1394                                  s);                                    \
1395 }
1396 
1397 FLOAT_TO_INT(16, 16)
1398 FLOAT_TO_INT(16, 32)
1399 FLOAT_TO_INT(16, 64)
1400 
1401 FLOAT_TO_INT(32, 16)
1402 FLOAT_TO_INT(32, 32)
1403 FLOAT_TO_INT(32, 64)
1404 
1405 FLOAT_TO_INT(64, 16)
1406 FLOAT_TO_INT(64, 32)
1407 FLOAT_TO_INT(64, 64)
1408 
1409 #undef FLOAT_TO_INT
1410 
1411 /*
1412  *  Returns the result of converting the floating-point value `a' to
1413  *  the unsigned integer format. The conversion is performed according
1414  *  to the IEC/IEEE Standard for Binary Floating-Point
1415  *  Arithmetic---which means in particular that the conversion is
1416  *  rounded according to the current rounding mode. If `a' is a NaN,
1417  *  the largest unsigned integer is returned. Otherwise, if the
1418  *  conversion overflows, the largest unsigned integer is returned. If
1419  *  the 'a' is negative, the result is rounded and zero is returned;
1420  *  values that do not round to zero will raise the inexact exception
1421  *  flag.
1422  */
1423 
1424 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1425                                        float_status *s)
1426 {
1427     int orig_flags = get_float_exception_flags(s);
1428     FloatParts p = round_to_int(in, rmode, s);
1429 
1430     switch (p.cls) {
1431     case float_class_snan:
1432     case float_class_qnan:
1433         s->float_exception_flags = orig_flags | float_flag_invalid;
1434         return max;
1435     case float_class_inf:
1436         return p.sign ? 0 : max;
1437     case float_class_zero:
1438         return 0;
1439     case float_class_normal:
1440     {
1441         uint64_t r;
1442         if (p.sign) {
1443             s->float_exception_flags = orig_flags | float_flag_invalid;
1444             return 0;
1445         }
1446 
1447         if (p.exp < DECOMPOSED_BINARY_POINT) {
1448             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1449         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1450             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1451         } else {
1452             s->float_exception_flags = orig_flags | float_flag_invalid;
1453             return max;
1454         }
1455 
1456         /* For uint64 this will never trip, but if p.exp is too large
1457          * to shift a decomposed fraction we shall have exited via the
1458          * 3rd leg above.
1459          */
1460         if (r > max) {
1461             s->float_exception_flags = orig_flags | float_flag_invalid;
1462             return max;
1463         } else {
1464             return r;
1465         }
1466     }
1467     default:
1468         g_assert_not_reached();
1469     }
1470 }
1471 
1472 #define FLOAT_TO_UINT(fsz, isz) \
1473 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a,       \
1474                                                   float_status *s)      \
1475 {                                                                       \
1476     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1477     return round_to_uint_and_pack(p, s->float_rounding_mode,            \
1478                                  UINT ## isz ## _MAX, s);               \
1479 }                                                                       \
1480                                                                         \
1481 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero     \
1482  (float ## fsz a, float_status *s)                                      \
1483 {                                                                       \
1484     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1485     return round_to_uint_and_pack(p, s->float_rounding_mode,            \
1486                                  UINT ## isz ## _MAX, s);               \
1487 }
1488 
1489 FLOAT_TO_UINT(16, 16)
1490 FLOAT_TO_UINT(16, 32)
1491 FLOAT_TO_UINT(16, 64)
1492 
1493 FLOAT_TO_UINT(32, 16)
1494 FLOAT_TO_UINT(32, 32)
1495 FLOAT_TO_UINT(32, 64)
1496 
1497 FLOAT_TO_UINT(64, 16)
1498 FLOAT_TO_UINT(64, 32)
1499 FLOAT_TO_UINT(64, 64)
1500 
1501 #undef FLOAT_TO_UINT
1502 
1503 /*
1504  * Integer to float conversions
1505  *
1506  * Returns the result of converting the two's complement integer `a'
1507  * to the floating-point format. The conversion is performed according
1508  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1509  */
1510 
1511 static FloatParts int_to_float(int64_t a, float_status *status)
1512 {
1513     FloatParts r;
1514     if (a == 0) {
1515         r.cls = float_class_zero;
1516         r.sign = false;
1517     } else if (a == (1ULL << 63)) {
1518         r.cls = float_class_normal;
1519         r.sign = true;
1520         r.frac = DECOMPOSED_IMPLICIT_BIT;
1521         r.exp = 63;
1522     } else {
1523         uint64_t f;
1524         if (a < 0) {
1525             f = -a;
1526             r.sign = true;
1527         } else {
1528             f = a;
1529             r.sign = false;
1530         }
1531         int shift = clz64(f) - 1;
1532         r.cls = float_class_normal;
1533         r.exp = (DECOMPOSED_BINARY_POINT - shift);
1534         r.frac = f << shift;
1535     }
1536 
1537     return r;
1538 }
1539 
1540 float16 int64_to_float16(int64_t a, float_status *status)
1541 {
1542     FloatParts pa = int_to_float(a, status);
1543     return float16_round_pack_canonical(pa, status);
1544 }
1545 
1546 float16 int32_to_float16(int32_t a, float_status *status)
1547 {
1548     return int64_to_float16(a, status);
1549 }
1550 
1551 float16 int16_to_float16(int16_t a, float_status *status)
1552 {
1553     return int64_to_float16(a, status);
1554 }
1555 
1556 float32 int64_to_float32(int64_t a, float_status *status)
1557 {
1558     FloatParts pa = int_to_float(a, status);
1559     return float32_round_pack_canonical(pa, status);
1560 }
1561 
1562 float32 int32_to_float32(int32_t a, float_status *status)
1563 {
1564     return int64_to_float32(a, status);
1565 }
1566 
1567 float32 int16_to_float32(int16_t a, float_status *status)
1568 {
1569     return int64_to_float32(a, status);
1570 }
1571 
1572 float64 int64_to_float64(int64_t a, float_status *status)
1573 {
1574     FloatParts pa = int_to_float(a, status);
1575     return float64_round_pack_canonical(pa, status);
1576 }
1577 
1578 float64 int32_to_float64(int32_t a, float_status *status)
1579 {
1580     return int64_to_float64(a, status);
1581 }
1582 
1583 float64 int16_to_float64(int16_t a, float_status *status)
1584 {
1585     return int64_to_float64(a, status);
1586 }
1587 
1588 
1589 /*
1590  * Unsigned Integer to float conversions
1591  *
1592  * Returns the result of converting the unsigned integer `a' to the
1593  * floating-point format. The conversion is performed according to the
1594  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1595  */
1596 
1597 static FloatParts uint_to_float(uint64_t a, float_status *status)
1598 {
1599     FloatParts r = { .sign = false};
1600 
1601     if (a == 0) {
1602         r.cls = float_class_zero;
1603     } else {
1604         int spare_bits = clz64(a) - 1;
1605         r.cls = float_class_normal;
1606         r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1607         if (spare_bits < 0) {
1608             shift64RightJamming(a, -spare_bits, &a);
1609             r.frac = a;
1610         } else {
1611             r.frac = a << spare_bits;
1612         }
1613     }
1614 
1615     return r;
1616 }
1617 
1618 float16 uint64_to_float16(uint64_t a, float_status *status)
1619 {
1620     FloatParts pa = uint_to_float(a, status);
1621     return float16_round_pack_canonical(pa, status);
1622 }
1623 
1624 float16 uint32_to_float16(uint32_t a, float_status *status)
1625 {
1626     return uint64_to_float16(a, status);
1627 }
1628 
1629 float16 uint16_to_float16(uint16_t a, float_status *status)
1630 {
1631     return uint64_to_float16(a, status);
1632 }
1633 
1634 float32 uint64_to_float32(uint64_t a, float_status *status)
1635 {
1636     FloatParts pa = uint_to_float(a, status);
1637     return float32_round_pack_canonical(pa, status);
1638 }
1639 
1640 float32 uint32_to_float32(uint32_t a, float_status *status)
1641 {
1642     return uint64_to_float32(a, status);
1643 }
1644 
1645 float32 uint16_to_float32(uint16_t a, float_status *status)
1646 {
1647     return uint64_to_float32(a, status);
1648 }
1649 
1650 float64 uint64_to_float64(uint64_t a, float_status *status)
1651 {
1652     FloatParts pa = uint_to_float(a, status);
1653     return float64_round_pack_canonical(pa, status);
1654 }
1655 
1656 float64 uint32_to_float64(uint32_t a, float_status *status)
1657 {
1658     return uint64_to_float64(a, status);
1659 }
1660 
1661 float64 uint16_to_float64(uint16_t a, float_status *status)
1662 {
1663     return uint64_to_float64(a, status);
1664 }
1665 
1666 /* Float Min/Max */
1667 /* min() and max() functions. These can't be implemented as
1668  * 'compare and pick one input' because that would mishandle
1669  * NaNs and +0 vs -0.
1670  *
1671  * minnum() and maxnum() functions. These are similar to the min()
1672  * and max() functions but if one of the arguments is a QNaN and
1673  * the other is numerical then the numerical argument is returned.
1674  * SNaNs will get quietened before being returned.
1675  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1676  * and maxNum() operations. min() and max() are the typical min/max
1677  * semantics provided by many CPUs which predate that specification.
1678  *
1679  * minnummag() and maxnummag() functions correspond to minNumMag()
1680  * and minNumMag() from the IEEE-754 2008.
1681  */
1682 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1683                                 bool ieee, bool ismag, float_status *s)
1684 {
1685     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1686         if (ieee) {
1687             /* Takes two floating-point values `a' and `b', one of
1688              * which is a NaN, and returns the appropriate NaN
1689              * result. If either `a' or `b' is a signaling NaN,
1690              * the invalid exception is raised.
1691              */
1692             if (is_snan(a.cls) || is_snan(b.cls)) {
1693                 return pick_nan(a, b, s);
1694             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1695                 return b;
1696             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1697                 return a;
1698             }
1699         }
1700         return pick_nan(a, b, s);
1701     } else {
1702         int a_exp, b_exp;
1703         bool a_sign, b_sign;
1704 
1705         switch (a.cls) {
1706         case float_class_normal:
1707             a_exp = a.exp;
1708             break;
1709         case float_class_inf:
1710             a_exp = INT_MAX;
1711             break;
1712         case float_class_zero:
1713             a_exp = INT_MIN;
1714             break;
1715         default:
1716             g_assert_not_reached();
1717             break;
1718         }
1719         switch (b.cls) {
1720         case float_class_normal:
1721             b_exp = b.exp;
1722             break;
1723         case float_class_inf:
1724             b_exp = INT_MAX;
1725             break;
1726         case float_class_zero:
1727             b_exp = INT_MIN;
1728             break;
1729         default:
1730             g_assert_not_reached();
1731             break;
1732         }
1733 
1734         a_sign = a.sign;
1735         b_sign = b.sign;
1736         if (ismag) {
1737             a_sign = b_sign = 0;
1738         }
1739 
1740         if (a_sign == b_sign) {
1741             bool a_less = a_exp < b_exp;
1742             if (a_exp == b_exp) {
1743                 a_less = a.frac < b.frac;
1744             }
1745             return a_sign ^ a_less ^ ismin ? b : a;
1746         } else {
1747             return a_sign ^ ismin ? b : a;
1748         }
1749     }
1750 }
1751 
1752 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
1753 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
1754                                      float_status *s)                   \
1755 {                                                                       \
1756     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1757     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1758     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
1759                                                                         \
1760     return float ## sz ## _round_pack_canonical(pr, s);                 \
1761 }
1762 
1763 MINMAX(16, min, true, false, false)
1764 MINMAX(16, minnum, true, true, false)
1765 MINMAX(16, minnummag, true, true, true)
1766 MINMAX(16, max, false, false, false)
1767 MINMAX(16, maxnum, false, true, false)
1768 MINMAX(16, maxnummag, false, true, true)
1769 
1770 MINMAX(32, min, true, false, false)
1771 MINMAX(32, minnum, true, true, false)
1772 MINMAX(32, minnummag, true, true, true)
1773 MINMAX(32, max, false, false, false)
1774 MINMAX(32, maxnum, false, true, false)
1775 MINMAX(32, maxnummag, false, true, true)
1776 
1777 MINMAX(64, min, true, false, false)
1778 MINMAX(64, minnum, true, true, false)
1779 MINMAX(64, minnummag, true, true, true)
1780 MINMAX(64, max, false, false, false)
1781 MINMAX(64, maxnum, false, true, false)
1782 MINMAX(64, maxnummag, false, true, true)
1783 
1784 #undef MINMAX
1785 
1786 /* Floating point compare */
1787 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1788                           float_status *s)
1789 {
1790     if (is_nan(a.cls) || is_nan(b.cls)) {
1791         if (!is_quiet ||
1792             a.cls == float_class_snan ||
1793             b.cls == float_class_snan) {
1794             s->float_exception_flags |= float_flag_invalid;
1795         }
1796         return float_relation_unordered;
1797     }
1798 
1799     if (a.cls == float_class_zero) {
1800         if (b.cls == float_class_zero) {
1801             return float_relation_equal;
1802         }
1803         return b.sign ? float_relation_greater : float_relation_less;
1804     } else if (b.cls == float_class_zero) {
1805         return a.sign ? float_relation_less : float_relation_greater;
1806     }
1807 
1808     /* The only really important thing about infinity is its sign. If
1809      * both are infinities the sign marks the smallest of the two.
1810      */
1811     if (a.cls == float_class_inf) {
1812         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1813             return float_relation_equal;
1814         }
1815         return a.sign ? float_relation_less : float_relation_greater;
1816     } else if (b.cls == float_class_inf) {
1817         return b.sign ? float_relation_greater : float_relation_less;
1818     }
1819 
1820     if (a.sign != b.sign) {
1821         return a.sign ? float_relation_less : float_relation_greater;
1822     }
1823 
1824     if (a.exp == b.exp) {
1825         if (a.frac == b.frac) {
1826             return float_relation_equal;
1827         }
1828         if (a.sign) {
1829             return a.frac > b.frac ?
1830                 float_relation_less : float_relation_greater;
1831         } else {
1832             return a.frac > b.frac ?
1833                 float_relation_greater : float_relation_less;
1834         }
1835     } else {
1836         if (a.sign) {
1837             return a.exp > b.exp ? float_relation_less : float_relation_greater;
1838         } else {
1839             return a.exp > b.exp ? float_relation_greater : float_relation_less;
1840         }
1841     }
1842 }
1843 
1844 #define COMPARE(sz)                                                     \
1845 int float ## sz ## _compare(float ## sz a, float ## sz b,               \
1846                             float_status *s)                            \
1847 {                                                                       \
1848     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1849     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1850     return compare_floats(pa, pb, false, s);                            \
1851 }                                                                       \
1852 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b,         \
1853                                   float_status *s)                      \
1854 {                                                                       \
1855     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1856     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1857     return compare_floats(pa, pb, true, s);                             \
1858 }
1859 
1860 COMPARE(16)
1861 COMPARE(32)
1862 COMPARE(64)
1863 
1864 #undef COMPARE
1865 
1866 /* Multiply A by 2 raised to the power N.  */
1867 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1868 {
1869     if (unlikely(is_nan(a.cls))) {
1870         return return_nan(a, s);
1871     }
1872     if (a.cls == float_class_normal) {
1873         a.exp += n;
1874     }
1875     return a;
1876 }
1877 
1878 float16 float16_scalbn(float16 a, int n, float_status *status)
1879 {
1880     FloatParts pa = float16_unpack_canonical(a, status);
1881     FloatParts pr = scalbn_decomposed(pa, n, status);
1882     return float16_round_pack_canonical(pr, status);
1883 }
1884 
1885 float32 float32_scalbn(float32 a, int n, float_status *status)
1886 {
1887     FloatParts pa = float32_unpack_canonical(a, status);
1888     FloatParts pr = scalbn_decomposed(pa, n, status);
1889     return float32_round_pack_canonical(pr, status);
1890 }
1891 
1892 float64 float64_scalbn(float64 a, int n, float_status *status)
1893 {
1894     FloatParts pa = float64_unpack_canonical(a, status);
1895     FloatParts pr = scalbn_decomposed(pa, n, status);
1896     return float64_round_pack_canonical(pr, status);
1897 }
1898 
1899 /*----------------------------------------------------------------------------
1900 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
1901 | and 7, and returns the properly rounded 32-bit integer corresponding to the
1902 | input.  If `zSign' is 1, the input is negated before being converted to an
1903 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
1904 | is simply rounded to an integer, with the inexact exception raised if the
1905 | input cannot be represented exactly as an integer.  However, if the fixed-
1906 | point input is too large, the invalid exception is raised and the largest
1907 | positive or negative integer is returned.
1908 *----------------------------------------------------------------------------*/
1909 
1910 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
1911 {
1912     int8_t roundingMode;
1913     flag roundNearestEven;
1914     int8_t roundIncrement, roundBits;
1915     int32_t z;
1916 
1917     roundingMode = status->float_rounding_mode;
1918     roundNearestEven = ( roundingMode == float_round_nearest_even );
1919     switch (roundingMode) {
1920     case float_round_nearest_even:
1921     case float_round_ties_away:
1922         roundIncrement = 0x40;
1923         break;
1924     case float_round_to_zero:
1925         roundIncrement = 0;
1926         break;
1927     case float_round_up:
1928         roundIncrement = zSign ? 0 : 0x7f;
1929         break;
1930     case float_round_down:
1931         roundIncrement = zSign ? 0x7f : 0;
1932         break;
1933     default:
1934         abort();
1935     }
1936     roundBits = absZ & 0x7F;
1937     absZ = ( absZ + roundIncrement )>>7;
1938     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
1939     z = absZ;
1940     if ( zSign ) z = - z;
1941     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
1942         float_raise(float_flag_invalid, status);
1943         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
1944     }
1945     if (roundBits) {
1946         status->float_exception_flags |= float_flag_inexact;
1947     }
1948     return z;
1949 
1950 }
1951 
1952 /*----------------------------------------------------------------------------
1953 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
1954 | `absZ1', with binary point between bits 63 and 64 (between the input words),
1955 | and returns the properly rounded 64-bit integer corresponding to the input.
1956 | If `zSign' is 1, the input is negated before being converted to an integer.
1957 | Ordinarily, the fixed-point input is simply rounded to an integer, with
1958 | the inexact exception raised if the input cannot be represented exactly as
1959 | an integer.  However, if the fixed-point input is too large, the invalid
1960 | exception is raised and the largest positive or negative integer is
1961 | returned.
1962 *----------------------------------------------------------------------------*/
1963 
1964 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
1965                                float_status *status)
1966 {
1967     int8_t roundingMode;
1968     flag roundNearestEven, increment;
1969     int64_t z;
1970 
1971     roundingMode = status->float_rounding_mode;
1972     roundNearestEven = ( roundingMode == float_round_nearest_even );
1973     switch (roundingMode) {
1974     case float_round_nearest_even:
1975     case float_round_ties_away:
1976         increment = ((int64_t) absZ1 < 0);
1977         break;
1978     case float_round_to_zero:
1979         increment = 0;
1980         break;
1981     case float_round_up:
1982         increment = !zSign && absZ1;
1983         break;
1984     case float_round_down:
1985         increment = zSign && absZ1;
1986         break;
1987     default:
1988         abort();
1989     }
1990     if ( increment ) {
1991         ++absZ0;
1992         if ( absZ0 == 0 ) goto overflow;
1993         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
1994     }
1995     z = absZ0;
1996     if ( zSign ) z = - z;
1997     if ( z && ( ( z < 0 ) ^ zSign ) ) {
1998  overflow:
1999         float_raise(float_flag_invalid, status);
2000         return
2001               zSign ? (int64_t) LIT64( 0x8000000000000000 )
2002             : LIT64( 0x7FFFFFFFFFFFFFFF );
2003     }
2004     if (absZ1) {
2005         status->float_exception_flags |= float_flag_inexact;
2006     }
2007     return z;
2008 
2009 }
2010 
2011 /*----------------------------------------------------------------------------
2012 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2013 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2014 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2015 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
2016 | with the inexact exception raised if the input cannot be represented exactly
2017 | as an integer.  However, if the fixed-point input is too large, the invalid
2018 | exception is raised and the largest unsigned integer is returned.
2019 *----------------------------------------------------------------------------*/
2020 
2021 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2022                                 uint64_t absZ1, float_status *status)
2023 {
2024     int8_t roundingMode;
2025     flag roundNearestEven, increment;
2026 
2027     roundingMode = status->float_rounding_mode;
2028     roundNearestEven = (roundingMode == float_round_nearest_even);
2029     switch (roundingMode) {
2030     case float_round_nearest_even:
2031     case float_round_ties_away:
2032         increment = ((int64_t)absZ1 < 0);
2033         break;
2034     case float_round_to_zero:
2035         increment = 0;
2036         break;
2037     case float_round_up:
2038         increment = !zSign && absZ1;
2039         break;
2040     case float_round_down:
2041         increment = zSign && absZ1;
2042         break;
2043     default:
2044         abort();
2045     }
2046     if (increment) {
2047         ++absZ0;
2048         if (absZ0 == 0) {
2049             float_raise(float_flag_invalid, status);
2050             return LIT64(0xFFFFFFFFFFFFFFFF);
2051         }
2052         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2053     }
2054 
2055     if (zSign && absZ0) {
2056         float_raise(float_flag_invalid, status);
2057         return 0;
2058     }
2059 
2060     if (absZ1) {
2061         status->float_exception_flags |= float_flag_inexact;
2062     }
2063     return absZ0;
2064 }
2065 
2066 /*----------------------------------------------------------------------------
2067 | If `a' is denormal and we are in flush-to-zero mode then set the
2068 | input-denormal exception and return zero. Otherwise just return the value.
2069 *----------------------------------------------------------------------------*/
2070 float32 float32_squash_input_denormal(float32 a, float_status *status)
2071 {
2072     if (status->flush_inputs_to_zero) {
2073         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2074             float_raise(float_flag_input_denormal, status);
2075             return make_float32(float32_val(a) & 0x80000000);
2076         }
2077     }
2078     return a;
2079 }
2080 
2081 /*----------------------------------------------------------------------------
2082 | Normalizes the subnormal single-precision floating-point value represented
2083 | by the denormalized significand `aSig'.  The normalized exponent and
2084 | significand are stored at the locations pointed to by `zExpPtr' and
2085 | `zSigPtr', respectively.
2086 *----------------------------------------------------------------------------*/
2087 
2088 static void
2089  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2090 {
2091     int8_t shiftCount;
2092 
2093     shiftCount = countLeadingZeros32( aSig ) - 8;
2094     *zSigPtr = aSig<<shiftCount;
2095     *zExpPtr = 1 - shiftCount;
2096 
2097 }
2098 
2099 /*----------------------------------------------------------------------------
2100 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2101 | single-precision floating-point value, returning the result.  After being
2102 | shifted into the proper positions, the three fields are simply added
2103 | together to form the result.  This means that any integer portion of `zSig'
2104 | will be added into the exponent.  Since a properly normalized significand
2105 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2106 | than the desired result exponent whenever `zSig' is a complete, normalized
2107 | significand.
2108 *----------------------------------------------------------------------------*/
2109 
2110 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
2111 {
2112 
2113     return make_float32(
2114           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
2115 
2116 }
2117 
2118 /*----------------------------------------------------------------------------
2119 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2120 | and significand `zSig', and returns the proper single-precision floating-
2121 | point value corresponding to the abstract input.  Ordinarily, the abstract
2122 | value is simply rounded and packed into the single-precision format, with
2123 | the inexact exception raised if the abstract input cannot be represented
2124 | exactly.  However, if the abstract value is too large, the overflow and
2125 | inexact exceptions are raised and an infinity or maximal finite value is
2126 | returned.  If the abstract value is too small, the input value is rounded to
2127 | a subnormal number, and the underflow and inexact exceptions are raised if
2128 | the abstract input cannot be represented exactly as a subnormal single-
2129 | precision floating-point number.
2130 |     The input significand `zSig' has its binary point between bits 30
2131 | and 29, which is 7 bits to the left of the usual location.  This shifted
2132 | significand must be normalized or smaller.  If `zSig' is not normalized,
2133 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2134 | and it must not require rounding.  In the usual case that `zSig' is
2135 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2136 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2137 | Binary Floating-Point Arithmetic.
2138 *----------------------------------------------------------------------------*/
2139 
2140 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2141                                    float_status *status)
2142 {
2143     int8_t roundingMode;
2144     flag roundNearestEven;
2145     int8_t roundIncrement, roundBits;
2146     flag isTiny;
2147 
2148     roundingMode = status->float_rounding_mode;
2149     roundNearestEven = ( roundingMode == float_round_nearest_even );
2150     switch (roundingMode) {
2151     case float_round_nearest_even:
2152     case float_round_ties_away:
2153         roundIncrement = 0x40;
2154         break;
2155     case float_round_to_zero:
2156         roundIncrement = 0;
2157         break;
2158     case float_round_up:
2159         roundIncrement = zSign ? 0 : 0x7f;
2160         break;
2161     case float_round_down:
2162         roundIncrement = zSign ? 0x7f : 0;
2163         break;
2164     default:
2165         abort();
2166         break;
2167     }
2168     roundBits = zSig & 0x7F;
2169     if ( 0xFD <= (uint16_t) zExp ) {
2170         if (    ( 0xFD < zExp )
2171              || (    ( zExp == 0xFD )
2172                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2173            ) {
2174             float_raise(float_flag_overflow | float_flag_inexact, status);
2175             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2176         }
2177         if ( zExp < 0 ) {
2178             if (status->flush_to_zero) {
2179                 float_raise(float_flag_output_denormal, status);
2180                 return packFloat32(zSign, 0, 0);
2181             }
2182             isTiny =
2183                 (status->float_detect_tininess
2184                  == float_tininess_before_rounding)
2185                 || ( zExp < -1 )
2186                 || ( zSig + roundIncrement < 0x80000000 );
2187             shift32RightJamming( zSig, - zExp, &zSig );
2188             zExp = 0;
2189             roundBits = zSig & 0x7F;
2190             if (isTiny && roundBits) {
2191                 float_raise(float_flag_underflow, status);
2192             }
2193         }
2194     }
2195     if (roundBits) {
2196         status->float_exception_flags |= float_flag_inexact;
2197     }
2198     zSig = ( zSig + roundIncrement )>>7;
2199     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2200     if ( zSig == 0 ) zExp = 0;
2201     return packFloat32( zSign, zExp, zSig );
2202 
2203 }
2204 
2205 /*----------------------------------------------------------------------------
2206 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2207 | and significand `zSig', and returns the proper single-precision floating-
2208 | point value corresponding to the abstract input.  This routine is just like
2209 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2210 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2211 | floating-point exponent.
2212 *----------------------------------------------------------------------------*/
2213 
2214 static float32
2215  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2216                               float_status *status)
2217 {
2218     int8_t shiftCount;
2219 
2220     shiftCount = countLeadingZeros32( zSig ) - 1;
2221     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2222                                status);
2223 
2224 }
2225 
2226 /*----------------------------------------------------------------------------
2227 | If `a' is denormal and we are in flush-to-zero mode then set the
2228 | input-denormal exception and return zero. Otherwise just return the value.
2229 *----------------------------------------------------------------------------*/
2230 float64 float64_squash_input_denormal(float64 a, float_status *status)
2231 {
2232     if (status->flush_inputs_to_zero) {
2233         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2234             float_raise(float_flag_input_denormal, status);
2235             return make_float64(float64_val(a) & (1ULL << 63));
2236         }
2237     }
2238     return a;
2239 }
2240 
2241 /*----------------------------------------------------------------------------
2242 | Normalizes the subnormal double-precision floating-point value represented
2243 | by the denormalized significand `aSig'.  The normalized exponent and
2244 | significand are stored at the locations pointed to by `zExpPtr' and
2245 | `zSigPtr', respectively.
2246 *----------------------------------------------------------------------------*/
2247 
2248 static void
2249  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2250 {
2251     int8_t shiftCount;
2252 
2253     shiftCount = countLeadingZeros64( aSig ) - 11;
2254     *zSigPtr = aSig<<shiftCount;
2255     *zExpPtr = 1 - shiftCount;
2256 
2257 }
2258 
2259 /*----------------------------------------------------------------------------
2260 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2261 | double-precision floating-point value, returning the result.  After being
2262 | shifted into the proper positions, the three fields are simply added
2263 | together to form the result.  This means that any integer portion of `zSig'
2264 | will be added into the exponent.  Since a properly normalized significand
2265 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2266 | than the desired result exponent whenever `zSig' is a complete, normalized
2267 | significand.
2268 *----------------------------------------------------------------------------*/
2269 
2270 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2271 {
2272 
2273     return make_float64(
2274         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2275 
2276 }
2277 
2278 /*----------------------------------------------------------------------------
2279 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2280 | and significand `zSig', and returns the proper double-precision floating-
2281 | point value corresponding to the abstract input.  Ordinarily, the abstract
2282 | value is simply rounded and packed into the double-precision format, with
2283 | the inexact exception raised if the abstract input cannot be represented
2284 | exactly.  However, if the abstract value is too large, the overflow and
2285 | inexact exceptions are raised and an infinity or maximal finite value is
2286 | returned.  If the abstract value is too small, the input value is rounded to
2287 | a subnormal number, and the underflow and inexact exceptions are raised if
2288 | the abstract input cannot be represented exactly as a subnormal double-
2289 | precision floating-point number.
2290 |     The input significand `zSig' has its binary point between bits 62
2291 | and 61, which is 10 bits to the left of the usual location.  This shifted
2292 | significand must be normalized or smaller.  If `zSig' is not normalized,
2293 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2294 | and it must not require rounding.  In the usual case that `zSig' is
2295 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2296 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2297 | Binary Floating-Point Arithmetic.
2298 *----------------------------------------------------------------------------*/
2299 
2300 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2301                                    float_status *status)
2302 {
2303     int8_t roundingMode;
2304     flag roundNearestEven;
2305     int roundIncrement, roundBits;
2306     flag isTiny;
2307 
2308     roundingMode = status->float_rounding_mode;
2309     roundNearestEven = ( roundingMode == float_round_nearest_even );
2310     switch (roundingMode) {
2311     case float_round_nearest_even:
2312     case float_round_ties_away:
2313         roundIncrement = 0x200;
2314         break;
2315     case float_round_to_zero:
2316         roundIncrement = 0;
2317         break;
2318     case float_round_up:
2319         roundIncrement = zSign ? 0 : 0x3ff;
2320         break;
2321     case float_round_down:
2322         roundIncrement = zSign ? 0x3ff : 0;
2323         break;
2324     case float_round_to_odd:
2325         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2326         break;
2327     default:
2328         abort();
2329     }
2330     roundBits = zSig & 0x3FF;
2331     if ( 0x7FD <= (uint16_t) zExp ) {
2332         if (    ( 0x7FD < zExp )
2333              || (    ( zExp == 0x7FD )
2334                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2335            ) {
2336             bool overflow_to_inf = roundingMode != float_round_to_odd &&
2337                                    roundIncrement != 0;
2338             float_raise(float_flag_overflow | float_flag_inexact, status);
2339             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2340         }
2341         if ( zExp < 0 ) {
2342             if (status->flush_to_zero) {
2343                 float_raise(float_flag_output_denormal, status);
2344                 return packFloat64(zSign, 0, 0);
2345             }
2346             isTiny =
2347                    (status->float_detect_tininess
2348                     == float_tininess_before_rounding)
2349                 || ( zExp < -1 )
2350                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2351             shift64RightJamming( zSig, - zExp, &zSig );
2352             zExp = 0;
2353             roundBits = zSig & 0x3FF;
2354             if (isTiny && roundBits) {
2355                 float_raise(float_flag_underflow, status);
2356             }
2357             if (roundingMode == float_round_to_odd) {
2358                 /*
2359                  * For round-to-odd case, the roundIncrement depends on
2360                  * zSig which just changed.
2361                  */
2362                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2363             }
2364         }
2365     }
2366     if (roundBits) {
2367         status->float_exception_flags |= float_flag_inexact;
2368     }
2369     zSig = ( zSig + roundIncrement )>>10;
2370     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2371     if ( zSig == 0 ) zExp = 0;
2372     return packFloat64( zSign, zExp, zSig );
2373 
2374 }
2375 
2376 /*----------------------------------------------------------------------------
2377 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2378 | and significand `zSig', and returns the proper double-precision floating-
2379 | point value corresponding to the abstract input.  This routine is just like
2380 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2381 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2382 | floating-point exponent.
2383 *----------------------------------------------------------------------------*/
2384 
2385 static float64
2386  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2387                               float_status *status)
2388 {
2389     int8_t shiftCount;
2390 
2391     shiftCount = countLeadingZeros64( zSig ) - 1;
2392     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2393                                status);
2394 
2395 }
2396 
2397 /*----------------------------------------------------------------------------
2398 | Returns the fraction bits of the extended double-precision floating-point
2399 | value `a'.
2400 *----------------------------------------------------------------------------*/
2401 
2402 static inline uint64_t extractFloatx80Frac( floatx80 a )
2403 {
2404 
2405     return a.low;
2406 
2407 }
2408 
2409 /*----------------------------------------------------------------------------
2410 | Returns the exponent bits of the extended double-precision floating-point
2411 | value `a'.
2412 *----------------------------------------------------------------------------*/
2413 
2414 static inline int32_t extractFloatx80Exp( floatx80 a )
2415 {
2416 
2417     return a.high & 0x7FFF;
2418 
2419 }
2420 
2421 /*----------------------------------------------------------------------------
2422 | Returns the sign bit of the extended double-precision floating-point value
2423 | `a'.
2424 *----------------------------------------------------------------------------*/
2425 
2426 static inline flag extractFloatx80Sign( floatx80 a )
2427 {
2428 
2429     return a.high>>15;
2430 
2431 }
2432 
2433 /*----------------------------------------------------------------------------
2434 | Normalizes the subnormal extended double-precision floating-point value
2435 | represented by the denormalized significand `aSig'.  The normalized exponent
2436 | and significand are stored at the locations pointed to by `zExpPtr' and
2437 | `zSigPtr', respectively.
2438 *----------------------------------------------------------------------------*/
2439 
2440 static void
2441  normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr )
2442 {
2443     int8_t shiftCount;
2444 
2445     shiftCount = countLeadingZeros64( aSig );
2446     *zSigPtr = aSig<<shiftCount;
2447     *zExpPtr = 1 - shiftCount;
2448 
2449 }
2450 
2451 /*----------------------------------------------------------------------------
2452 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
2453 | extended double-precision floating-point value, returning the result.
2454 *----------------------------------------------------------------------------*/
2455 
2456 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
2457 {
2458     floatx80 z;
2459 
2460     z.low = zSig;
2461     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
2462     return z;
2463 
2464 }
2465 
2466 /*----------------------------------------------------------------------------
2467 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2468 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2469 | and returns the proper extended double-precision floating-point value
2470 | corresponding to the abstract input.  Ordinarily, the abstract value is
2471 | rounded and packed into the extended double-precision format, with the
2472 | inexact exception raised if the abstract input cannot be represented
2473 | exactly.  However, if the abstract value is too large, the overflow and
2474 | inexact exceptions are raised and an infinity or maximal finite value is
2475 | returned.  If the abstract value is too small, the input value is rounded to
2476 | a subnormal number, and the underflow and inexact exceptions are raised if
2477 | the abstract input cannot be represented exactly as a subnormal extended
2478 | double-precision floating-point number.
2479 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
2480 | number of bits as single or double precision, respectively.  Otherwise, the
2481 | result is rounded to the full precision of the extended double-precision
2482 | format.
2483 |     The input significand must be normalized or smaller.  If the input
2484 | significand is not normalized, `zExp' must be 0; in that case, the result
2485 | returned is a subnormal number, and it must not require rounding.  The
2486 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2487 | Floating-Point Arithmetic.
2488 *----------------------------------------------------------------------------*/
2489 
2490 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2491                                      int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2492                                      float_status *status)
2493 {
2494     int8_t roundingMode;
2495     flag roundNearestEven, increment, isTiny;
2496     int64_t roundIncrement, roundMask, roundBits;
2497 
2498     roundingMode = status->float_rounding_mode;
2499     roundNearestEven = ( roundingMode == float_round_nearest_even );
2500     if ( roundingPrecision == 80 ) goto precision80;
2501     if ( roundingPrecision == 64 ) {
2502         roundIncrement = LIT64( 0x0000000000000400 );
2503         roundMask = LIT64( 0x00000000000007FF );
2504     }
2505     else if ( roundingPrecision == 32 ) {
2506         roundIncrement = LIT64( 0x0000008000000000 );
2507         roundMask = LIT64( 0x000000FFFFFFFFFF );
2508     }
2509     else {
2510         goto precision80;
2511     }
2512     zSig0 |= ( zSig1 != 0 );
2513     switch (roundingMode) {
2514     case float_round_nearest_even:
2515     case float_round_ties_away:
2516         break;
2517     case float_round_to_zero:
2518         roundIncrement = 0;
2519         break;
2520     case float_round_up:
2521         roundIncrement = zSign ? 0 : roundMask;
2522         break;
2523     case float_round_down:
2524         roundIncrement = zSign ? roundMask : 0;
2525         break;
2526     default:
2527         abort();
2528     }
2529     roundBits = zSig0 & roundMask;
2530     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2531         if (    ( 0x7FFE < zExp )
2532              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2533            ) {
2534             goto overflow;
2535         }
2536         if ( zExp <= 0 ) {
2537             if (status->flush_to_zero) {
2538                 float_raise(float_flag_output_denormal, status);
2539                 return packFloatx80(zSign, 0, 0);
2540             }
2541             isTiny =
2542                    (status->float_detect_tininess
2543                     == float_tininess_before_rounding)
2544                 || ( zExp < 0 )
2545                 || ( zSig0 <= zSig0 + roundIncrement );
2546             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2547             zExp = 0;
2548             roundBits = zSig0 & roundMask;
2549             if (isTiny && roundBits) {
2550                 float_raise(float_flag_underflow, status);
2551             }
2552             if (roundBits) {
2553                 status->float_exception_flags |= float_flag_inexact;
2554             }
2555             zSig0 += roundIncrement;
2556             if ( (int64_t) zSig0 < 0 ) zExp = 1;
2557             roundIncrement = roundMask + 1;
2558             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2559                 roundMask |= roundIncrement;
2560             }
2561             zSig0 &= ~ roundMask;
2562             return packFloatx80( zSign, zExp, zSig0 );
2563         }
2564     }
2565     if (roundBits) {
2566         status->float_exception_flags |= float_flag_inexact;
2567     }
2568     zSig0 += roundIncrement;
2569     if ( zSig0 < roundIncrement ) {
2570         ++zExp;
2571         zSig0 = LIT64( 0x8000000000000000 );
2572     }
2573     roundIncrement = roundMask + 1;
2574     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2575         roundMask |= roundIncrement;
2576     }
2577     zSig0 &= ~ roundMask;
2578     if ( zSig0 == 0 ) zExp = 0;
2579     return packFloatx80( zSign, zExp, zSig0 );
2580  precision80:
2581     switch (roundingMode) {
2582     case float_round_nearest_even:
2583     case float_round_ties_away:
2584         increment = ((int64_t)zSig1 < 0);
2585         break;
2586     case float_round_to_zero:
2587         increment = 0;
2588         break;
2589     case float_round_up:
2590         increment = !zSign && zSig1;
2591         break;
2592     case float_round_down:
2593         increment = zSign && zSig1;
2594         break;
2595     default:
2596         abort();
2597     }
2598     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2599         if (    ( 0x7FFE < zExp )
2600              || (    ( zExp == 0x7FFE )
2601                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2602                   && increment
2603                 )
2604            ) {
2605             roundMask = 0;
2606  overflow:
2607             float_raise(float_flag_overflow | float_flag_inexact, status);
2608             if (    ( roundingMode == float_round_to_zero )
2609                  || ( zSign && ( roundingMode == float_round_up ) )
2610                  || ( ! zSign && ( roundingMode == float_round_down ) )
2611                ) {
2612                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2613             }
2614             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2615         }
2616         if ( zExp <= 0 ) {
2617             isTiny =
2618                    (status->float_detect_tininess
2619                     == float_tininess_before_rounding)
2620                 || ( zExp < 0 )
2621                 || ! increment
2622                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2623             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2624             zExp = 0;
2625             if (isTiny && zSig1) {
2626                 float_raise(float_flag_underflow, status);
2627             }
2628             if (zSig1) {
2629                 status->float_exception_flags |= float_flag_inexact;
2630             }
2631             switch (roundingMode) {
2632             case float_round_nearest_even:
2633             case float_round_ties_away:
2634                 increment = ((int64_t)zSig1 < 0);
2635                 break;
2636             case float_round_to_zero:
2637                 increment = 0;
2638                 break;
2639             case float_round_up:
2640                 increment = !zSign && zSig1;
2641                 break;
2642             case float_round_down:
2643                 increment = zSign && zSig1;
2644                 break;
2645             default:
2646                 abort();
2647             }
2648             if ( increment ) {
2649                 ++zSig0;
2650                 zSig0 &=
2651                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2652                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2653             }
2654             return packFloatx80( zSign, zExp, zSig0 );
2655         }
2656     }
2657     if (zSig1) {
2658         status->float_exception_flags |= float_flag_inexact;
2659     }
2660     if ( increment ) {
2661         ++zSig0;
2662         if ( zSig0 == 0 ) {
2663             ++zExp;
2664             zSig0 = LIT64( 0x8000000000000000 );
2665         }
2666         else {
2667             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2668         }
2669     }
2670     else {
2671         if ( zSig0 == 0 ) zExp = 0;
2672     }
2673     return packFloatx80( zSign, zExp, zSig0 );
2674 
2675 }
2676 
2677 /*----------------------------------------------------------------------------
2678 | Takes an abstract floating-point value having sign `zSign', exponent
2679 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2680 | and returns the proper extended double-precision floating-point value
2681 | corresponding to the abstract input.  This routine is just like
2682 | `roundAndPackFloatx80' except that the input significand does not have to be
2683 | normalized.
2684 *----------------------------------------------------------------------------*/
2685 
2686 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2687                                               flag zSign, int32_t zExp,
2688                                               uint64_t zSig0, uint64_t zSig1,
2689                                               float_status *status)
2690 {
2691     int8_t shiftCount;
2692 
2693     if ( zSig0 == 0 ) {
2694         zSig0 = zSig1;
2695         zSig1 = 0;
2696         zExp -= 64;
2697     }
2698     shiftCount = countLeadingZeros64( zSig0 );
2699     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2700     zExp -= shiftCount;
2701     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2702                                 zSig0, zSig1, status);
2703 
2704 }
2705 
2706 /*----------------------------------------------------------------------------
2707 | Returns the least-significant 64 fraction bits of the quadruple-precision
2708 | floating-point value `a'.
2709 *----------------------------------------------------------------------------*/
2710 
2711 static inline uint64_t extractFloat128Frac1( float128 a )
2712 {
2713 
2714     return a.low;
2715 
2716 }
2717 
2718 /*----------------------------------------------------------------------------
2719 | Returns the most-significant 48 fraction bits of the quadruple-precision
2720 | floating-point value `a'.
2721 *----------------------------------------------------------------------------*/
2722 
2723 static inline uint64_t extractFloat128Frac0( float128 a )
2724 {
2725 
2726     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2727 
2728 }
2729 
2730 /*----------------------------------------------------------------------------
2731 | Returns the exponent bits of the quadruple-precision floating-point value
2732 | `a'.
2733 *----------------------------------------------------------------------------*/
2734 
2735 static inline int32_t extractFloat128Exp( float128 a )
2736 {
2737 
2738     return ( a.high>>48 ) & 0x7FFF;
2739 
2740 }
2741 
2742 /*----------------------------------------------------------------------------
2743 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2744 *----------------------------------------------------------------------------*/
2745 
2746 static inline flag extractFloat128Sign( float128 a )
2747 {
2748 
2749     return a.high>>63;
2750 
2751 }
2752 
2753 /*----------------------------------------------------------------------------
2754 | Normalizes the subnormal quadruple-precision floating-point value
2755 | represented by the denormalized significand formed by the concatenation of
2756 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
2757 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
2758 | significand are stored at the location pointed to by `zSig0Ptr', and the
2759 | least significant 64 bits of the normalized significand are stored at the
2760 | location pointed to by `zSig1Ptr'.
2761 *----------------------------------------------------------------------------*/
2762 
2763 static void
2764  normalizeFloat128Subnormal(
2765      uint64_t aSig0,
2766      uint64_t aSig1,
2767      int32_t *zExpPtr,
2768      uint64_t *zSig0Ptr,
2769      uint64_t *zSig1Ptr
2770  )
2771 {
2772     int8_t shiftCount;
2773 
2774     if ( aSig0 == 0 ) {
2775         shiftCount = countLeadingZeros64( aSig1 ) - 15;
2776         if ( shiftCount < 0 ) {
2777             *zSig0Ptr = aSig1>>( - shiftCount );
2778             *zSig1Ptr = aSig1<<( shiftCount & 63 );
2779         }
2780         else {
2781             *zSig0Ptr = aSig1<<shiftCount;
2782             *zSig1Ptr = 0;
2783         }
2784         *zExpPtr = - shiftCount - 63;
2785     }
2786     else {
2787         shiftCount = countLeadingZeros64( aSig0 ) - 15;
2788         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
2789         *zExpPtr = 1 - shiftCount;
2790     }
2791 
2792 }
2793 
2794 /*----------------------------------------------------------------------------
2795 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2796 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2797 | floating-point value, returning the result.  After being shifted into the
2798 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2799 | added together to form the most significant 32 bits of the result.  This
2800 | means that any integer portion of `zSig0' will be added into the exponent.
2801 | Since a properly normalized significand will have an integer portion equal
2802 | to 1, the `zExp' input should be 1 less than the desired result exponent
2803 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2804 | significand.
2805 *----------------------------------------------------------------------------*/
2806 
2807 static inline float128
2808  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
2809 {
2810     float128 z;
2811 
2812     z.low = zSig1;
2813     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
2814     return z;
2815 
2816 }
2817 
2818 /*----------------------------------------------------------------------------
2819 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2820 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2821 | and `zSig2', and returns the proper quadruple-precision floating-point value
2822 | corresponding to the abstract input.  Ordinarily, the abstract value is
2823 | simply rounded and packed into the quadruple-precision format, with the
2824 | inexact exception raised if the abstract input cannot be represented
2825 | exactly.  However, if the abstract value is too large, the overflow and
2826 | inexact exceptions are raised and an infinity or maximal finite value is
2827 | returned.  If the abstract value is too small, the input value is rounded to
2828 | a subnormal number, and the underflow and inexact exceptions are raised if
2829 | the abstract input cannot be represented exactly as a subnormal quadruple-
2830 | precision floating-point number.
2831 |     The input significand must be normalized or smaller.  If the input
2832 | significand is not normalized, `zExp' must be 0; in that case, the result
2833 | returned is a subnormal number, and it must not require rounding.  In the
2834 | usual case that the input significand is normalized, `zExp' must be 1 less
2835 | than the ``true'' floating-point exponent.  The handling of underflow and
2836 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2837 *----------------------------------------------------------------------------*/
2838 
2839 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
2840                                      uint64_t zSig0, uint64_t zSig1,
2841                                      uint64_t zSig2, float_status *status)
2842 {
2843     int8_t roundingMode;
2844     flag roundNearestEven, increment, isTiny;
2845 
2846     roundingMode = status->float_rounding_mode;
2847     roundNearestEven = ( roundingMode == float_round_nearest_even );
2848     switch (roundingMode) {
2849     case float_round_nearest_even:
2850     case float_round_ties_away:
2851         increment = ((int64_t)zSig2 < 0);
2852         break;
2853     case float_round_to_zero:
2854         increment = 0;
2855         break;
2856     case float_round_up:
2857         increment = !zSign && zSig2;
2858         break;
2859     case float_round_down:
2860         increment = zSign && zSig2;
2861         break;
2862     case float_round_to_odd:
2863         increment = !(zSig1 & 0x1) && zSig2;
2864         break;
2865     default:
2866         abort();
2867     }
2868     if ( 0x7FFD <= (uint32_t) zExp ) {
2869         if (    ( 0x7FFD < zExp )
2870              || (    ( zExp == 0x7FFD )
2871                   && eq128(
2872                          LIT64( 0x0001FFFFFFFFFFFF ),
2873                          LIT64( 0xFFFFFFFFFFFFFFFF ),
2874                          zSig0,
2875                          zSig1
2876                      )
2877                   && increment
2878                 )
2879            ) {
2880             float_raise(float_flag_overflow | float_flag_inexact, status);
2881             if (    ( roundingMode == float_round_to_zero )
2882                  || ( zSign && ( roundingMode == float_round_up ) )
2883                  || ( ! zSign && ( roundingMode == float_round_down ) )
2884                  || (roundingMode == float_round_to_odd)
2885                ) {
2886                 return
2887                     packFloat128(
2888                         zSign,
2889                         0x7FFE,
2890                         LIT64( 0x0000FFFFFFFFFFFF ),
2891                         LIT64( 0xFFFFFFFFFFFFFFFF )
2892                     );
2893             }
2894             return packFloat128( zSign, 0x7FFF, 0, 0 );
2895         }
2896         if ( zExp < 0 ) {
2897             if (status->flush_to_zero) {
2898                 float_raise(float_flag_output_denormal, status);
2899                 return packFloat128(zSign, 0, 0, 0);
2900             }
2901             isTiny =
2902                    (status->float_detect_tininess
2903                     == float_tininess_before_rounding)
2904                 || ( zExp < -1 )
2905                 || ! increment
2906                 || lt128(
2907                        zSig0,
2908                        zSig1,
2909                        LIT64( 0x0001FFFFFFFFFFFF ),
2910                        LIT64( 0xFFFFFFFFFFFFFFFF )
2911                    );
2912             shift128ExtraRightJamming(
2913                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
2914             zExp = 0;
2915             if (isTiny && zSig2) {
2916                 float_raise(float_flag_underflow, status);
2917             }
2918             switch (roundingMode) {
2919             case float_round_nearest_even:
2920             case float_round_ties_away:
2921                 increment = ((int64_t)zSig2 < 0);
2922                 break;
2923             case float_round_to_zero:
2924                 increment = 0;
2925                 break;
2926             case float_round_up:
2927                 increment = !zSign && zSig2;
2928                 break;
2929             case float_round_down:
2930                 increment = zSign && zSig2;
2931                 break;
2932             case float_round_to_odd:
2933                 increment = !(zSig1 & 0x1) && zSig2;
2934                 break;
2935             default:
2936                 abort();
2937             }
2938         }
2939     }
2940     if (zSig2) {
2941         status->float_exception_flags |= float_flag_inexact;
2942     }
2943     if ( increment ) {
2944         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
2945         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
2946     }
2947     else {
2948         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
2949     }
2950     return packFloat128( zSign, zExp, zSig0, zSig1 );
2951 
2952 }
2953 
2954 /*----------------------------------------------------------------------------
2955 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2956 | and significand formed by the concatenation of `zSig0' and `zSig1', and
2957 | returns the proper quadruple-precision floating-point value corresponding
2958 | to the abstract input.  This routine is just like `roundAndPackFloat128'
2959 | except that the input significand has fewer bits and does not have to be
2960 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
2961 | point exponent.
2962 *----------------------------------------------------------------------------*/
2963 
2964 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
2965                                               uint64_t zSig0, uint64_t zSig1,
2966                                               float_status *status)
2967 {
2968     int8_t shiftCount;
2969     uint64_t zSig2;
2970 
2971     if ( zSig0 == 0 ) {
2972         zSig0 = zSig1;
2973         zSig1 = 0;
2974         zExp -= 64;
2975     }
2976     shiftCount = countLeadingZeros64( zSig0 ) - 15;
2977     if ( 0 <= shiftCount ) {
2978         zSig2 = 0;
2979         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2980     }
2981     else {
2982         shift128ExtraRightJamming(
2983             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
2984     }
2985     zExp -= shiftCount;
2986     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
2987 
2988 }
2989 
2990 
2991 /*----------------------------------------------------------------------------
2992 | Returns the result of converting the 32-bit two's complement integer `a'
2993 | to the extended double-precision floating-point format.  The conversion
2994 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2995 | Arithmetic.
2996 *----------------------------------------------------------------------------*/
2997 
2998 floatx80 int32_to_floatx80(int32_t a, float_status *status)
2999 {
3000     flag zSign;
3001     uint32_t absA;
3002     int8_t shiftCount;
3003     uint64_t zSig;
3004 
3005     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3006     zSign = ( a < 0 );
3007     absA = zSign ? - a : a;
3008     shiftCount = countLeadingZeros32( absA ) + 32;
3009     zSig = absA;
3010     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3011 
3012 }
3013 
3014 /*----------------------------------------------------------------------------
3015 | Returns the result of converting the 32-bit two's complement integer `a' to
3016 | the quadruple-precision floating-point format.  The conversion is performed
3017 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3018 *----------------------------------------------------------------------------*/
3019 
3020 float128 int32_to_float128(int32_t a, float_status *status)
3021 {
3022     flag zSign;
3023     uint32_t absA;
3024     int8_t shiftCount;
3025     uint64_t zSig0;
3026 
3027     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3028     zSign = ( a < 0 );
3029     absA = zSign ? - a : a;
3030     shiftCount = countLeadingZeros32( absA ) + 17;
3031     zSig0 = absA;
3032     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3033 
3034 }
3035 
3036 /*----------------------------------------------------------------------------
3037 | Returns the result of converting the 64-bit two's complement integer `a'
3038 | to the extended double-precision floating-point format.  The conversion
3039 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3040 | Arithmetic.
3041 *----------------------------------------------------------------------------*/
3042 
3043 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3044 {
3045     flag zSign;
3046     uint64_t absA;
3047     int8_t shiftCount;
3048 
3049     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3050     zSign = ( a < 0 );
3051     absA = zSign ? - a : a;
3052     shiftCount = countLeadingZeros64( absA );
3053     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3054 
3055 }
3056 
3057 /*----------------------------------------------------------------------------
3058 | Returns the result of converting the 64-bit two's complement integer `a' to
3059 | the quadruple-precision floating-point format.  The conversion is performed
3060 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3061 *----------------------------------------------------------------------------*/
3062 
3063 float128 int64_to_float128(int64_t a, float_status *status)
3064 {
3065     flag zSign;
3066     uint64_t absA;
3067     int8_t shiftCount;
3068     int32_t zExp;
3069     uint64_t zSig0, zSig1;
3070 
3071     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3072     zSign = ( a < 0 );
3073     absA = zSign ? - a : a;
3074     shiftCount = countLeadingZeros64( absA ) + 49;
3075     zExp = 0x406E - shiftCount;
3076     if ( 64 <= shiftCount ) {
3077         zSig1 = 0;
3078         zSig0 = absA;
3079         shiftCount -= 64;
3080     }
3081     else {
3082         zSig1 = absA;
3083         zSig0 = 0;
3084     }
3085     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3086     return packFloat128( zSign, zExp, zSig0, zSig1 );
3087 
3088 }
3089 
3090 /*----------------------------------------------------------------------------
3091 | Returns the result of converting the 64-bit unsigned integer `a'
3092 | to the quadruple-precision floating-point format.  The conversion is performed
3093 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3094 *----------------------------------------------------------------------------*/
3095 
3096 float128 uint64_to_float128(uint64_t a, float_status *status)
3097 {
3098     if (a == 0) {
3099         return float128_zero;
3100     }
3101     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
3102 }
3103 
3104 
3105 
3106 
3107 /*----------------------------------------------------------------------------
3108 | Returns the result of converting the single-precision floating-point value
3109 | `a' to the double-precision floating-point format.  The conversion is
3110 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3111 | Arithmetic.
3112 *----------------------------------------------------------------------------*/
3113 
3114 float64 float32_to_float64(float32 a, float_status *status)
3115 {
3116     flag aSign;
3117     int aExp;
3118     uint32_t aSig;
3119     a = float32_squash_input_denormal(a, status);
3120 
3121     aSig = extractFloat32Frac( a );
3122     aExp = extractFloat32Exp( a );
3123     aSign = extractFloat32Sign( a );
3124     if ( aExp == 0xFF ) {
3125         if (aSig) {
3126             return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
3127         }
3128         return packFloat64( aSign, 0x7FF, 0 );
3129     }
3130     if ( aExp == 0 ) {
3131         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
3132         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3133         --aExp;
3134     }
3135     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
3136 
3137 }
3138 
3139 /*----------------------------------------------------------------------------
3140 | Returns the result of converting the single-precision floating-point value
3141 | `a' to the extended double-precision floating-point format.  The conversion
3142 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3143 | Arithmetic.
3144 *----------------------------------------------------------------------------*/
3145 
3146 floatx80 float32_to_floatx80(float32 a, float_status *status)
3147 {
3148     flag aSign;
3149     int aExp;
3150     uint32_t aSig;
3151 
3152     a = float32_squash_input_denormal(a, status);
3153     aSig = extractFloat32Frac( a );
3154     aExp = extractFloat32Exp( a );
3155     aSign = extractFloat32Sign( a );
3156     if ( aExp == 0xFF ) {
3157         if (aSig) {
3158             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3159         }
3160         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3161     }
3162     if ( aExp == 0 ) {
3163         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3164         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3165     }
3166     aSig |= 0x00800000;
3167     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3168 
3169 }
3170 
3171 /*----------------------------------------------------------------------------
3172 | Returns the result of converting the single-precision floating-point value
3173 | `a' to the double-precision floating-point format.  The conversion is
3174 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3175 | Arithmetic.
3176 *----------------------------------------------------------------------------*/
3177 
3178 float128 float32_to_float128(float32 a, float_status *status)
3179 {
3180     flag aSign;
3181     int aExp;
3182     uint32_t aSig;
3183 
3184     a = float32_squash_input_denormal(a, status);
3185     aSig = extractFloat32Frac( a );
3186     aExp = extractFloat32Exp( a );
3187     aSign = extractFloat32Sign( a );
3188     if ( aExp == 0xFF ) {
3189         if (aSig) {
3190             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3191         }
3192         return packFloat128( aSign, 0x7FFF, 0, 0 );
3193     }
3194     if ( aExp == 0 ) {
3195         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3196         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3197         --aExp;
3198     }
3199     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3200 
3201 }
3202 
3203 /*----------------------------------------------------------------------------
3204 | Returns the remainder of the single-precision floating-point value `a'
3205 | with respect to the corresponding value `b'.  The operation is performed
3206 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3207 *----------------------------------------------------------------------------*/
3208 
3209 float32 float32_rem(float32 a, float32 b, float_status *status)
3210 {
3211     flag aSign, zSign;
3212     int aExp, bExp, expDiff;
3213     uint32_t aSig, bSig;
3214     uint32_t q;
3215     uint64_t aSig64, bSig64, q64;
3216     uint32_t alternateASig;
3217     int32_t sigMean;
3218     a = float32_squash_input_denormal(a, status);
3219     b = float32_squash_input_denormal(b, status);
3220 
3221     aSig = extractFloat32Frac( a );
3222     aExp = extractFloat32Exp( a );
3223     aSign = extractFloat32Sign( a );
3224     bSig = extractFloat32Frac( b );
3225     bExp = extractFloat32Exp( b );
3226     if ( aExp == 0xFF ) {
3227         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3228             return propagateFloat32NaN(a, b, status);
3229         }
3230         float_raise(float_flag_invalid, status);
3231         return float32_default_nan(status);
3232     }
3233     if ( bExp == 0xFF ) {
3234         if (bSig) {
3235             return propagateFloat32NaN(a, b, status);
3236         }
3237         return a;
3238     }
3239     if ( bExp == 0 ) {
3240         if ( bSig == 0 ) {
3241             float_raise(float_flag_invalid, status);
3242             return float32_default_nan(status);
3243         }
3244         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3245     }
3246     if ( aExp == 0 ) {
3247         if ( aSig == 0 ) return a;
3248         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3249     }
3250     expDiff = aExp - bExp;
3251     aSig |= 0x00800000;
3252     bSig |= 0x00800000;
3253     if ( expDiff < 32 ) {
3254         aSig <<= 8;
3255         bSig <<= 8;
3256         if ( expDiff < 0 ) {
3257             if ( expDiff < -1 ) return a;
3258             aSig >>= 1;
3259         }
3260         q = ( bSig <= aSig );
3261         if ( q ) aSig -= bSig;
3262         if ( 0 < expDiff ) {
3263             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3264             q >>= 32 - expDiff;
3265             bSig >>= 2;
3266             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3267         }
3268         else {
3269             aSig >>= 2;
3270             bSig >>= 2;
3271         }
3272     }
3273     else {
3274         if ( bSig <= aSig ) aSig -= bSig;
3275         aSig64 = ( (uint64_t) aSig )<<40;
3276         bSig64 = ( (uint64_t) bSig )<<40;
3277         expDiff -= 64;
3278         while ( 0 < expDiff ) {
3279             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3280             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3281             aSig64 = - ( ( bSig * q64 )<<38 );
3282             expDiff -= 62;
3283         }
3284         expDiff += 64;
3285         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3286         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3287         q = q64>>( 64 - expDiff );
3288         bSig <<= 6;
3289         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3290     }
3291     do {
3292         alternateASig = aSig;
3293         ++q;
3294         aSig -= bSig;
3295     } while ( 0 <= (int32_t) aSig );
3296     sigMean = aSig + alternateASig;
3297     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3298         aSig = alternateASig;
3299     }
3300     zSign = ( (int32_t) aSig < 0 );
3301     if ( zSign ) aSig = - aSig;
3302     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3303 }
3304 
3305 
3306 /*----------------------------------------------------------------------------
3307 | Returns the square root of the single-precision floating-point value `a'.
3308 | The operation is performed according to the IEC/IEEE Standard for Binary
3309 | Floating-Point Arithmetic.
3310 *----------------------------------------------------------------------------*/
3311 
3312 float32 float32_sqrt(float32 a, float_status *status)
3313 {
3314     flag aSign;
3315     int aExp, zExp;
3316     uint32_t aSig, zSig;
3317     uint64_t rem, term;
3318     a = float32_squash_input_denormal(a, status);
3319 
3320     aSig = extractFloat32Frac( a );
3321     aExp = extractFloat32Exp( a );
3322     aSign = extractFloat32Sign( a );
3323     if ( aExp == 0xFF ) {
3324         if (aSig) {
3325             return propagateFloat32NaN(a, float32_zero, status);
3326         }
3327         if ( ! aSign ) return a;
3328         float_raise(float_flag_invalid, status);
3329         return float32_default_nan(status);
3330     }
3331     if ( aSign ) {
3332         if ( ( aExp | aSig ) == 0 ) return a;
3333         float_raise(float_flag_invalid, status);
3334         return float32_default_nan(status);
3335     }
3336     if ( aExp == 0 ) {
3337         if ( aSig == 0 ) return float32_zero;
3338         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3339     }
3340     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
3341     aSig = ( aSig | 0x00800000 )<<8;
3342     zSig = estimateSqrt32( aExp, aSig ) + 2;
3343     if ( ( zSig & 0x7F ) <= 5 ) {
3344         if ( zSig < 2 ) {
3345             zSig = 0x7FFFFFFF;
3346             goto roundAndPack;
3347         }
3348         aSig >>= aExp & 1;
3349         term = ( (uint64_t) zSig ) * zSig;
3350         rem = ( ( (uint64_t) aSig )<<32 ) - term;
3351         while ( (int64_t) rem < 0 ) {
3352             --zSig;
3353             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
3354         }
3355         zSig |= ( rem != 0 );
3356     }
3357     shift32RightJamming( zSig, 1, &zSig );
3358  roundAndPack:
3359     return roundAndPackFloat32(0, zExp, zSig, status);
3360 
3361 }
3362 
3363 /*----------------------------------------------------------------------------
3364 | Returns the binary exponential of the single-precision floating-point value
3365 | `a'. The operation is performed according to the IEC/IEEE Standard for
3366 | Binary Floating-Point Arithmetic.
3367 |
3368 | Uses the following identities:
3369 |
3370 | 1. -------------------------------------------------------------------------
3371 |      x    x*ln(2)
3372 |     2  = e
3373 |
3374 | 2. -------------------------------------------------------------------------
3375 |                      2     3     4     5           n
3376 |      x        x     x     x     x     x           x
3377 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3378 |               1!    2!    3!    4!    5!          n!
3379 *----------------------------------------------------------------------------*/
3380 
3381 static const float64 float32_exp2_coefficients[15] =
3382 {
3383     const_float64( 0x3ff0000000000000ll ), /*  1 */
3384     const_float64( 0x3fe0000000000000ll ), /*  2 */
3385     const_float64( 0x3fc5555555555555ll ), /*  3 */
3386     const_float64( 0x3fa5555555555555ll ), /*  4 */
3387     const_float64( 0x3f81111111111111ll ), /*  5 */
3388     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
3389     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
3390     const_float64( 0x3efa01a01a01a01all ), /*  8 */
3391     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
3392     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3393     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3394     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3395     const_float64( 0x3de6124613a86d09ll ), /* 13 */
3396     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3397     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3398 };
3399 
3400 float32 float32_exp2(float32 a, float_status *status)
3401 {
3402     flag aSign;
3403     int aExp;
3404     uint32_t aSig;
3405     float64 r, x, xn;
3406     int i;
3407     a = float32_squash_input_denormal(a, status);
3408 
3409     aSig = extractFloat32Frac( a );
3410     aExp = extractFloat32Exp( a );
3411     aSign = extractFloat32Sign( a );
3412 
3413     if ( aExp == 0xFF) {
3414         if (aSig) {
3415             return propagateFloat32NaN(a, float32_zero, status);
3416         }
3417         return (aSign) ? float32_zero : a;
3418     }
3419     if (aExp == 0) {
3420         if (aSig == 0) return float32_one;
3421     }
3422 
3423     float_raise(float_flag_inexact, status);
3424 
3425     /* ******************************* */
3426     /* using float64 for approximation */
3427     /* ******************************* */
3428     x = float32_to_float64(a, status);
3429     x = float64_mul(x, float64_ln2, status);
3430 
3431     xn = x;
3432     r = float64_one;
3433     for (i = 0 ; i < 15 ; i++) {
3434         float64 f;
3435 
3436         f = float64_mul(xn, float32_exp2_coefficients[i], status);
3437         r = float64_add(r, f, status);
3438 
3439         xn = float64_mul(xn, x, status);
3440     }
3441 
3442     return float64_to_float32(r, status);
3443 }
3444 
3445 /*----------------------------------------------------------------------------
3446 | Returns the binary log of the single-precision floating-point value `a'.
3447 | The operation is performed according to the IEC/IEEE Standard for Binary
3448 | Floating-Point Arithmetic.
3449 *----------------------------------------------------------------------------*/
3450 float32 float32_log2(float32 a, float_status *status)
3451 {
3452     flag aSign, zSign;
3453     int aExp;
3454     uint32_t aSig, zSig, i;
3455 
3456     a = float32_squash_input_denormal(a, status);
3457     aSig = extractFloat32Frac( a );
3458     aExp = extractFloat32Exp( a );
3459     aSign = extractFloat32Sign( a );
3460 
3461     if ( aExp == 0 ) {
3462         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3463         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3464     }
3465     if ( aSign ) {
3466         float_raise(float_flag_invalid, status);
3467         return float32_default_nan(status);
3468     }
3469     if ( aExp == 0xFF ) {
3470         if (aSig) {
3471             return propagateFloat32NaN(a, float32_zero, status);
3472         }
3473         return a;
3474     }
3475 
3476     aExp -= 0x7F;
3477     aSig |= 0x00800000;
3478     zSign = aExp < 0;
3479     zSig = aExp << 23;
3480 
3481     for (i = 1 << 22; i > 0; i >>= 1) {
3482         aSig = ( (uint64_t)aSig * aSig ) >> 23;
3483         if ( aSig & 0x01000000 ) {
3484             aSig >>= 1;
3485             zSig |= i;
3486         }
3487     }
3488 
3489     if ( zSign )
3490         zSig = -zSig;
3491 
3492     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3493 }
3494 
3495 /*----------------------------------------------------------------------------
3496 | Returns 1 if the single-precision floating-point value `a' is equal to
3497 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3498 | raised if either operand is a NaN.  Otherwise, the comparison is performed
3499 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3500 *----------------------------------------------------------------------------*/
3501 
3502 int float32_eq(float32 a, float32 b, float_status *status)
3503 {
3504     uint32_t av, bv;
3505     a = float32_squash_input_denormal(a, status);
3506     b = float32_squash_input_denormal(b, status);
3507 
3508     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3509          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3510        ) {
3511         float_raise(float_flag_invalid, status);
3512         return 0;
3513     }
3514     av = float32_val(a);
3515     bv = float32_val(b);
3516     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3517 }
3518 
3519 /*----------------------------------------------------------------------------
3520 | Returns 1 if the single-precision floating-point value `a' is less than
3521 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
3522 | exception is raised if either operand is a NaN.  The comparison is performed
3523 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3524 *----------------------------------------------------------------------------*/
3525 
3526 int float32_le(float32 a, float32 b, float_status *status)
3527 {
3528     flag aSign, bSign;
3529     uint32_t av, bv;
3530     a = float32_squash_input_denormal(a, status);
3531     b = float32_squash_input_denormal(b, status);
3532 
3533     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3534          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3535        ) {
3536         float_raise(float_flag_invalid, status);
3537         return 0;
3538     }
3539     aSign = extractFloat32Sign( a );
3540     bSign = extractFloat32Sign( b );
3541     av = float32_val(a);
3542     bv = float32_val(b);
3543     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3544     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3545 
3546 }
3547 
3548 /*----------------------------------------------------------------------------
3549 | Returns 1 if the single-precision floating-point value `a' is less than
3550 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3551 | raised if either operand is a NaN.  The comparison is performed according
3552 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3553 *----------------------------------------------------------------------------*/
3554 
3555 int float32_lt(float32 a, float32 b, float_status *status)
3556 {
3557     flag aSign, bSign;
3558     uint32_t av, bv;
3559     a = float32_squash_input_denormal(a, status);
3560     b = float32_squash_input_denormal(b, status);
3561 
3562     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3563          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3564        ) {
3565         float_raise(float_flag_invalid, status);
3566         return 0;
3567     }
3568     aSign = extractFloat32Sign( a );
3569     bSign = extractFloat32Sign( b );
3570     av = float32_val(a);
3571     bv = float32_val(b);
3572     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3573     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3574 
3575 }
3576 
3577 /*----------------------------------------------------------------------------
3578 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3579 | be compared, and 0 otherwise.  The invalid exception is raised if either
3580 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
3581 | Standard for Binary Floating-Point Arithmetic.
3582 *----------------------------------------------------------------------------*/
3583 
3584 int float32_unordered(float32 a, float32 b, float_status *status)
3585 {
3586     a = float32_squash_input_denormal(a, status);
3587     b = float32_squash_input_denormal(b, status);
3588 
3589     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3590          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3591        ) {
3592         float_raise(float_flag_invalid, status);
3593         return 1;
3594     }
3595     return 0;
3596 }
3597 
3598 /*----------------------------------------------------------------------------
3599 | Returns 1 if the single-precision floating-point value `a' is equal to
3600 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3601 | exception.  The comparison is performed according to the IEC/IEEE Standard
3602 | for Binary Floating-Point Arithmetic.
3603 *----------------------------------------------------------------------------*/
3604 
3605 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3606 {
3607     a = float32_squash_input_denormal(a, status);
3608     b = float32_squash_input_denormal(b, status);
3609 
3610     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3611          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3612        ) {
3613         if (float32_is_signaling_nan(a, status)
3614          || float32_is_signaling_nan(b, status)) {
3615             float_raise(float_flag_invalid, status);
3616         }
3617         return 0;
3618     }
3619     return ( float32_val(a) == float32_val(b) ) ||
3620             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3621 }
3622 
3623 /*----------------------------------------------------------------------------
3624 | Returns 1 if the single-precision floating-point value `a' is less than or
3625 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3626 | cause an exception.  Otherwise, the comparison is performed according to the
3627 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3628 *----------------------------------------------------------------------------*/
3629 
3630 int float32_le_quiet(float32 a, float32 b, float_status *status)
3631 {
3632     flag aSign, bSign;
3633     uint32_t av, bv;
3634     a = float32_squash_input_denormal(a, status);
3635     b = float32_squash_input_denormal(b, status);
3636 
3637     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3638          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3639        ) {
3640         if (float32_is_signaling_nan(a, status)
3641          || float32_is_signaling_nan(b, status)) {
3642             float_raise(float_flag_invalid, status);
3643         }
3644         return 0;
3645     }
3646     aSign = extractFloat32Sign( a );
3647     bSign = extractFloat32Sign( b );
3648     av = float32_val(a);
3649     bv = float32_val(b);
3650     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3651     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3652 
3653 }
3654 
3655 /*----------------------------------------------------------------------------
3656 | Returns 1 if the single-precision floating-point value `a' is less than
3657 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3658 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3659 | Standard for Binary Floating-Point Arithmetic.
3660 *----------------------------------------------------------------------------*/
3661 
3662 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3663 {
3664     flag aSign, bSign;
3665     uint32_t av, bv;
3666     a = float32_squash_input_denormal(a, status);
3667     b = float32_squash_input_denormal(b, status);
3668 
3669     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3670          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3671        ) {
3672         if (float32_is_signaling_nan(a, status)
3673          || float32_is_signaling_nan(b, status)) {
3674             float_raise(float_flag_invalid, status);
3675         }
3676         return 0;
3677     }
3678     aSign = extractFloat32Sign( a );
3679     bSign = extractFloat32Sign( b );
3680     av = float32_val(a);
3681     bv = float32_val(b);
3682     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3683     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3684 
3685 }
3686 
3687 /*----------------------------------------------------------------------------
3688 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3689 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3690 | comparison is performed according to the IEC/IEEE Standard for Binary
3691 | Floating-Point Arithmetic.
3692 *----------------------------------------------------------------------------*/
3693 
3694 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3695 {
3696     a = float32_squash_input_denormal(a, status);
3697     b = float32_squash_input_denormal(b, status);
3698 
3699     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3700          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3701        ) {
3702         if (float32_is_signaling_nan(a, status)
3703          || float32_is_signaling_nan(b, status)) {
3704             float_raise(float_flag_invalid, status);
3705         }
3706         return 1;
3707     }
3708     return 0;
3709 }
3710 
3711 
3712 /*----------------------------------------------------------------------------
3713 | Returns the result of converting the double-precision floating-point value
3714 | `a' to the single-precision floating-point format.  The conversion is
3715 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3716 | Arithmetic.
3717 *----------------------------------------------------------------------------*/
3718 
3719 float32 float64_to_float32(float64 a, float_status *status)
3720 {
3721     flag aSign;
3722     int aExp;
3723     uint64_t aSig;
3724     uint32_t zSig;
3725     a = float64_squash_input_denormal(a, status);
3726 
3727     aSig = extractFloat64Frac( a );
3728     aExp = extractFloat64Exp( a );
3729     aSign = extractFloat64Sign( a );
3730     if ( aExp == 0x7FF ) {
3731         if (aSig) {
3732             return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3733         }
3734         return packFloat32( aSign, 0xFF, 0 );
3735     }
3736     shift64RightJamming( aSig, 22, &aSig );
3737     zSig = aSig;
3738     if ( aExp || zSig ) {
3739         zSig |= 0x40000000;
3740         aExp -= 0x381;
3741     }
3742     return roundAndPackFloat32(aSign, aExp, zSig, status);
3743 
3744 }
3745 
3746 
3747 /*----------------------------------------------------------------------------
3748 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3749 | half-precision floating-point value, returning the result.  After being
3750 | shifted into the proper positions, the three fields are simply added
3751 | together to form the result.  This means that any integer portion of `zSig'
3752 | will be added into the exponent.  Since a properly normalized significand
3753 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3754 | than the desired result exponent whenever `zSig' is a complete, normalized
3755 | significand.
3756 *----------------------------------------------------------------------------*/
3757 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3758 {
3759     return make_float16(
3760         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3761 }
3762 
3763 /*----------------------------------------------------------------------------
3764 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3765 | and significand `zSig', and returns the proper half-precision floating-
3766 | point value corresponding to the abstract input.  Ordinarily, the abstract
3767 | value is simply rounded and packed into the half-precision format, with
3768 | the inexact exception raised if the abstract input cannot be represented
3769 | exactly.  However, if the abstract value is too large, the overflow and
3770 | inexact exceptions are raised and an infinity or maximal finite value is
3771 | returned.  If the abstract value is too small, the input value is rounded to
3772 | a subnormal number, and the underflow and inexact exceptions are raised if
3773 | the abstract input cannot be represented exactly as a subnormal half-
3774 | precision floating-point number.
3775 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3776 | ARM-style "alternative representation", which omits the NaN and Inf
3777 | encodings in order to raise the maximum representable exponent by one.
3778 |     The input significand `zSig' has its binary point between bits 22
3779 | and 23, which is 13 bits to the left of the usual location.  This shifted
3780 | significand must be normalized or smaller.  If `zSig' is not normalized,
3781 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3782 | and it must not require rounding.  In the usual case that `zSig' is
3783 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3784 | Note the slightly odd position of the binary point in zSig compared with the
3785 | other roundAndPackFloat functions. This should probably be fixed if we
3786 | need to implement more float16 routines than just conversion.
3787 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3788 | Binary Floating-Point Arithmetic.
3789 *----------------------------------------------------------------------------*/
3790 
3791 static float16 roundAndPackFloat16(flag zSign, int zExp,
3792                                    uint32_t zSig, flag ieee,
3793                                    float_status *status)
3794 {
3795     int maxexp = ieee ? 29 : 30;
3796     uint32_t mask;
3797     uint32_t increment;
3798     bool rounding_bumps_exp;
3799     bool is_tiny = false;
3800 
3801     /* Calculate the mask of bits of the mantissa which are not
3802      * representable in half-precision and will be lost.
3803      */
3804     if (zExp < 1) {
3805         /* Will be denormal in halfprec */
3806         mask = 0x00ffffff;
3807         if (zExp >= -11) {
3808             mask >>= 11 + zExp;
3809         }
3810     } else {
3811         /* Normal number in halfprec */
3812         mask = 0x00001fff;
3813     }
3814 
3815     switch (status->float_rounding_mode) {
3816     case float_round_nearest_even:
3817         increment = (mask + 1) >> 1;
3818         if ((zSig & mask) == increment) {
3819             increment = zSig & (increment << 1);
3820         }
3821         break;
3822     case float_round_ties_away:
3823         increment = (mask + 1) >> 1;
3824         break;
3825     case float_round_up:
3826         increment = zSign ? 0 : mask;
3827         break;
3828     case float_round_down:
3829         increment = zSign ? mask : 0;
3830         break;
3831     default: /* round_to_zero */
3832         increment = 0;
3833         break;
3834     }
3835 
3836     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3837 
3838     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3839         if (ieee) {
3840             float_raise(float_flag_overflow | float_flag_inexact, status);
3841             return packFloat16(zSign, 0x1f, 0);
3842         } else {
3843             float_raise(float_flag_invalid, status);
3844             return packFloat16(zSign, 0x1f, 0x3ff);
3845         }
3846     }
3847 
3848     if (zExp < 0) {
3849         /* Note that flush-to-zero does not affect half-precision results */
3850         is_tiny =
3851             (status->float_detect_tininess == float_tininess_before_rounding)
3852             || (zExp < -1)
3853             || (!rounding_bumps_exp);
3854     }
3855     if (zSig & mask) {
3856         float_raise(float_flag_inexact, status);
3857         if (is_tiny) {
3858             float_raise(float_flag_underflow, status);
3859         }
3860     }
3861 
3862     zSig += increment;
3863     if (rounding_bumps_exp) {
3864         zSig >>= 1;
3865         zExp++;
3866     }
3867 
3868     if (zExp < -10) {
3869         return packFloat16(zSign, 0, 0);
3870     }
3871     if (zExp < 0) {
3872         zSig >>= -zExp;
3873         zExp = 0;
3874     }
3875     return packFloat16(zSign, zExp, zSig >> 13);
3876 }
3877 
3878 /*----------------------------------------------------------------------------
3879 | If `a' is denormal and we are in flush-to-zero mode then set the
3880 | input-denormal exception and return zero. Otherwise just return the value.
3881 *----------------------------------------------------------------------------*/
3882 float16 float16_squash_input_denormal(float16 a, float_status *status)
3883 {
3884     if (status->flush_inputs_to_zero) {
3885         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3886             float_raise(float_flag_input_denormal, status);
3887             return make_float16(float16_val(a) & 0x8000);
3888         }
3889     }
3890     return a;
3891 }
3892 
3893 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
3894                                       uint32_t *zSigPtr)
3895 {
3896     int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3897     *zSigPtr = aSig << shiftCount;
3898     *zExpPtr = 1 - shiftCount;
3899 }
3900 
3901 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3902    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3903 
3904 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3905 {
3906     flag aSign;
3907     int aExp;
3908     uint32_t aSig;
3909 
3910     aSign = extractFloat16Sign(a);
3911     aExp = extractFloat16Exp(a);
3912     aSig = extractFloat16Frac(a);
3913 
3914     if (aExp == 0x1f && ieee) {
3915         if (aSig) {
3916             return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3917         }
3918         return packFloat32(aSign, 0xff, 0);
3919     }
3920     if (aExp == 0) {
3921         if (aSig == 0) {
3922             return packFloat32(aSign, 0, 0);
3923         }
3924 
3925         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3926         aExp--;
3927     }
3928     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3929 }
3930 
3931 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3932 {
3933     flag aSign;
3934     int aExp;
3935     uint32_t aSig;
3936 
3937     a = float32_squash_input_denormal(a, status);
3938 
3939     aSig = extractFloat32Frac( a );
3940     aExp = extractFloat32Exp( a );
3941     aSign = extractFloat32Sign( a );
3942     if ( aExp == 0xFF ) {
3943         if (aSig) {
3944             /* Input is a NaN */
3945             if (!ieee) {
3946                 float_raise(float_flag_invalid, status);
3947                 return packFloat16(aSign, 0, 0);
3948             }
3949             return commonNaNToFloat16(
3950                 float32ToCommonNaN(a, status), status);
3951         }
3952         /* Infinity */
3953         if (!ieee) {
3954             float_raise(float_flag_invalid, status);
3955             return packFloat16(aSign, 0x1f, 0x3ff);
3956         }
3957         return packFloat16(aSign, 0x1f, 0);
3958     }
3959     if (aExp == 0 && aSig == 0) {
3960         return packFloat16(aSign, 0, 0);
3961     }
3962     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3963      * even if the input is denormal; however this is harmless because
3964      * the largest possible single-precision denormal is still smaller
3965      * than the smallest representable half-precision denormal, and so we
3966      * will end up ignoring aSig and returning via the "always return zero"
3967      * codepath.
3968      */
3969     aSig |= 0x00800000;
3970     aExp -= 0x71;
3971 
3972     return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3973 }
3974 
3975 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3976 {
3977     flag aSign;
3978     int aExp;
3979     uint32_t aSig;
3980 
3981     aSign = extractFloat16Sign(a);
3982     aExp = extractFloat16Exp(a);
3983     aSig = extractFloat16Frac(a);
3984 
3985     if (aExp == 0x1f && ieee) {
3986         if (aSig) {
3987             return commonNaNToFloat64(
3988                 float16ToCommonNaN(a, status), status);
3989         }
3990         return packFloat64(aSign, 0x7ff, 0);
3991     }
3992     if (aExp == 0) {
3993         if (aSig == 0) {
3994             return packFloat64(aSign, 0, 0);
3995         }
3996 
3997         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3998         aExp--;
3999     }
4000     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
4001 }
4002 
4003 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
4004 {
4005     flag aSign;
4006     int aExp;
4007     uint64_t aSig;
4008     uint32_t zSig;
4009 
4010     a = float64_squash_input_denormal(a, status);
4011 
4012     aSig = extractFloat64Frac(a);
4013     aExp = extractFloat64Exp(a);
4014     aSign = extractFloat64Sign(a);
4015     if (aExp == 0x7FF) {
4016         if (aSig) {
4017             /* Input is a NaN */
4018             if (!ieee) {
4019                 float_raise(float_flag_invalid, status);
4020                 return packFloat16(aSign, 0, 0);
4021             }
4022             return commonNaNToFloat16(
4023                 float64ToCommonNaN(a, status), status);
4024         }
4025         /* Infinity */
4026         if (!ieee) {
4027             float_raise(float_flag_invalid, status);
4028             return packFloat16(aSign, 0x1f, 0x3ff);
4029         }
4030         return packFloat16(aSign, 0x1f, 0);
4031     }
4032     shift64RightJamming(aSig, 29, &aSig);
4033     zSig = aSig;
4034     if (aExp == 0 && zSig == 0) {
4035         return packFloat16(aSign, 0, 0);
4036     }
4037     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4038      * even if the input is denormal; however this is harmless because
4039      * the largest possible single-precision denormal is still smaller
4040      * than the smallest representable half-precision denormal, and so we
4041      * will end up ignoring aSig and returning via the "always return zero"
4042      * codepath.
4043      */
4044     zSig |= 0x00800000;
4045     aExp -= 0x3F1;
4046 
4047     return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
4048 }
4049 
4050 /*----------------------------------------------------------------------------
4051 | Returns the result of converting the double-precision floating-point value
4052 | `a' to the extended double-precision floating-point format.  The conversion
4053 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4054 | Arithmetic.
4055 *----------------------------------------------------------------------------*/
4056 
4057 floatx80 float64_to_floatx80(float64 a, float_status *status)
4058 {
4059     flag aSign;
4060     int aExp;
4061     uint64_t aSig;
4062 
4063     a = float64_squash_input_denormal(a, status);
4064     aSig = extractFloat64Frac( a );
4065     aExp = extractFloat64Exp( a );
4066     aSign = extractFloat64Sign( a );
4067     if ( aExp == 0x7FF ) {
4068         if (aSig) {
4069             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4070         }
4071         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4072     }
4073     if ( aExp == 0 ) {
4074         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4075         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4076     }
4077     return
4078         packFloatx80(
4079             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4080 
4081 }
4082 
4083 /*----------------------------------------------------------------------------
4084 | Returns the result of converting the double-precision floating-point value
4085 | `a' to the quadruple-precision floating-point format.  The conversion is
4086 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4087 | Arithmetic.
4088 *----------------------------------------------------------------------------*/
4089 
4090 float128 float64_to_float128(float64 a, float_status *status)
4091 {
4092     flag aSign;
4093     int aExp;
4094     uint64_t aSig, zSig0, zSig1;
4095 
4096     a = float64_squash_input_denormal(a, status);
4097     aSig = extractFloat64Frac( a );
4098     aExp = extractFloat64Exp( a );
4099     aSign = extractFloat64Sign( a );
4100     if ( aExp == 0x7FF ) {
4101         if (aSig) {
4102             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4103         }
4104         return packFloat128( aSign, 0x7FFF, 0, 0 );
4105     }
4106     if ( aExp == 0 ) {
4107         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4108         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4109         --aExp;
4110     }
4111     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4112     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4113 
4114 }
4115 
4116 
4117 /*----------------------------------------------------------------------------
4118 | Returns the remainder of the double-precision floating-point value `a'
4119 | with respect to the corresponding value `b'.  The operation is performed
4120 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4121 *----------------------------------------------------------------------------*/
4122 
4123 float64 float64_rem(float64 a, float64 b, float_status *status)
4124 {
4125     flag aSign, zSign;
4126     int aExp, bExp, expDiff;
4127     uint64_t aSig, bSig;
4128     uint64_t q, alternateASig;
4129     int64_t sigMean;
4130 
4131     a = float64_squash_input_denormal(a, status);
4132     b = float64_squash_input_denormal(b, status);
4133     aSig = extractFloat64Frac( a );
4134     aExp = extractFloat64Exp( a );
4135     aSign = extractFloat64Sign( a );
4136     bSig = extractFloat64Frac( b );
4137     bExp = extractFloat64Exp( b );
4138     if ( aExp == 0x7FF ) {
4139         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4140             return propagateFloat64NaN(a, b, status);
4141         }
4142         float_raise(float_flag_invalid, status);
4143         return float64_default_nan(status);
4144     }
4145     if ( bExp == 0x7FF ) {
4146         if (bSig) {
4147             return propagateFloat64NaN(a, b, status);
4148         }
4149         return a;
4150     }
4151     if ( bExp == 0 ) {
4152         if ( bSig == 0 ) {
4153             float_raise(float_flag_invalid, status);
4154             return float64_default_nan(status);
4155         }
4156         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4157     }
4158     if ( aExp == 0 ) {
4159         if ( aSig == 0 ) return a;
4160         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4161     }
4162     expDiff = aExp - bExp;
4163     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4164     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4165     if ( expDiff < 0 ) {
4166         if ( expDiff < -1 ) return a;
4167         aSig >>= 1;
4168     }
4169     q = ( bSig <= aSig );
4170     if ( q ) aSig -= bSig;
4171     expDiff -= 64;
4172     while ( 0 < expDiff ) {
4173         q = estimateDiv128To64( aSig, 0, bSig );
4174         q = ( 2 < q ) ? q - 2 : 0;
4175         aSig = - ( ( bSig>>2 ) * q );
4176         expDiff -= 62;
4177     }
4178     expDiff += 64;
4179     if ( 0 < expDiff ) {
4180         q = estimateDiv128To64( aSig, 0, bSig );
4181         q = ( 2 < q ) ? q - 2 : 0;
4182         q >>= 64 - expDiff;
4183         bSig >>= 2;
4184         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4185     }
4186     else {
4187         aSig >>= 2;
4188         bSig >>= 2;
4189     }
4190     do {
4191         alternateASig = aSig;
4192         ++q;
4193         aSig -= bSig;
4194     } while ( 0 <= (int64_t) aSig );
4195     sigMean = aSig + alternateASig;
4196     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4197         aSig = alternateASig;
4198     }
4199     zSign = ( (int64_t) aSig < 0 );
4200     if ( zSign ) aSig = - aSig;
4201     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4202 
4203 }
4204 
4205 
4206 /*----------------------------------------------------------------------------
4207 | Returns the square root of the double-precision floating-point value `a'.
4208 | The operation is performed according to the IEC/IEEE Standard for Binary
4209 | Floating-Point Arithmetic.
4210 *----------------------------------------------------------------------------*/
4211 
4212 float64 float64_sqrt(float64 a, float_status *status)
4213 {
4214     flag aSign;
4215     int aExp, zExp;
4216     uint64_t aSig, zSig, doubleZSig;
4217     uint64_t rem0, rem1, term0, term1;
4218     a = float64_squash_input_denormal(a, status);
4219 
4220     aSig = extractFloat64Frac( a );
4221     aExp = extractFloat64Exp( a );
4222     aSign = extractFloat64Sign( a );
4223     if ( aExp == 0x7FF ) {
4224         if (aSig) {
4225             return propagateFloat64NaN(a, a, status);
4226         }
4227         if ( ! aSign ) return a;
4228         float_raise(float_flag_invalid, status);
4229         return float64_default_nan(status);
4230     }
4231     if ( aSign ) {
4232         if ( ( aExp | aSig ) == 0 ) return a;
4233         float_raise(float_flag_invalid, status);
4234         return float64_default_nan(status);
4235     }
4236     if ( aExp == 0 ) {
4237         if ( aSig == 0 ) return float64_zero;
4238         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4239     }
4240     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4241     aSig |= LIT64( 0x0010000000000000 );
4242     zSig = estimateSqrt32( aExp, aSig>>21 );
4243     aSig <<= 9 - ( aExp & 1 );
4244     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4245     if ( ( zSig & 0x1FF ) <= 5 ) {
4246         doubleZSig = zSig<<1;
4247         mul64To128( zSig, zSig, &term0, &term1 );
4248         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4249         while ( (int64_t) rem0 < 0 ) {
4250             --zSig;
4251             doubleZSig -= 2;
4252             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4253         }
4254         zSig |= ( ( rem0 | rem1 ) != 0 );
4255     }
4256     return roundAndPackFloat64(0, zExp, zSig, status);
4257 
4258 }
4259 
4260 /*----------------------------------------------------------------------------
4261 | Returns the binary log of the double-precision floating-point value `a'.
4262 | The operation is performed according to the IEC/IEEE Standard for Binary
4263 | Floating-Point Arithmetic.
4264 *----------------------------------------------------------------------------*/
4265 float64 float64_log2(float64 a, float_status *status)
4266 {
4267     flag aSign, zSign;
4268     int aExp;
4269     uint64_t aSig, aSig0, aSig1, zSig, i;
4270     a = float64_squash_input_denormal(a, status);
4271 
4272     aSig = extractFloat64Frac( a );
4273     aExp = extractFloat64Exp( a );
4274     aSign = extractFloat64Sign( a );
4275 
4276     if ( aExp == 0 ) {
4277         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4278         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4279     }
4280     if ( aSign ) {
4281         float_raise(float_flag_invalid, status);
4282         return float64_default_nan(status);
4283     }
4284     if ( aExp == 0x7FF ) {
4285         if (aSig) {
4286             return propagateFloat64NaN(a, float64_zero, status);
4287         }
4288         return a;
4289     }
4290 
4291     aExp -= 0x3FF;
4292     aSig |= LIT64( 0x0010000000000000 );
4293     zSign = aExp < 0;
4294     zSig = (uint64_t)aExp << 52;
4295     for (i = 1LL << 51; i > 0; i >>= 1) {
4296         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4297         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4298         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4299             aSig >>= 1;
4300             zSig |= i;
4301         }
4302     }
4303 
4304     if ( zSign )
4305         zSig = -zSig;
4306     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4307 }
4308 
4309 /*----------------------------------------------------------------------------
4310 | Returns 1 if the double-precision floating-point value `a' is equal to the
4311 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4312 | if either operand is a NaN.  Otherwise, the comparison is performed
4313 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4314 *----------------------------------------------------------------------------*/
4315 
4316 int float64_eq(float64 a, float64 b, float_status *status)
4317 {
4318     uint64_t av, bv;
4319     a = float64_squash_input_denormal(a, status);
4320     b = float64_squash_input_denormal(b, status);
4321 
4322     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4323          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4324        ) {
4325         float_raise(float_flag_invalid, status);
4326         return 0;
4327     }
4328     av = float64_val(a);
4329     bv = float64_val(b);
4330     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4331 
4332 }
4333 
4334 /*----------------------------------------------------------------------------
4335 | Returns 1 if the double-precision floating-point value `a' is less than or
4336 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4337 | exception is raised if either operand is a NaN.  The comparison is performed
4338 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4339 *----------------------------------------------------------------------------*/
4340 
4341 int float64_le(float64 a, float64 b, float_status *status)
4342 {
4343     flag aSign, bSign;
4344     uint64_t av, bv;
4345     a = float64_squash_input_denormal(a, status);
4346     b = float64_squash_input_denormal(b, status);
4347 
4348     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4349          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4350        ) {
4351         float_raise(float_flag_invalid, status);
4352         return 0;
4353     }
4354     aSign = extractFloat64Sign( a );
4355     bSign = extractFloat64Sign( b );
4356     av = float64_val(a);
4357     bv = float64_val(b);
4358     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4359     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4360 
4361 }
4362 
4363 /*----------------------------------------------------------------------------
4364 | Returns 1 if the double-precision floating-point value `a' is less than
4365 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4366 | raised if either operand is a NaN.  The comparison is performed according
4367 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4368 *----------------------------------------------------------------------------*/
4369 
4370 int float64_lt(float64 a, float64 b, float_status *status)
4371 {
4372     flag aSign, bSign;
4373     uint64_t av, bv;
4374 
4375     a = float64_squash_input_denormal(a, status);
4376     b = float64_squash_input_denormal(b, status);
4377     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4378          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4379        ) {
4380         float_raise(float_flag_invalid, status);
4381         return 0;
4382     }
4383     aSign = extractFloat64Sign( a );
4384     bSign = extractFloat64Sign( b );
4385     av = float64_val(a);
4386     bv = float64_val(b);
4387     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4388     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4389 
4390 }
4391 
4392 /*----------------------------------------------------------------------------
4393 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4394 | be compared, and 0 otherwise.  The invalid exception is raised if either
4395 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4396 | Standard for Binary Floating-Point Arithmetic.
4397 *----------------------------------------------------------------------------*/
4398 
4399 int float64_unordered(float64 a, float64 b, float_status *status)
4400 {
4401     a = float64_squash_input_denormal(a, status);
4402     b = float64_squash_input_denormal(b, status);
4403 
4404     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4405          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4406        ) {
4407         float_raise(float_flag_invalid, status);
4408         return 1;
4409     }
4410     return 0;
4411 }
4412 
4413 /*----------------------------------------------------------------------------
4414 | Returns 1 if the double-precision floating-point value `a' is equal to the
4415 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4416 | exception.The comparison is performed according to the IEC/IEEE Standard
4417 | for Binary Floating-Point Arithmetic.
4418 *----------------------------------------------------------------------------*/
4419 
4420 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4421 {
4422     uint64_t av, bv;
4423     a = float64_squash_input_denormal(a, status);
4424     b = float64_squash_input_denormal(b, status);
4425 
4426     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4427          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4428        ) {
4429         if (float64_is_signaling_nan(a, status)
4430          || float64_is_signaling_nan(b, status)) {
4431             float_raise(float_flag_invalid, status);
4432         }
4433         return 0;
4434     }
4435     av = float64_val(a);
4436     bv = float64_val(b);
4437     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4438 
4439 }
4440 
4441 /*----------------------------------------------------------------------------
4442 | Returns 1 if the double-precision floating-point value `a' is less than or
4443 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4444 | cause an exception.  Otherwise, the comparison is performed according to the
4445 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4446 *----------------------------------------------------------------------------*/
4447 
4448 int float64_le_quiet(float64 a, float64 b, float_status *status)
4449 {
4450     flag aSign, bSign;
4451     uint64_t av, bv;
4452     a = float64_squash_input_denormal(a, status);
4453     b = float64_squash_input_denormal(b, status);
4454 
4455     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4456          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4457        ) {
4458         if (float64_is_signaling_nan(a, status)
4459          || float64_is_signaling_nan(b, status)) {
4460             float_raise(float_flag_invalid, status);
4461         }
4462         return 0;
4463     }
4464     aSign = extractFloat64Sign( a );
4465     bSign = extractFloat64Sign( b );
4466     av = float64_val(a);
4467     bv = float64_val(b);
4468     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4469     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4470 
4471 }
4472 
4473 /*----------------------------------------------------------------------------
4474 | Returns 1 if the double-precision floating-point value `a' is less than
4475 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4476 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4477 | Standard for Binary Floating-Point Arithmetic.
4478 *----------------------------------------------------------------------------*/
4479 
4480 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4481 {
4482     flag aSign, bSign;
4483     uint64_t av, bv;
4484     a = float64_squash_input_denormal(a, status);
4485     b = float64_squash_input_denormal(b, status);
4486 
4487     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4488          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4489        ) {
4490         if (float64_is_signaling_nan(a, status)
4491          || float64_is_signaling_nan(b, status)) {
4492             float_raise(float_flag_invalid, status);
4493         }
4494         return 0;
4495     }
4496     aSign = extractFloat64Sign( a );
4497     bSign = extractFloat64Sign( b );
4498     av = float64_val(a);
4499     bv = float64_val(b);
4500     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4501     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4502 
4503 }
4504 
4505 /*----------------------------------------------------------------------------
4506 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4507 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4508 | comparison is performed according to the IEC/IEEE Standard for Binary
4509 | Floating-Point Arithmetic.
4510 *----------------------------------------------------------------------------*/
4511 
4512 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4513 {
4514     a = float64_squash_input_denormal(a, status);
4515     b = float64_squash_input_denormal(b, status);
4516 
4517     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4518          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4519        ) {
4520         if (float64_is_signaling_nan(a, status)
4521          || float64_is_signaling_nan(b, status)) {
4522             float_raise(float_flag_invalid, status);
4523         }
4524         return 1;
4525     }
4526     return 0;
4527 }
4528 
4529 /*----------------------------------------------------------------------------
4530 | Returns the result of converting the extended double-precision floating-
4531 | point value `a' to the 32-bit two's complement integer format.  The
4532 | conversion is performed according to the IEC/IEEE Standard for Binary
4533 | Floating-Point Arithmetic---which means in particular that the conversion
4534 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4535 | largest positive integer is returned.  Otherwise, if the conversion
4536 | overflows, the largest integer with the same sign as `a' is returned.
4537 *----------------------------------------------------------------------------*/
4538 
4539 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4540 {
4541     flag aSign;
4542     int32_t aExp, shiftCount;
4543     uint64_t aSig;
4544 
4545     if (floatx80_invalid_encoding(a)) {
4546         float_raise(float_flag_invalid, status);
4547         return 1 << 31;
4548     }
4549     aSig = extractFloatx80Frac( a );
4550     aExp = extractFloatx80Exp( a );
4551     aSign = extractFloatx80Sign( a );
4552     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4553     shiftCount = 0x4037 - aExp;
4554     if ( shiftCount <= 0 ) shiftCount = 1;
4555     shift64RightJamming( aSig, shiftCount, &aSig );
4556     return roundAndPackInt32(aSign, aSig, status);
4557 
4558 }
4559 
4560 /*----------------------------------------------------------------------------
4561 | Returns the result of converting the extended double-precision floating-
4562 | point value `a' to the 32-bit two's complement integer format.  The
4563 | conversion is performed according to the IEC/IEEE Standard for Binary
4564 | Floating-Point Arithmetic, except that the conversion is always rounded
4565 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4566 | Otherwise, if the conversion overflows, the largest integer with the same
4567 | sign as `a' is returned.
4568 *----------------------------------------------------------------------------*/
4569 
4570 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4571 {
4572     flag aSign;
4573     int32_t aExp, shiftCount;
4574     uint64_t aSig, savedASig;
4575     int32_t z;
4576 
4577     if (floatx80_invalid_encoding(a)) {
4578         float_raise(float_flag_invalid, status);
4579         return 1 << 31;
4580     }
4581     aSig = extractFloatx80Frac( a );
4582     aExp = extractFloatx80Exp( a );
4583     aSign = extractFloatx80Sign( a );
4584     if ( 0x401E < aExp ) {
4585         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4586         goto invalid;
4587     }
4588     else if ( aExp < 0x3FFF ) {
4589         if (aExp || aSig) {
4590             status->float_exception_flags |= float_flag_inexact;
4591         }
4592         return 0;
4593     }
4594     shiftCount = 0x403E - aExp;
4595     savedASig = aSig;
4596     aSig >>= shiftCount;
4597     z = aSig;
4598     if ( aSign ) z = - z;
4599     if ( ( z < 0 ) ^ aSign ) {
4600  invalid:
4601         float_raise(float_flag_invalid, status);
4602         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4603     }
4604     if ( ( aSig<<shiftCount ) != savedASig ) {
4605         status->float_exception_flags |= float_flag_inexact;
4606     }
4607     return z;
4608 
4609 }
4610 
4611 /*----------------------------------------------------------------------------
4612 | Returns the result of converting the extended double-precision floating-
4613 | point value `a' to the 64-bit two's complement integer format.  The
4614 | conversion is performed according to the IEC/IEEE Standard for Binary
4615 | Floating-Point Arithmetic---which means in particular that the conversion
4616 | is rounded according to the current rounding mode.  If `a' is a NaN,
4617 | the largest positive integer is returned.  Otherwise, if the conversion
4618 | overflows, the largest integer with the same sign as `a' is returned.
4619 *----------------------------------------------------------------------------*/
4620 
4621 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4622 {
4623     flag aSign;
4624     int32_t aExp, shiftCount;
4625     uint64_t aSig, aSigExtra;
4626 
4627     if (floatx80_invalid_encoding(a)) {
4628         float_raise(float_flag_invalid, status);
4629         return 1ULL << 63;
4630     }
4631     aSig = extractFloatx80Frac( a );
4632     aExp = extractFloatx80Exp( a );
4633     aSign = extractFloatx80Sign( a );
4634     shiftCount = 0x403E - aExp;
4635     if ( shiftCount <= 0 ) {
4636         if ( shiftCount ) {
4637             float_raise(float_flag_invalid, status);
4638             if (    ! aSign
4639                  || (    ( aExp == 0x7FFF )
4640                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
4641                ) {
4642                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4643             }
4644             return (int64_t) LIT64( 0x8000000000000000 );
4645         }
4646         aSigExtra = 0;
4647     }
4648     else {
4649         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4650     }
4651     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4652 
4653 }
4654 
4655 /*----------------------------------------------------------------------------
4656 | Returns the result of converting the extended double-precision floating-
4657 | point value `a' to the 64-bit two's complement integer format.  The
4658 | conversion is performed according to the IEC/IEEE Standard for Binary
4659 | Floating-Point Arithmetic, except that the conversion is always rounded
4660 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4661 | Otherwise, if the conversion overflows, the largest integer with the same
4662 | sign as `a' is returned.
4663 *----------------------------------------------------------------------------*/
4664 
4665 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4666 {
4667     flag aSign;
4668     int32_t aExp, shiftCount;
4669     uint64_t aSig;
4670     int64_t z;
4671 
4672     if (floatx80_invalid_encoding(a)) {
4673         float_raise(float_flag_invalid, status);
4674         return 1ULL << 63;
4675     }
4676     aSig = extractFloatx80Frac( a );
4677     aExp = extractFloatx80Exp( a );
4678     aSign = extractFloatx80Sign( a );
4679     shiftCount = aExp - 0x403E;
4680     if ( 0 <= shiftCount ) {
4681         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4682         if ( ( a.high != 0xC03E ) || aSig ) {
4683             float_raise(float_flag_invalid, status);
4684             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4685                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4686             }
4687         }
4688         return (int64_t) LIT64( 0x8000000000000000 );
4689     }
4690     else if ( aExp < 0x3FFF ) {
4691         if (aExp | aSig) {
4692             status->float_exception_flags |= float_flag_inexact;
4693         }
4694         return 0;
4695     }
4696     z = aSig>>( - shiftCount );
4697     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4698         status->float_exception_flags |= float_flag_inexact;
4699     }
4700     if ( aSign ) z = - z;
4701     return z;
4702 
4703 }
4704 
4705 /*----------------------------------------------------------------------------
4706 | Returns the result of converting the extended double-precision floating-
4707 | point value `a' to the single-precision floating-point format.  The
4708 | conversion is performed according to the IEC/IEEE Standard for Binary
4709 | Floating-Point Arithmetic.
4710 *----------------------------------------------------------------------------*/
4711 
4712 float32 floatx80_to_float32(floatx80 a, float_status *status)
4713 {
4714     flag aSign;
4715     int32_t aExp;
4716     uint64_t aSig;
4717 
4718     if (floatx80_invalid_encoding(a)) {
4719         float_raise(float_flag_invalid, status);
4720         return float32_default_nan(status);
4721     }
4722     aSig = extractFloatx80Frac( a );
4723     aExp = extractFloatx80Exp( a );
4724     aSign = extractFloatx80Sign( a );
4725     if ( aExp == 0x7FFF ) {
4726         if ( (uint64_t) ( aSig<<1 ) ) {
4727             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4728         }
4729         return packFloat32( aSign, 0xFF, 0 );
4730     }
4731     shift64RightJamming( aSig, 33, &aSig );
4732     if ( aExp || aSig ) aExp -= 0x3F81;
4733     return roundAndPackFloat32(aSign, aExp, aSig, status);
4734 
4735 }
4736 
4737 /*----------------------------------------------------------------------------
4738 | Returns the result of converting the extended double-precision floating-
4739 | point value `a' to the double-precision floating-point format.  The
4740 | conversion is performed according to the IEC/IEEE Standard for Binary
4741 | Floating-Point Arithmetic.
4742 *----------------------------------------------------------------------------*/
4743 
4744 float64 floatx80_to_float64(floatx80 a, float_status *status)
4745 {
4746     flag aSign;
4747     int32_t aExp;
4748     uint64_t aSig, zSig;
4749 
4750     if (floatx80_invalid_encoding(a)) {
4751         float_raise(float_flag_invalid, status);
4752         return float64_default_nan(status);
4753     }
4754     aSig = extractFloatx80Frac( a );
4755     aExp = extractFloatx80Exp( a );
4756     aSign = extractFloatx80Sign( a );
4757     if ( aExp == 0x7FFF ) {
4758         if ( (uint64_t) ( aSig<<1 ) ) {
4759             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4760         }
4761         return packFloat64( aSign, 0x7FF, 0 );
4762     }
4763     shift64RightJamming( aSig, 1, &zSig );
4764     if ( aExp || aSig ) aExp -= 0x3C01;
4765     return roundAndPackFloat64(aSign, aExp, zSig, status);
4766 
4767 }
4768 
4769 /*----------------------------------------------------------------------------
4770 | Returns the result of converting the extended double-precision floating-
4771 | point value `a' to the quadruple-precision floating-point format.  The
4772 | conversion is performed according to the IEC/IEEE Standard for Binary
4773 | Floating-Point Arithmetic.
4774 *----------------------------------------------------------------------------*/
4775 
4776 float128 floatx80_to_float128(floatx80 a, float_status *status)
4777 {
4778     flag aSign;
4779     int aExp;
4780     uint64_t aSig, zSig0, zSig1;
4781 
4782     if (floatx80_invalid_encoding(a)) {
4783         float_raise(float_flag_invalid, status);
4784         return float128_default_nan(status);
4785     }
4786     aSig = extractFloatx80Frac( a );
4787     aExp = extractFloatx80Exp( a );
4788     aSign = extractFloatx80Sign( a );
4789     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4790         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4791     }
4792     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4793     return packFloat128( aSign, aExp, zSig0, zSig1 );
4794 
4795 }
4796 
4797 /*----------------------------------------------------------------------------
4798 | Rounds the extended double-precision floating-point value `a'
4799 | to the precision provided by floatx80_rounding_precision and returns the
4800 | result as an extended double-precision floating-point value.
4801 | The operation is performed according to the IEC/IEEE Standard for Binary
4802 | Floating-Point Arithmetic.
4803 *----------------------------------------------------------------------------*/
4804 
4805 floatx80 floatx80_round(floatx80 a, float_status *status)
4806 {
4807     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4808                                 extractFloatx80Sign(a),
4809                                 extractFloatx80Exp(a),
4810                                 extractFloatx80Frac(a), 0, status);
4811 }
4812 
4813 /*----------------------------------------------------------------------------
4814 | Rounds the extended double-precision floating-point value `a' to an integer,
4815 | and returns the result as an extended quadruple-precision floating-point
4816 | value.  The operation is performed according to the IEC/IEEE Standard for
4817 | Binary Floating-Point Arithmetic.
4818 *----------------------------------------------------------------------------*/
4819 
4820 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4821 {
4822     flag aSign;
4823     int32_t aExp;
4824     uint64_t lastBitMask, roundBitsMask;
4825     floatx80 z;
4826 
4827     if (floatx80_invalid_encoding(a)) {
4828         float_raise(float_flag_invalid, status);
4829         return floatx80_default_nan(status);
4830     }
4831     aExp = extractFloatx80Exp( a );
4832     if ( 0x403E <= aExp ) {
4833         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4834             return propagateFloatx80NaN(a, a, status);
4835         }
4836         return a;
4837     }
4838     if ( aExp < 0x3FFF ) {
4839         if (    ( aExp == 0 )
4840              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4841             return a;
4842         }
4843         status->float_exception_flags |= float_flag_inexact;
4844         aSign = extractFloatx80Sign( a );
4845         switch (status->float_rounding_mode) {
4846          case float_round_nearest_even:
4847             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4848                ) {
4849                 return
4850                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4851             }
4852             break;
4853         case float_round_ties_away:
4854             if (aExp == 0x3FFE) {
4855                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4856             }
4857             break;
4858          case float_round_down:
4859             return
4860                   aSign ?
4861                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4862                 : packFloatx80( 0, 0, 0 );
4863          case float_round_up:
4864             return
4865                   aSign ? packFloatx80( 1, 0, 0 )
4866                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4867         }
4868         return packFloatx80( aSign, 0, 0 );
4869     }
4870     lastBitMask = 1;
4871     lastBitMask <<= 0x403E - aExp;
4872     roundBitsMask = lastBitMask - 1;
4873     z = a;
4874     switch (status->float_rounding_mode) {
4875     case float_round_nearest_even:
4876         z.low += lastBitMask>>1;
4877         if ((z.low & roundBitsMask) == 0) {
4878             z.low &= ~lastBitMask;
4879         }
4880         break;
4881     case float_round_ties_away:
4882         z.low += lastBitMask >> 1;
4883         break;
4884     case float_round_to_zero:
4885         break;
4886     case float_round_up:
4887         if (!extractFloatx80Sign(z)) {
4888             z.low += roundBitsMask;
4889         }
4890         break;
4891     case float_round_down:
4892         if (extractFloatx80Sign(z)) {
4893             z.low += roundBitsMask;
4894         }
4895         break;
4896     default:
4897         abort();
4898     }
4899     z.low &= ~ roundBitsMask;
4900     if ( z.low == 0 ) {
4901         ++z.high;
4902         z.low = LIT64( 0x8000000000000000 );
4903     }
4904     if (z.low != a.low) {
4905         status->float_exception_flags |= float_flag_inexact;
4906     }
4907     return z;
4908 
4909 }
4910 
4911 /*----------------------------------------------------------------------------
4912 | Returns the result of adding the absolute values of the extended double-
4913 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4914 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4915 | The addition is performed according to the IEC/IEEE Standard for Binary
4916 | Floating-Point Arithmetic.
4917 *----------------------------------------------------------------------------*/
4918 
4919 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4920                                 float_status *status)
4921 {
4922     int32_t aExp, bExp, zExp;
4923     uint64_t aSig, bSig, zSig0, zSig1;
4924     int32_t expDiff;
4925 
4926     aSig = extractFloatx80Frac( a );
4927     aExp = extractFloatx80Exp( a );
4928     bSig = extractFloatx80Frac( b );
4929     bExp = extractFloatx80Exp( b );
4930     expDiff = aExp - bExp;
4931     if ( 0 < expDiff ) {
4932         if ( aExp == 0x7FFF ) {
4933             if ((uint64_t)(aSig << 1)) {
4934                 return propagateFloatx80NaN(a, b, status);
4935             }
4936             return a;
4937         }
4938         if ( bExp == 0 ) --expDiff;
4939         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4940         zExp = aExp;
4941     }
4942     else if ( expDiff < 0 ) {
4943         if ( bExp == 0x7FFF ) {
4944             if ((uint64_t)(bSig << 1)) {
4945                 return propagateFloatx80NaN(a, b, status);
4946             }
4947             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4948         }
4949         if ( aExp == 0 ) ++expDiff;
4950         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4951         zExp = bExp;
4952     }
4953     else {
4954         if ( aExp == 0x7FFF ) {
4955             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4956                 return propagateFloatx80NaN(a, b, status);
4957             }
4958             return a;
4959         }
4960         zSig1 = 0;
4961         zSig0 = aSig + bSig;
4962         if ( aExp == 0 ) {
4963             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4964             goto roundAndPack;
4965         }
4966         zExp = aExp;
4967         goto shiftRight1;
4968     }
4969     zSig0 = aSig + bSig;
4970     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4971  shiftRight1:
4972     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4973     zSig0 |= LIT64( 0x8000000000000000 );
4974     ++zExp;
4975  roundAndPack:
4976     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4977                                 zSign, zExp, zSig0, zSig1, status);
4978 }
4979 
4980 /*----------------------------------------------------------------------------
4981 | Returns the result of subtracting the absolute values of the extended
4982 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4983 | difference is negated before being returned.  `zSign' is ignored if the
4984 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4985 | Standard for Binary Floating-Point Arithmetic.
4986 *----------------------------------------------------------------------------*/
4987 
4988 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4989                                 float_status *status)
4990 {
4991     int32_t aExp, bExp, zExp;
4992     uint64_t aSig, bSig, zSig0, zSig1;
4993     int32_t expDiff;
4994 
4995     aSig = extractFloatx80Frac( a );
4996     aExp = extractFloatx80Exp( a );
4997     bSig = extractFloatx80Frac( b );
4998     bExp = extractFloatx80Exp( b );
4999     expDiff = aExp - bExp;
5000     if ( 0 < expDiff ) goto aExpBigger;
5001     if ( expDiff < 0 ) goto bExpBigger;
5002     if ( aExp == 0x7FFF ) {
5003         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5004             return propagateFloatx80NaN(a, b, status);
5005         }
5006         float_raise(float_flag_invalid, status);
5007         return floatx80_default_nan(status);
5008     }
5009     if ( aExp == 0 ) {
5010         aExp = 1;
5011         bExp = 1;
5012     }
5013     zSig1 = 0;
5014     if ( bSig < aSig ) goto aBigger;
5015     if ( aSig < bSig ) goto bBigger;
5016     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5017  bExpBigger:
5018     if ( bExp == 0x7FFF ) {
5019         if ((uint64_t)(bSig << 1)) {
5020             return propagateFloatx80NaN(a, b, status);
5021         }
5022         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5023     }
5024     if ( aExp == 0 ) ++expDiff;
5025     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5026  bBigger:
5027     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5028     zExp = bExp;
5029     zSign ^= 1;
5030     goto normalizeRoundAndPack;
5031  aExpBigger:
5032     if ( aExp == 0x7FFF ) {
5033         if ((uint64_t)(aSig << 1)) {
5034             return propagateFloatx80NaN(a, b, status);
5035         }
5036         return a;
5037     }
5038     if ( bExp == 0 ) --expDiff;
5039     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5040  aBigger:
5041     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5042     zExp = aExp;
5043  normalizeRoundAndPack:
5044     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5045                                          zSign, zExp, zSig0, zSig1, status);
5046 }
5047 
5048 /*----------------------------------------------------------------------------
5049 | Returns the result of adding the extended double-precision floating-point
5050 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5051 | Standard for Binary Floating-Point Arithmetic.
5052 *----------------------------------------------------------------------------*/
5053 
5054 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5055 {
5056     flag aSign, bSign;
5057 
5058     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5059         float_raise(float_flag_invalid, status);
5060         return floatx80_default_nan(status);
5061     }
5062     aSign = extractFloatx80Sign( a );
5063     bSign = extractFloatx80Sign( b );
5064     if ( aSign == bSign ) {
5065         return addFloatx80Sigs(a, b, aSign, status);
5066     }
5067     else {
5068         return subFloatx80Sigs(a, b, aSign, status);
5069     }
5070 
5071 }
5072 
5073 /*----------------------------------------------------------------------------
5074 | Returns the result of subtracting the extended double-precision floating-
5075 | point values `a' and `b'.  The operation is performed according to the
5076 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5077 *----------------------------------------------------------------------------*/
5078 
5079 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5080 {
5081     flag aSign, bSign;
5082 
5083     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5084         float_raise(float_flag_invalid, status);
5085         return floatx80_default_nan(status);
5086     }
5087     aSign = extractFloatx80Sign( a );
5088     bSign = extractFloatx80Sign( b );
5089     if ( aSign == bSign ) {
5090         return subFloatx80Sigs(a, b, aSign, status);
5091     }
5092     else {
5093         return addFloatx80Sigs(a, b, aSign, status);
5094     }
5095 
5096 }
5097 
5098 /*----------------------------------------------------------------------------
5099 | Returns the result of multiplying the extended double-precision floating-
5100 | point values `a' and `b'.  The operation is performed according to the
5101 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5102 *----------------------------------------------------------------------------*/
5103 
5104 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5105 {
5106     flag aSign, bSign, zSign;
5107     int32_t aExp, bExp, zExp;
5108     uint64_t aSig, bSig, zSig0, zSig1;
5109 
5110     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5111         float_raise(float_flag_invalid, status);
5112         return floatx80_default_nan(status);
5113     }
5114     aSig = extractFloatx80Frac( a );
5115     aExp = extractFloatx80Exp( a );
5116     aSign = extractFloatx80Sign( a );
5117     bSig = extractFloatx80Frac( b );
5118     bExp = extractFloatx80Exp( b );
5119     bSign = extractFloatx80Sign( b );
5120     zSign = aSign ^ bSign;
5121     if ( aExp == 0x7FFF ) {
5122         if (    (uint64_t) ( aSig<<1 )
5123              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5124             return propagateFloatx80NaN(a, b, status);
5125         }
5126         if ( ( bExp | bSig ) == 0 ) goto invalid;
5127         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5128     }
5129     if ( bExp == 0x7FFF ) {
5130         if ((uint64_t)(bSig << 1)) {
5131             return propagateFloatx80NaN(a, b, status);
5132         }
5133         if ( ( aExp | aSig ) == 0 ) {
5134  invalid:
5135             float_raise(float_flag_invalid, status);
5136             return floatx80_default_nan(status);
5137         }
5138         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5139     }
5140     if ( aExp == 0 ) {
5141         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5142         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5143     }
5144     if ( bExp == 0 ) {
5145         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5146         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5147     }
5148     zExp = aExp + bExp - 0x3FFE;
5149     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5150     if ( 0 < (int64_t) zSig0 ) {
5151         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5152         --zExp;
5153     }
5154     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5155                                 zSign, zExp, zSig0, zSig1, status);
5156 }
5157 
5158 /*----------------------------------------------------------------------------
5159 | Returns the result of dividing the extended double-precision floating-point
5160 | value `a' by the corresponding value `b'.  The operation is performed
5161 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5162 *----------------------------------------------------------------------------*/
5163 
5164 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5165 {
5166     flag aSign, bSign, zSign;
5167     int32_t aExp, bExp, zExp;
5168     uint64_t aSig, bSig, zSig0, zSig1;
5169     uint64_t rem0, rem1, rem2, term0, term1, term2;
5170 
5171     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5172         float_raise(float_flag_invalid, status);
5173         return floatx80_default_nan(status);
5174     }
5175     aSig = extractFloatx80Frac( a );
5176     aExp = extractFloatx80Exp( a );
5177     aSign = extractFloatx80Sign( a );
5178     bSig = extractFloatx80Frac( b );
5179     bExp = extractFloatx80Exp( b );
5180     bSign = extractFloatx80Sign( b );
5181     zSign = aSign ^ bSign;
5182     if ( aExp == 0x7FFF ) {
5183         if ((uint64_t)(aSig << 1)) {
5184             return propagateFloatx80NaN(a, b, status);
5185         }
5186         if ( bExp == 0x7FFF ) {
5187             if ((uint64_t)(bSig << 1)) {
5188                 return propagateFloatx80NaN(a, b, status);
5189             }
5190             goto invalid;
5191         }
5192         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5193     }
5194     if ( bExp == 0x7FFF ) {
5195         if ((uint64_t)(bSig << 1)) {
5196             return propagateFloatx80NaN(a, b, status);
5197         }
5198         return packFloatx80( zSign, 0, 0 );
5199     }
5200     if ( bExp == 0 ) {
5201         if ( bSig == 0 ) {
5202             if ( ( aExp | aSig ) == 0 ) {
5203  invalid:
5204                 float_raise(float_flag_invalid, status);
5205                 return floatx80_default_nan(status);
5206             }
5207             float_raise(float_flag_divbyzero, status);
5208             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5209         }
5210         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5211     }
5212     if ( aExp == 0 ) {
5213         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5214         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5215     }
5216     zExp = aExp - bExp + 0x3FFE;
5217     rem1 = 0;
5218     if ( bSig <= aSig ) {
5219         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5220         ++zExp;
5221     }
5222     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5223     mul64To128( bSig, zSig0, &term0, &term1 );
5224     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5225     while ( (int64_t) rem0 < 0 ) {
5226         --zSig0;
5227         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5228     }
5229     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5230     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5231         mul64To128( bSig, zSig1, &term1, &term2 );
5232         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5233         while ( (int64_t) rem1 < 0 ) {
5234             --zSig1;
5235             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5236         }
5237         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5238     }
5239     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5240                                 zSign, zExp, zSig0, zSig1, status);
5241 }
5242 
5243 /*----------------------------------------------------------------------------
5244 | Returns the remainder of the extended double-precision floating-point value
5245 | `a' with respect to the corresponding value `b'.  The operation is performed
5246 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5247 *----------------------------------------------------------------------------*/
5248 
5249 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5250 {
5251     flag aSign, zSign;
5252     int32_t aExp, bExp, expDiff;
5253     uint64_t aSig0, aSig1, bSig;
5254     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5255 
5256     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5257         float_raise(float_flag_invalid, status);
5258         return floatx80_default_nan(status);
5259     }
5260     aSig0 = extractFloatx80Frac( a );
5261     aExp = extractFloatx80Exp( a );
5262     aSign = extractFloatx80Sign( a );
5263     bSig = extractFloatx80Frac( b );
5264     bExp = extractFloatx80Exp( b );
5265     if ( aExp == 0x7FFF ) {
5266         if (    (uint64_t) ( aSig0<<1 )
5267              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5268             return propagateFloatx80NaN(a, b, status);
5269         }
5270         goto invalid;
5271     }
5272     if ( bExp == 0x7FFF ) {
5273         if ((uint64_t)(bSig << 1)) {
5274             return propagateFloatx80NaN(a, b, status);
5275         }
5276         return a;
5277     }
5278     if ( bExp == 0 ) {
5279         if ( bSig == 0 ) {
5280  invalid:
5281             float_raise(float_flag_invalid, status);
5282             return floatx80_default_nan(status);
5283         }
5284         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5285     }
5286     if ( aExp == 0 ) {
5287         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5288         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5289     }
5290     bSig |= LIT64( 0x8000000000000000 );
5291     zSign = aSign;
5292     expDiff = aExp - bExp;
5293     aSig1 = 0;
5294     if ( expDiff < 0 ) {
5295         if ( expDiff < -1 ) return a;
5296         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5297         expDiff = 0;
5298     }
5299     q = ( bSig <= aSig0 );
5300     if ( q ) aSig0 -= bSig;
5301     expDiff -= 64;
5302     while ( 0 < expDiff ) {
5303         q = estimateDiv128To64( aSig0, aSig1, bSig );
5304         q = ( 2 < q ) ? q - 2 : 0;
5305         mul64To128( bSig, q, &term0, &term1 );
5306         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5307         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5308         expDiff -= 62;
5309     }
5310     expDiff += 64;
5311     if ( 0 < expDiff ) {
5312         q = estimateDiv128To64( aSig0, aSig1, bSig );
5313         q = ( 2 < q ) ? q - 2 : 0;
5314         q >>= 64 - expDiff;
5315         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5316         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5317         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5318         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5319             ++q;
5320             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5321         }
5322     }
5323     else {
5324         term1 = 0;
5325         term0 = bSig;
5326     }
5327     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5328     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5329          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5330               && ( q & 1 ) )
5331        ) {
5332         aSig0 = alternateASig0;
5333         aSig1 = alternateASig1;
5334         zSign = ! zSign;
5335     }
5336     return
5337         normalizeRoundAndPackFloatx80(
5338             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5339 
5340 }
5341 
5342 /*----------------------------------------------------------------------------
5343 | Returns the square root of the extended double-precision floating-point
5344 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5345 | for Binary Floating-Point Arithmetic.
5346 *----------------------------------------------------------------------------*/
5347 
5348 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5349 {
5350     flag aSign;
5351     int32_t aExp, zExp;
5352     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5353     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5354 
5355     if (floatx80_invalid_encoding(a)) {
5356         float_raise(float_flag_invalid, status);
5357         return floatx80_default_nan(status);
5358     }
5359     aSig0 = extractFloatx80Frac( a );
5360     aExp = extractFloatx80Exp( a );
5361     aSign = extractFloatx80Sign( a );
5362     if ( aExp == 0x7FFF ) {
5363         if ((uint64_t)(aSig0 << 1)) {
5364             return propagateFloatx80NaN(a, a, status);
5365         }
5366         if ( ! aSign ) return a;
5367         goto invalid;
5368     }
5369     if ( aSign ) {
5370         if ( ( aExp | aSig0 ) == 0 ) return a;
5371  invalid:
5372         float_raise(float_flag_invalid, status);
5373         return floatx80_default_nan(status);
5374     }
5375     if ( aExp == 0 ) {
5376         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5377         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5378     }
5379     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5380     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5381     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5382     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5383     doubleZSig0 = zSig0<<1;
5384     mul64To128( zSig0, zSig0, &term0, &term1 );
5385     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5386     while ( (int64_t) rem0 < 0 ) {
5387         --zSig0;
5388         doubleZSig0 -= 2;
5389         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5390     }
5391     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5392     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5393         if ( zSig1 == 0 ) zSig1 = 1;
5394         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5395         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5396         mul64To128( zSig1, zSig1, &term2, &term3 );
5397         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5398         while ( (int64_t) rem1 < 0 ) {
5399             --zSig1;
5400             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5401             term3 |= 1;
5402             term2 |= doubleZSig0;
5403             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5404         }
5405         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5406     }
5407     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5408     zSig0 |= doubleZSig0;
5409     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5410                                 0, zExp, zSig0, zSig1, status);
5411 }
5412 
5413 /*----------------------------------------------------------------------------
5414 | Returns 1 if the extended double-precision floating-point value `a' is equal
5415 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5416 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5417 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5418 *----------------------------------------------------------------------------*/
5419 
5420 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5421 {
5422 
5423     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5424         || (extractFloatx80Exp(a) == 0x7FFF
5425             && (uint64_t) (extractFloatx80Frac(a) << 1))
5426         || (extractFloatx80Exp(b) == 0x7FFF
5427             && (uint64_t) (extractFloatx80Frac(b) << 1))
5428        ) {
5429         float_raise(float_flag_invalid, status);
5430         return 0;
5431     }
5432     return
5433            ( a.low == b.low )
5434         && (    ( a.high == b.high )
5435              || (    ( a.low == 0 )
5436                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5437            );
5438 
5439 }
5440 
5441 /*----------------------------------------------------------------------------
5442 | Returns 1 if the extended double-precision floating-point value `a' is
5443 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5444 | invalid exception is raised if either operand is a NaN.  The comparison is
5445 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5446 | Arithmetic.
5447 *----------------------------------------------------------------------------*/
5448 
5449 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5450 {
5451     flag aSign, bSign;
5452 
5453     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5454         || (extractFloatx80Exp(a) == 0x7FFF
5455             && (uint64_t) (extractFloatx80Frac(a) << 1))
5456         || (extractFloatx80Exp(b) == 0x7FFF
5457             && (uint64_t) (extractFloatx80Frac(b) << 1))
5458        ) {
5459         float_raise(float_flag_invalid, status);
5460         return 0;
5461     }
5462     aSign = extractFloatx80Sign( a );
5463     bSign = extractFloatx80Sign( b );
5464     if ( aSign != bSign ) {
5465         return
5466                aSign
5467             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5468                  == 0 );
5469     }
5470     return
5471           aSign ? le128( b.high, b.low, a.high, a.low )
5472         : le128( a.high, a.low, b.high, b.low );
5473 
5474 }
5475 
5476 /*----------------------------------------------------------------------------
5477 | Returns 1 if the extended double-precision floating-point value `a' is
5478 | less than the corresponding value `b', and 0 otherwise.  The invalid
5479 | exception is raised if either operand is a NaN.  The comparison is performed
5480 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5481 *----------------------------------------------------------------------------*/
5482 
5483 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5484 {
5485     flag aSign, bSign;
5486 
5487     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5488         || (extractFloatx80Exp(a) == 0x7FFF
5489             && (uint64_t) (extractFloatx80Frac(a) << 1))
5490         || (extractFloatx80Exp(b) == 0x7FFF
5491             && (uint64_t) (extractFloatx80Frac(b) << 1))
5492        ) {
5493         float_raise(float_flag_invalid, status);
5494         return 0;
5495     }
5496     aSign = extractFloatx80Sign( a );
5497     bSign = extractFloatx80Sign( b );
5498     if ( aSign != bSign ) {
5499         return
5500                aSign
5501             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5502                  != 0 );
5503     }
5504     return
5505           aSign ? lt128( b.high, b.low, a.high, a.low )
5506         : lt128( a.high, a.low, b.high, b.low );
5507 
5508 }
5509 
5510 /*----------------------------------------------------------------------------
5511 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5512 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5513 | either operand is a NaN.   The comparison is performed according to the
5514 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5515 *----------------------------------------------------------------------------*/
5516 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5517 {
5518     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5519         || (extractFloatx80Exp(a) == 0x7FFF
5520             && (uint64_t) (extractFloatx80Frac(a) << 1))
5521         || (extractFloatx80Exp(b) == 0x7FFF
5522             && (uint64_t) (extractFloatx80Frac(b) << 1))
5523        ) {
5524         float_raise(float_flag_invalid, status);
5525         return 1;
5526     }
5527     return 0;
5528 }
5529 
5530 /*----------------------------------------------------------------------------
5531 | Returns 1 if the extended double-precision floating-point value `a' is
5532 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5533 | cause an exception.  The comparison is performed according to the IEC/IEEE
5534 | Standard for Binary Floating-Point Arithmetic.
5535 *----------------------------------------------------------------------------*/
5536 
5537 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5538 {
5539 
5540     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5541         float_raise(float_flag_invalid, status);
5542         return 0;
5543     }
5544     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5545               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5546          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5547               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5548        ) {
5549         if (floatx80_is_signaling_nan(a, status)
5550          || floatx80_is_signaling_nan(b, status)) {
5551             float_raise(float_flag_invalid, status);
5552         }
5553         return 0;
5554     }
5555     return
5556            ( a.low == b.low )
5557         && (    ( a.high == b.high )
5558              || (    ( a.low == 0 )
5559                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5560            );
5561 
5562 }
5563 
5564 /*----------------------------------------------------------------------------
5565 | Returns 1 if the extended double-precision floating-point value `a' is less
5566 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5567 | do not cause an exception.  Otherwise, the comparison is performed according
5568 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5569 *----------------------------------------------------------------------------*/
5570 
5571 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5572 {
5573     flag aSign, bSign;
5574 
5575     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5576         float_raise(float_flag_invalid, status);
5577         return 0;
5578     }
5579     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5580               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5581          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5582               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5583        ) {
5584         if (floatx80_is_signaling_nan(a, status)
5585          || floatx80_is_signaling_nan(b, status)) {
5586             float_raise(float_flag_invalid, status);
5587         }
5588         return 0;
5589     }
5590     aSign = extractFloatx80Sign( a );
5591     bSign = extractFloatx80Sign( b );
5592     if ( aSign != bSign ) {
5593         return
5594                aSign
5595             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5596                  == 0 );
5597     }
5598     return
5599           aSign ? le128( b.high, b.low, a.high, a.low )
5600         : le128( a.high, a.low, b.high, b.low );
5601 
5602 }
5603 
5604 /*----------------------------------------------------------------------------
5605 | Returns 1 if the extended double-precision floating-point value `a' is less
5606 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5607 | an exception.  Otherwise, the comparison is performed according to the
5608 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5609 *----------------------------------------------------------------------------*/
5610 
5611 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5612 {
5613     flag aSign, bSign;
5614 
5615     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5616         float_raise(float_flag_invalid, status);
5617         return 0;
5618     }
5619     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5620               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5621          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5622               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5623        ) {
5624         if (floatx80_is_signaling_nan(a, status)
5625          || floatx80_is_signaling_nan(b, status)) {
5626             float_raise(float_flag_invalid, status);
5627         }
5628         return 0;
5629     }
5630     aSign = extractFloatx80Sign( a );
5631     bSign = extractFloatx80Sign( b );
5632     if ( aSign != bSign ) {
5633         return
5634                aSign
5635             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5636                  != 0 );
5637     }
5638     return
5639           aSign ? lt128( b.high, b.low, a.high, a.low )
5640         : lt128( a.high, a.low, b.high, b.low );
5641 
5642 }
5643 
5644 /*----------------------------------------------------------------------------
5645 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5646 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5647 | The comparison is performed according to the IEC/IEEE Standard for Binary
5648 | Floating-Point Arithmetic.
5649 *----------------------------------------------------------------------------*/
5650 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5651 {
5652     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5653         float_raise(float_flag_invalid, status);
5654         return 1;
5655     }
5656     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5657               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5658          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5659               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5660        ) {
5661         if (floatx80_is_signaling_nan(a, status)
5662          || floatx80_is_signaling_nan(b, status)) {
5663             float_raise(float_flag_invalid, status);
5664         }
5665         return 1;
5666     }
5667     return 0;
5668 }
5669 
5670 /*----------------------------------------------------------------------------
5671 | Returns the result of converting the quadruple-precision floating-point
5672 | value `a' to the 32-bit two's complement integer format.  The conversion
5673 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5674 | Arithmetic---which means in particular that the conversion is rounded
5675 | according to the current rounding mode.  If `a' is a NaN, the largest
5676 | positive integer is returned.  Otherwise, if the conversion overflows, the
5677 | largest integer with the same sign as `a' is returned.
5678 *----------------------------------------------------------------------------*/
5679 
5680 int32_t float128_to_int32(float128 a, float_status *status)
5681 {
5682     flag aSign;
5683     int32_t aExp, shiftCount;
5684     uint64_t aSig0, aSig1;
5685 
5686     aSig1 = extractFloat128Frac1( a );
5687     aSig0 = extractFloat128Frac0( a );
5688     aExp = extractFloat128Exp( a );
5689     aSign = extractFloat128Sign( a );
5690     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5691     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5692     aSig0 |= ( aSig1 != 0 );
5693     shiftCount = 0x4028 - aExp;
5694     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5695     return roundAndPackInt32(aSign, aSig0, status);
5696 
5697 }
5698 
5699 /*----------------------------------------------------------------------------
5700 | Returns the result of converting the quadruple-precision floating-point
5701 | value `a' to the 32-bit two's complement integer format.  The conversion
5702 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5703 | Arithmetic, except that the conversion is always rounded toward zero.  If
5704 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5705 | conversion overflows, the largest integer with the same sign as `a' is
5706 | returned.
5707 *----------------------------------------------------------------------------*/
5708 
5709 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5710 {
5711     flag aSign;
5712     int32_t aExp, shiftCount;
5713     uint64_t aSig0, aSig1, savedASig;
5714     int32_t z;
5715 
5716     aSig1 = extractFloat128Frac1( a );
5717     aSig0 = extractFloat128Frac0( a );
5718     aExp = extractFloat128Exp( a );
5719     aSign = extractFloat128Sign( a );
5720     aSig0 |= ( aSig1 != 0 );
5721     if ( 0x401E < aExp ) {
5722         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5723         goto invalid;
5724     }
5725     else if ( aExp < 0x3FFF ) {
5726         if (aExp || aSig0) {
5727             status->float_exception_flags |= float_flag_inexact;
5728         }
5729         return 0;
5730     }
5731     aSig0 |= LIT64( 0x0001000000000000 );
5732     shiftCount = 0x402F - aExp;
5733     savedASig = aSig0;
5734     aSig0 >>= shiftCount;
5735     z = aSig0;
5736     if ( aSign ) z = - z;
5737     if ( ( z < 0 ) ^ aSign ) {
5738  invalid:
5739         float_raise(float_flag_invalid, status);
5740         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5741     }
5742     if ( ( aSig0<<shiftCount ) != savedASig ) {
5743         status->float_exception_flags |= float_flag_inexact;
5744     }
5745     return z;
5746 
5747 }
5748 
5749 /*----------------------------------------------------------------------------
5750 | Returns the result of converting the quadruple-precision floating-point
5751 | value `a' to the 64-bit two's complement integer format.  The conversion
5752 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5753 | Arithmetic---which means in particular that the conversion is rounded
5754 | according to the current rounding mode.  If `a' is a NaN, the largest
5755 | positive integer is returned.  Otherwise, if the conversion overflows, the
5756 | largest integer with the same sign as `a' is returned.
5757 *----------------------------------------------------------------------------*/
5758 
5759 int64_t float128_to_int64(float128 a, float_status *status)
5760 {
5761     flag aSign;
5762     int32_t aExp, shiftCount;
5763     uint64_t aSig0, aSig1;
5764 
5765     aSig1 = extractFloat128Frac1( a );
5766     aSig0 = extractFloat128Frac0( a );
5767     aExp = extractFloat128Exp( a );
5768     aSign = extractFloat128Sign( a );
5769     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5770     shiftCount = 0x402F - aExp;
5771     if ( shiftCount <= 0 ) {
5772         if ( 0x403E < aExp ) {
5773             float_raise(float_flag_invalid, status);
5774             if (    ! aSign
5775                  || (    ( aExp == 0x7FFF )
5776                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5777                     )
5778                ) {
5779                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5780             }
5781             return (int64_t) LIT64( 0x8000000000000000 );
5782         }
5783         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5784     }
5785     else {
5786         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5787     }
5788     return roundAndPackInt64(aSign, aSig0, aSig1, status);
5789 
5790 }
5791 
5792 /*----------------------------------------------------------------------------
5793 | Returns the result of converting the quadruple-precision floating-point
5794 | value `a' to the 64-bit two's complement integer format.  The conversion
5795 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5796 | Arithmetic, except that the conversion is always rounded toward zero.
5797 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5798 | the conversion overflows, the largest integer with the same sign as `a' is
5799 | returned.
5800 *----------------------------------------------------------------------------*/
5801 
5802 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5803 {
5804     flag aSign;
5805     int32_t aExp, shiftCount;
5806     uint64_t aSig0, aSig1;
5807     int64_t z;
5808 
5809     aSig1 = extractFloat128Frac1( a );
5810     aSig0 = extractFloat128Frac0( a );
5811     aExp = extractFloat128Exp( a );
5812     aSign = extractFloat128Sign( a );
5813     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5814     shiftCount = aExp - 0x402F;
5815     if ( 0 < shiftCount ) {
5816         if ( 0x403E <= aExp ) {
5817             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5818             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5819                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5820                 if (aSig1) {
5821                     status->float_exception_flags |= float_flag_inexact;
5822                 }
5823             }
5824             else {
5825                 float_raise(float_flag_invalid, status);
5826                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5827                     return LIT64( 0x7FFFFFFFFFFFFFFF );
5828                 }
5829             }
5830             return (int64_t) LIT64( 0x8000000000000000 );
5831         }
5832         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5833         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5834             status->float_exception_flags |= float_flag_inexact;
5835         }
5836     }
5837     else {
5838         if ( aExp < 0x3FFF ) {
5839             if ( aExp | aSig0 | aSig1 ) {
5840                 status->float_exception_flags |= float_flag_inexact;
5841             }
5842             return 0;
5843         }
5844         z = aSig0>>( - shiftCount );
5845         if (    aSig1
5846              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5847             status->float_exception_flags |= float_flag_inexact;
5848         }
5849     }
5850     if ( aSign ) z = - z;
5851     return z;
5852 
5853 }
5854 
5855 /*----------------------------------------------------------------------------
5856 | Returns the result of converting the quadruple-precision floating-point value
5857 | `a' to the 64-bit unsigned integer format.  The conversion is
5858 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5859 | Arithmetic---which means in particular that the conversion is rounded
5860 | according to the current rounding mode.  If `a' is a NaN, the largest
5861 | positive integer is returned.  If the conversion overflows, the
5862 | largest unsigned integer is returned.  If 'a' is negative, the value is
5863 | rounded and zero is returned; negative values that do not round to zero
5864 | will raise the inexact exception.
5865 *----------------------------------------------------------------------------*/
5866 
5867 uint64_t float128_to_uint64(float128 a, float_status *status)
5868 {
5869     flag aSign;
5870     int aExp;
5871     int shiftCount;
5872     uint64_t aSig0, aSig1;
5873 
5874     aSig0 = extractFloat128Frac0(a);
5875     aSig1 = extractFloat128Frac1(a);
5876     aExp = extractFloat128Exp(a);
5877     aSign = extractFloat128Sign(a);
5878     if (aSign && (aExp > 0x3FFE)) {
5879         float_raise(float_flag_invalid, status);
5880         if (float128_is_any_nan(a)) {
5881             return LIT64(0xFFFFFFFFFFFFFFFF);
5882         } else {
5883             return 0;
5884         }
5885     }
5886     if (aExp) {
5887         aSig0 |= LIT64(0x0001000000000000);
5888     }
5889     shiftCount = 0x402F - aExp;
5890     if (shiftCount <= 0) {
5891         if (0x403E < aExp) {
5892             float_raise(float_flag_invalid, status);
5893             return LIT64(0xFFFFFFFFFFFFFFFF);
5894         }
5895         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5896     } else {
5897         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5898     }
5899     return roundAndPackUint64(aSign, aSig0, aSig1, status);
5900 }
5901 
5902 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5903 {
5904     uint64_t v;
5905     signed char current_rounding_mode = status->float_rounding_mode;
5906 
5907     set_float_rounding_mode(float_round_to_zero, status);
5908     v = float128_to_uint64(a, status);
5909     set_float_rounding_mode(current_rounding_mode, status);
5910 
5911     return v;
5912 }
5913 
5914 /*----------------------------------------------------------------------------
5915 | Returns the result of converting the quadruple-precision floating-point
5916 | value `a' to the 32-bit unsigned integer format.  The conversion
5917 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5918 | Arithmetic except that the conversion is always rounded toward zero.
5919 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
5920 | if the conversion overflows, the largest unsigned integer is returned.
5921 | If 'a' is negative, the value is rounded and zero is returned; negative
5922 | values that do not round to zero will raise the inexact exception.
5923 *----------------------------------------------------------------------------*/
5924 
5925 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5926 {
5927     uint64_t v;
5928     uint32_t res;
5929     int old_exc_flags = get_float_exception_flags(status);
5930 
5931     v = float128_to_uint64_round_to_zero(a, status);
5932     if (v > 0xffffffff) {
5933         res = 0xffffffff;
5934     } else {
5935         return v;
5936     }
5937     set_float_exception_flags(old_exc_flags, status);
5938     float_raise(float_flag_invalid, status);
5939     return res;
5940 }
5941 
5942 /*----------------------------------------------------------------------------
5943 | Returns the result of converting the quadruple-precision floating-point
5944 | value `a' to the single-precision floating-point format.  The conversion
5945 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5946 | Arithmetic.
5947 *----------------------------------------------------------------------------*/
5948 
5949 float32 float128_to_float32(float128 a, float_status *status)
5950 {
5951     flag aSign;
5952     int32_t aExp;
5953     uint64_t aSig0, aSig1;
5954     uint32_t zSig;
5955 
5956     aSig1 = extractFloat128Frac1( a );
5957     aSig0 = extractFloat128Frac0( a );
5958     aExp = extractFloat128Exp( a );
5959     aSign = extractFloat128Sign( a );
5960     if ( aExp == 0x7FFF ) {
5961         if ( aSig0 | aSig1 ) {
5962             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5963         }
5964         return packFloat32( aSign, 0xFF, 0 );
5965     }
5966     aSig0 |= ( aSig1 != 0 );
5967     shift64RightJamming( aSig0, 18, &aSig0 );
5968     zSig = aSig0;
5969     if ( aExp || zSig ) {
5970         zSig |= 0x40000000;
5971         aExp -= 0x3F81;
5972     }
5973     return roundAndPackFloat32(aSign, aExp, zSig, status);
5974 
5975 }
5976 
5977 /*----------------------------------------------------------------------------
5978 | Returns the result of converting the quadruple-precision floating-point
5979 | value `a' to the double-precision floating-point format.  The conversion
5980 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5981 | Arithmetic.
5982 *----------------------------------------------------------------------------*/
5983 
5984 float64 float128_to_float64(float128 a, float_status *status)
5985 {
5986     flag aSign;
5987     int32_t aExp;
5988     uint64_t aSig0, aSig1;
5989 
5990     aSig1 = extractFloat128Frac1( a );
5991     aSig0 = extractFloat128Frac0( a );
5992     aExp = extractFloat128Exp( a );
5993     aSign = extractFloat128Sign( a );
5994     if ( aExp == 0x7FFF ) {
5995         if ( aSig0 | aSig1 ) {
5996             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5997         }
5998         return packFloat64( aSign, 0x7FF, 0 );
5999     }
6000     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6001     aSig0 |= ( aSig1 != 0 );
6002     if ( aExp || aSig0 ) {
6003         aSig0 |= LIT64( 0x4000000000000000 );
6004         aExp -= 0x3C01;
6005     }
6006     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6007 
6008 }
6009 
6010 /*----------------------------------------------------------------------------
6011 | Returns the result of converting the quadruple-precision floating-point
6012 | value `a' to the extended double-precision floating-point format.  The
6013 | conversion is performed according to the IEC/IEEE Standard for Binary
6014 | Floating-Point Arithmetic.
6015 *----------------------------------------------------------------------------*/
6016 
6017 floatx80 float128_to_floatx80(float128 a, float_status *status)
6018 {
6019     flag aSign;
6020     int32_t aExp;
6021     uint64_t aSig0, aSig1;
6022 
6023     aSig1 = extractFloat128Frac1( a );
6024     aSig0 = extractFloat128Frac0( a );
6025     aExp = extractFloat128Exp( a );
6026     aSign = extractFloat128Sign( a );
6027     if ( aExp == 0x7FFF ) {
6028         if ( aSig0 | aSig1 ) {
6029             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6030         }
6031         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6032     }
6033     if ( aExp == 0 ) {
6034         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6035         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6036     }
6037     else {
6038         aSig0 |= LIT64( 0x0001000000000000 );
6039     }
6040     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6041     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6042 
6043 }
6044 
6045 /*----------------------------------------------------------------------------
6046 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6047 | returns the result as a quadruple-precision floating-point value.  The
6048 | operation is performed according to the IEC/IEEE Standard for Binary
6049 | Floating-Point Arithmetic.
6050 *----------------------------------------------------------------------------*/
6051 
6052 float128 float128_round_to_int(float128 a, float_status *status)
6053 {
6054     flag aSign;
6055     int32_t aExp;
6056     uint64_t lastBitMask, roundBitsMask;
6057     float128 z;
6058 
6059     aExp = extractFloat128Exp( a );
6060     if ( 0x402F <= aExp ) {
6061         if ( 0x406F <= aExp ) {
6062             if (    ( aExp == 0x7FFF )
6063                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6064                ) {
6065                 return propagateFloat128NaN(a, a, status);
6066             }
6067             return a;
6068         }
6069         lastBitMask = 1;
6070         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6071         roundBitsMask = lastBitMask - 1;
6072         z = a;
6073         switch (status->float_rounding_mode) {
6074         case float_round_nearest_even:
6075             if ( lastBitMask ) {
6076                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6077                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6078             }
6079             else {
6080                 if ( (int64_t) z.low < 0 ) {
6081                     ++z.high;
6082                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6083                 }
6084             }
6085             break;
6086         case float_round_ties_away:
6087             if (lastBitMask) {
6088                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6089             } else {
6090                 if ((int64_t) z.low < 0) {
6091                     ++z.high;
6092                 }
6093             }
6094             break;
6095         case float_round_to_zero:
6096             break;
6097         case float_round_up:
6098             if (!extractFloat128Sign(z)) {
6099                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6100             }
6101             break;
6102         case float_round_down:
6103             if (extractFloat128Sign(z)) {
6104                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6105             }
6106             break;
6107         default:
6108             abort();
6109         }
6110         z.low &= ~ roundBitsMask;
6111     }
6112     else {
6113         if ( aExp < 0x3FFF ) {
6114             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6115             status->float_exception_flags |= float_flag_inexact;
6116             aSign = extractFloat128Sign( a );
6117             switch (status->float_rounding_mode) {
6118              case float_round_nearest_even:
6119                 if (    ( aExp == 0x3FFE )
6120                      && (   extractFloat128Frac0( a )
6121                           | extractFloat128Frac1( a ) )
6122                    ) {
6123                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6124                 }
6125                 break;
6126             case float_round_ties_away:
6127                 if (aExp == 0x3FFE) {
6128                     return packFloat128(aSign, 0x3FFF, 0, 0);
6129                 }
6130                 break;
6131              case float_round_down:
6132                 return
6133                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6134                     : packFloat128( 0, 0, 0, 0 );
6135              case float_round_up:
6136                 return
6137                       aSign ? packFloat128( 1, 0, 0, 0 )
6138                     : packFloat128( 0, 0x3FFF, 0, 0 );
6139             }
6140             return packFloat128( aSign, 0, 0, 0 );
6141         }
6142         lastBitMask = 1;
6143         lastBitMask <<= 0x402F - aExp;
6144         roundBitsMask = lastBitMask - 1;
6145         z.low = 0;
6146         z.high = a.high;
6147         switch (status->float_rounding_mode) {
6148         case float_round_nearest_even:
6149             z.high += lastBitMask>>1;
6150             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6151                 z.high &= ~ lastBitMask;
6152             }
6153             break;
6154         case float_round_ties_away:
6155             z.high += lastBitMask>>1;
6156             break;
6157         case float_round_to_zero:
6158             break;
6159         case float_round_up:
6160             if (!extractFloat128Sign(z)) {
6161                 z.high |= ( a.low != 0 );
6162                 z.high += roundBitsMask;
6163             }
6164             break;
6165         case float_round_down:
6166             if (extractFloat128Sign(z)) {
6167                 z.high |= (a.low != 0);
6168                 z.high += roundBitsMask;
6169             }
6170             break;
6171         default:
6172             abort();
6173         }
6174         z.high &= ~ roundBitsMask;
6175     }
6176     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6177         status->float_exception_flags |= float_flag_inexact;
6178     }
6179     return z;
6180 
6181 }
6182 
6183 /*----------------------------------------------------------------------------
6184 | Returns the result of adding the absolute values of the quadruple-precision
6185 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6186 | before being returned.  `zSign' is ignored if the result is a NaN.
6187 | The addition is performed according to the IEC/IEEE Standard for Binary
6188 | Floating-Point Arithmetic.
6189 *----------------------------------------------------------------------------*/
6190 
6191 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6192                                 float_status *status)
6193 {
6194     int32_t aExp, bExp, zExp;
6195     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6196     int32_t expDiff;
6197 
6198     aSig1 = extractFloat128Frac1( a );
6199     aSig0 = extractFloat128Frac0( a );
6200     aExp = extractFloat128Exp( a );
6201     bSig1 = extractFloat128Frac1( b );
6202     bSig0 = extractFloat128Frac0( b );
6203     bExp = extractFloat128Exp( b );
6204     expDiff = aExp - bExp;
6205     if ( 0 < expDiff ) {
6206         if ( aExp == 0x7FFF ) {
6207             if (aSig0 | aSig1) {
6208                 return propagateFloat128NaN(a, b, status);
6209             }
6210             return a;
6211         }
6212         if ( bExp == 0 ) {
6213             --expDiff;
6214         }
6215         else {
6216             bSig0 |= LIT64( 0x0001000000000000 );
6217         }
6218         shift128ExtraRightJamming(
6219             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6220         zExp = aExp;
6221     }
6222     else if ( expDiff < 0 ) {
6223         if ( bExp == 0x7FFF ) {
6224             if (bSig0 | bSig1) {
6225                 return propagateFloat128NaN(a, b, status);
6226             }
6227             return packFloat128( zSign, 0x7FFF, 0, 0 );
6228         }
6229         if ( aExp == 0 ) {
6230             ++expDiff;
6231         }
6232         else {
6233             aSig0 |= LIT64( 0x0001000000000000 );
6234         }
6235         shift128ExtraRightJamming(
6236             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6237         zExp = bExp;
6238     }
6239     else {
6240         if ( aExp == 0x7FFF ) {
6241             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6242                 return propagateFloat128NaN(a, b, status);
6243             }
6244             return a;
6245         }
6246         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6247         if ( aExp == 0 ) {
6248             if (status->flush_to_zero) {
6249                 if (zSig0 | zSig1) {
6250                     float_raise(float_flag_output_denormal, status);
6251                 }
6252                 return packFloat128(zSign, 0, 0, 0);
6253             }
6254             return packFloat128( zSign, 0, zSig0, zSig1 );
6255         }
6256         zSig2 = 0;
6257         zSig0 |= LIT64( 0x0002000000000000 );
6258         zExp = aExp;
6259         goto shiftRight1;
6260     }
6261     aSig0 |= LIT64( 0x0001000000000000 );
6262     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6263     --zExp;
6264     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6265     ++zExp;
6266  shiftRight1:
6267     shift128ExtraRightJamming(
6268         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6269  roundAndPack:
6270     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6271 
6272 }
6273 
6274 /*----------------------------------------------------------------------------
6275 | Returns the result of subtracting the absolute values of the quadruple-
6276 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6277 | difference is negated before being returned.  `zSign' is ignored if the
6278 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6279 | Standard for Binary Floating-Point Arithmetic.
6280 *----------------------------------------------------------------------------*/
6281 
6282 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6283                                 float_status *status)
6284 {
6285     int32_t aExp, bExp, zExp;
6286     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6287     int32_t expDiff;
6288 
6289     aSig1 = extractFloat128Frac1( a );
6290     aSig0 = extractFloat128Frac0( a );
6291     aExp = extractFloat128Exp( a );
6292     bSig1 = extractFloat128Frac1( b );
6293     bSig0 = extractFloat128Frac0( b );
6294     bExp = extractFloat128Exp( b );
6295     expDiff = aExp - bExp;
6296     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6297     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6298     if ( 0 < expDiff ) goto aExpBigger;
6299     if ( expDiff < 0 ) goto bExpBigger;
6300     if ( aExp == 0x7FFF ) {
6301         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6302             return propagateFloat128NaN(a, b, status);
6303         }
6304         float_raise(float_flag_invalid, status);
6305         return float128_default_nan(status);
6306     }
6307     if ( aExp == 0 ) {
6308         aExp = 1;
6309         bExp = 1;
6310     }
6311     if ( bSig0 < aSig0 ) goto aBigger;
6312     if ( aSig0 < bSig0 ) goto bBigger;
6313     if ( bSig1 < aSig1 ) goto aBigger;
6314     if ( aSig1 < bSig1 ) goto bBigger;
6315     return packFloat128(status->float_rounding_mode == float_round_down,
6316                         0, 0, 0);
6317  bExpBigger:
6318     if ( bExp == 0x7FFF ) {
6319         if (bSig0 | bSig1) {
6320             return propagateFloat128NaN(a, b, status);
6321         }
6322         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6323     }
6324     if ( aExp == 0 ) {
6325         ++expDiff;
6326     }
6327     else {
6328         aSig0 |= LIT64( 0x4000000000000000 );
6329     }
6330     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6331     bSig0 |= LIT64( 0x4000000000000000 );
6332  bBigger:
6333     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6334     zExp = bExp;
6335     zSign ^= 1;
6336     goto normalizeRoundAndPack;
6337  aExpBigger:
6338     if ( aExp == 0x7FFF ) {
6339         if (aSig0 | aSig1) {
6340             return propagateFloat128NaN(a, b, status);
6341         }
6342         return a;
6343     }
6344     if ( bExp == 0 ) {
6345         --expDiff;
6346     }
6347     else {
6348         bSig0 |= LIT64( 0x4000000000000000 );
6349     }
6350     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6351     aSig0 |= LIT64( 0x4000000000000000 );
6352  aBigger:
6353     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6354     zExp = aExp;
6355  normalizeRoundAndPack:
6356     --zExp;
6357     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6358                                          status);
6359 
6360 }
6361 
6362 /*----------------------------------------------------------------------------
6363 | Returns the result of adding the quadruple-precision floating-point values
6364 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6365 | for Binary Floating-Point Arithmetic.
6366 *----------------------------------------------------------------------------*/
6367 
6368 float128 float128_add(float128 a, float128 b, float_status *status)
6369 {
6370     flag aSign, bSign;
6371 
6372     aSign = extractFloat128Sign( a );
6373     bSign = extractFloat128Sign( b );
6374     if ( aSign == bSign ) {
6375         return addFloat128Sigs(a, b, aSign, status);
6376     }
6377     else {
6378         return subFloat128Sigs(a, b, aSign, status);
6379     }
6380 
6381 }
6382 
6383 /*----------------------------------------------------------------------------
6384 | Returns the result of subtracting the quadruple-precision floating-point
6385 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6386 | Standard for Binary Floating-Point Arithmetic.
6387 *----------------------------------------------------------------------------*/
6388 
6389 float128 float128_sub(float128 a, float128 b, float_status *status)
6390 {
6391     flag aSign, bSign;
6392 
6393     aSign = extractFloat128Sign( a );
6394     bSign = extractFloat128Sign( b );
6395     if ( aSign == bSign ) {
6396         return subFloat128Sigs(a, b, aSign, status);
6397     }
6398     else {
6399         return addFloat128Sigs(a, b, aSign, status);
6400     }
6401 
6402 }
6403 
6404 /*----------------------------------------------------------------------------
6405 | Returns the result of multiplying the quadruple-precision floating-point
6406 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6407 | Standard for Binary Floating-Point Arithmetic.
6408 *----------------------------------------------------------------------------*/
6409 
6410 float128 float128_mul(float128 a, float128 b, float_status *status)
6411 {
6412     flag aSign, bSign, zSign;
6413     int32_t aExp, bExp, zExp;
6414     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6415 
6416     aSig1 = extractFloat128Frac1( a );
6417     aSig0 = extractFloat128Frac0( a );
6418     aExp = extractFloat128Exp( a );
6419     aSign = extractFloat128Sign( a );
6420     bSig1 = extractFloat128Frac1( b );
6421     bSig0 = extractFloat128Frac0( b );
6422     bExp = extractFloat128Exp( b );
6423     bSign = extractFloat128Sign( b );
6424     zSign = aSign ^ bSign;
6425     if ( aExp == 0x7FFF ) {
6426         if (    ( aSig0 | aSig1 )
6427              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6428             return propagateFloat128NaN(a, b, status);
6429         }
6430         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6431         return packFloat128( zSign, 0x7FFF, 0, 0 );
6432     }
6433     if ( bExp == 0x7FFF ) {
6434         if (bSig0 | bSig1) {
6435             return propagateFloat128NaN(a, b, status);
6436         }
6437         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6438  invalid:
6439             float_raise(float_flag_invalid, status);
6440             return float128_default_nan(status);
6441         }
6442         return packFloat128( zSign, 0x7FFF, 0, 0 );
6443     }
6444     if ( aExp == 0 ) {
6445         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6446         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6447     }
6448     if ( bExp == 0 ) {
6449         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6450         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6451     }
6452     zExp = aExp + bExp - 0x4000;
6453     aSig0 |= LIT64( 0x0001000000000000 );
6454     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6455     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6456     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6457     zSig2 |= ( zSig3 != 0 );
6458     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6459         shift128ExtraRightJamming(
6460             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6461         ++zExp;
6462     }
6463     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6464 
6465 }
6466 
6467 /*----------------------------------------------------------------------------
6468 | Returns the result of dividing the quadruple-precision floating-point value
6469 | `a' by the corresponding value `b'.  The operation is performed according to
6470 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6471 *----------------------------------------------------------------------------*/
6472 
6473 float128 float128_div(float128 a, float128 b, float_status *status)
6474 {
6475     flag aSign, bSign, zSign;
6476     int32_t aExp, bExp, zExp;
6477     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6478     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6479 
6480     aSig1 = extractFloat128Frac1( a );
6481     aSig0 = extractFloat128Frac0( a );
6482     aExp = extractFloat128Exp( a );
6483     aSign = extractFloat128Sign( a );
6484     bSig1 = extractFloat128Frac1( b );
6485     bSig0 = extractFloat128Frac0( b );
6486     bExp = extractFloat128Exp( b );
6487     bSign = extractFloat128Sign( b );
6488     zSign = aSign ^ bSign;
6489     if ( aExp == 0x7FFF ) {
6490         if (aSig0 | aSig1) {
6491             return propagateFloat128NaN(a, b, status);
6492         }
6493         if ( bExp == 0x7FFF ) {
6494             if (bSig0 | bSig1) {
6495                 return propagateFloat128NaN(a, b, status);
6496             }
6497             goto invalid;
6498         }
6499         return packFloat128( zSign, 0x7FFF, 0, 0 );
6500     }
6501     if ( bExp == 0x7FFF ) {
6502         if (bSig0 | bSig1) {
6503             return propagateFloat128NaN(a, b, status);
6504         }
6505         return packFloat128( zSign, 0, 0, 0 );
6506     }
6507     if ( bExp == 0 ) {
6508         if ( ( bSig0 | bSig1 ) == 0 ) {
6509             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6510  invalid:
6511                 float_raise(float_flag_invalid, status);
6512                 return float128_default_nan(status);
6513             }
6514             float_raise(float_flag_divbyzero, status);
6515             return packFloat128( zSign, 0x7FFF, 0, 0 );
6516         }
6517         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6518     }
6519     if ( aExp == 0 ) {
6520         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6521         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6522     }
6523     zExp = aExp - bExp + 0x3FFD;
6524     shortShift128Left(
6525         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6526     shortShift128Left(
6527         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6528     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6529         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6530         ++zExp;
6531     }
6532     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6533     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6534     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6535     while ( (int64_t) rem0 < 0 ) {
6536         --zSig0;
6537         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6538     }
6539     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6540     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6541         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6542         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6543         while ( (int64_t) rem1 < 0 ) {
6544             --zSig1;
6545             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6546         }
6547         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6548     }
6549     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6550     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6551 
6552 }
6553 
6554 /*----------------------------------------------------------------------------
6555 | Returns the remainder of the quadruple-precision floating-point value `a'
6556 | with respect to the corresponding value `b'.  The operation is performed
6557 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6558 *----------------------------------------------------------------------------*/
6559 
6560 float128 float128_rem(float128 a, float128 b, float_status *status)
6561 {
6562     flag aSign, zSign;
6563     int32_t aExp, bExp, expDiff;
6564     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6565     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6566     int64_t sigMean0;
6567 
6568     aSig1 = extractFloat128Frac1( a );
6569     aSig0 = extractFloat128Frac0( a );
6570     aExp = extractFloat128Exp( a );
6571     aSign = extractFloat128Sign( a );
6572     bSig1 = extractFloat128Frac1( b );
6573     bSig0 = extractFloat128Frac0( b );
6574     bExp = extractFloat128Exp( b );
6575     if ( aExp == 0x7FFF ) {
6576         if (    ( aSig0 | aSig1 )
6577              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6578             return propagateFloat128NaN(a, b, status);
6579         }
6580         goto invalid;
6581     }
6582     if ( bExp == 0x7FFF ) {
6583         if (bSig0 | bSig1) {
6584             return propagateFloat128NaN(a, b, status);
6585         }
6586         return a;
6587     }
6588     if ( bExp == 0 ) {
6589         if ( ( bSig0 | bSig1 ) == 0 ) {
6590  invalid:
6591             float_raise(float_flag_invalid, status);
6592             return float128_default_nan(status);
6593         }
6594         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6595     }
6596     if ( aExp == 0 ) {
6597         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6598         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6599     }
6600     expDiff = aExp - bExp;
6601     if ( expDiff < -1 ) return a;
6602     shortShift128Left(
6603         aSig0 | LIT64( 0x0001000000000000 ),
6604         aSig1,
6605         15 - ( expDiff < 0 ),
6606         &aSig0,
6607         &aSig1
6608     );
6609     shortShift128Left(
6610         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6611     q = le128( bSig0, bSig1, aSig0, aSig1 );
6612     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6613     expDiff -= 64;
6614     while ( 0 < expDiff ) {
6615         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6616         q = ( 4 < q ) ? q - 4 : 0;
6617         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6618         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6619         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6620         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6621         expDiff -= 61;
6622     }
6623     if ( -64 < expDiff ) {
6624         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6625         q = ( 4 < q ) ? q - 4 : 0;
6626         q >>= - expDiff;
6627         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6628         expDiff += 52;
6629         if ( expDiff < 0 ) {
6630             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6631         }
6632         else {
6633             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6634         }
6635         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6636         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6637     }
6638     else {
6639         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6640         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6641     }
6642     do {
6643         alternateASig0 = aSig0;
6644         alternateASig1 = aSig1;
6645         ++q;
6646         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6647     } while ( 0 <= (int64_t) aSig0 );
6648     add128(
6649         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6650     if (    ( sigMean0 < 0 )
6651          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6652         aSig0 = alternateASig0;
6653         aSig1 = alternateASig1;
6654     }
6655     zSign = ( (int64_t) aSig0 < 0 );
6656     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6657     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6658                                          status);
6659 }
6660 
6661 /*----------------------------------------------------------------------------
6662 | Returns the square root of the quadruple-precision floating-point value `a'.
6663 | The operation is performed according to the IEC/IEEE Standard for Binary
6664 | Floating-Point Arithmetic.
6665 *----------------------------------------------------------------------------*/
6666 
6667 float128 float128_sqrt(float128 a, float_status *status)
6668 {
6669     flag aSign;
6670     int32_t aExp, zExp;
6671     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6672     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6673 
6674     aSig1 = extractFloat128Frac1( a );
6675     aSig0 = extractFloat128Frac0( a );
6676     aExp = extractFloat128Exp( a );
6677     aSign = extractFloat128Sign( a );
6678     if ( aExp == 0x7FFF ) {
6679         if (aSig0 | aSig1) {
6680             return propagateFloat128NaN(a, a, status);
6681         }
6682         if ( ! aSign ) return a;
6683         goto invalid;
6684     }
6685     if ( aSign ) {
6686         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6687  invalid:
6688         float_raise(float_flag_invalid, status);
6689         return float128_default_nan(status);
6690     }
6691     if ( aExp == 0 ) {
6692         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6693         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6694     }
6695     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6696     aSig0 |= LIT64( 0x0001000000000000 );
6697     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6698     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6699     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6700     doubleZSig0 = zSig0<<1;
6701     mul64To128( zSig0, zSig0, &term0, &term1 );
6702     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6703     while ( (int64_t) rem0 < 0 ) {
6704         --zSig0;
6705         doubleZSig0 -= 2;
6706         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6707     }
6708     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6709     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6710         if ( zSig1 == 0 ) zSig1 = 1;
6711         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6712         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6713         mul64To128( zSig1, zSig1, &term2, &term3 );
6714         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6715         while ( (int64_t) rem1 < 0 ) {
6716             --zSig1;
6717             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6718             term3 |= 1;
6719             term2 |= doubleZSig0;
6720             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6721         }
6722         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6723     }
6724     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6725     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6726 
6727 }
6728 
6729 /*----------------------------------------------------------------------------
6730 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6731 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6732 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6733 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6734 *----------------------------------------------------------------------------*/
6735 
6736 int float128_eq(float128 a, float128 b, float_status *status)
6737 {
6738 
6739     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6740               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6741          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6742               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6743        ) {
6744         float_raise(float_flag_invalid, status);
6745         return 0;
6746     }
6747     return
6748            ( a.low == b.low )
6749         && (    ( a.high == b.high )
6750              || (    ( a.low == 0 )
6751                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6752            );
6753 
6754 }
6755 
6756 /*----------------------------------------------------------------------------
6757 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6758 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6759 | exception is raised if either operand is a NaN.  The comparison is performed
6760 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6761 *----------------------------------------------------------------------------*/
6762 
6763 int float128_le(float128 a, float128 b, float_status *status)
6764 {
6765     flag aSign, bSign;
6766 
6767     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6768               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6769          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6770               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6771        ) {
6772         float_raise(float_flag_invalid, status);
6773         return 0;
6774     }
6775     aSign = extractFloat128Sign( a );
6776     bSign = extractFloat128Sign( b );
6777     if ( aSign != bSign ) {
6778         return
6779                aSign
6780             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6781                  == 0 );
6782     }
6783     return
6784           aSign ? le128( b.high, b.low, a.high, a.low )
6785         : le128( a.high, a.low, b.high, b.low );
6786 
6787 }
6788 
6789 /*----------------------------------------------------------------------------
6790 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6791 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6792 | raised if either operand is a NaN.  The comparison is performed according
6793 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6794 *----------------------------------------------------------------------------*/
6795 
6796 int float128_lt(float128 a, float128 b, float_status *status)
6797 {
6798     flag aSign, bSign;
6799 
6800     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6801               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6802          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6803               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6804        ) {
6805         float_raise(float_flag_invalid, status);
6806         return 0;
6807     }
6808     aSign = extractFloat128Sign( a );
6809     bSign = extractFloat128Sign( b );
6810     if ( aSign != bSign ) {
6811         return
6812                aSign
6813             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6814                  != 0 );
6815     }
6816     return
6817           aSign ? lt128( b.high, b.low, a.high, a.low )
6818         : lt128( a.high, a.low, b.high, b.low );
6819 
6820 }
6821 
6822 /*----------------------------------------------------------------------------
6823 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6824 | be compared, and 0 otherwise.  The invalid exception is raised if either
6825 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6826 | Standard for Binary Floating-Point Arithmetic.
6827 *----------------------------------------------------------------------------*/
6828 
6829 int float128_unordered(float128 a, float128 b, float_status *status)
6830 {
6831     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6832               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6833          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6834               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6835        ) {
6836         float_raise(float_flag_invalid, status);
6837         return 1;
6838     }
6839     return 0;
6840 }
6841 
6842 /*----------------------------------------------------------------------------
6843 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6844 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6845 | exception.  The comparison is performed according to the IEC/IEEE Standard
6846 | for Binary Floating-Point Arithmetic.
6847 *----------------------------------------------------------------------------*/
6848 
6849 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6850 {
6851 
6852     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6853               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6854          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6855               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6856        ) {
6857         if (float128_is_signaling_nan(a, status)
6858          || float128_is_signaling_nan(b, status)) {
6859             float_raise(float_flag_invalid, status);
6860         }
6861         return 0;
6862     }
6863     return
6864            ( a.low == b.low )
6865         && (    ( a.high == b.high )
6866              || (    ( a.low == 0 )
6867                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6868            );
6869 
6870 }
6871 
6872 /*----------------------------------------------------------------------------
6873 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6874 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6875 | cause an exception.  Otherwise, the comparison is performed according to the
6876 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6877 *----------------------------------------------------------------------------*/
6878 
6879 int float128_le_quiet(float128 a, float128 b, float_status *status)
6880 {
6881     flag aSign, bSign;
6882 
6883     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6884               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6885          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6886               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6887        ) {
6888         if (float128_is_signaling_nan(a, status)
6889          || float128_is_signaling_nan(b, status)) {
6890             float_raise(float_flag_invalid, status);
6891         }
6892         return 0;
6893     }
6894     aSign = extractFloat128Sign( a );
6895     bSign = extractFloat128Sign( b );
6896     if ( aSign != bSign ) {
6897         return
6898                aSign
6899             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6900                  == 0 );
6901     }
6902     return
6903           aSign ? le128( b.high, b.low, a.high, a.low )
6904         : le128( a.high, a.low, b.high, b.low );
6905 
6906 }
6907 
6908 /*----------------------------------------------------------------------------
6909 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6910 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6911 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6912 | Standard for Binary Floating-Point Arithmetic.
6913 *----------------------------------------------------------------------------*/
6914 
6915 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6916 {
6917     flag aSign, bSign;
6918 
6919     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6920               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6921          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6922               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6923        ) {
6924         if (float128_is_signaling_nan(a, status)
6925          || float128_is_signaling_nan(b, status)) {
6926             float_raise(float_flag_invalid, status);
6927         }
6928         return 0;
6929     }
6930     aSign = extractFloat128Sign( a );
6931     bSign = extractFloat128Sign( b );
6932     if ( aSign != bSign ) {
6933         return
6934                aSign
6935             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6936                  != 0 );
6937     }
6938     return
6939           aSign ? lt128( b.high, b.low, a.high, a.low )
6940         : lt128( a.high, a.low, b.high, b.low );
6941 
6942 }
6943 
6944 /*----------------------------------------------------------------------------
6945 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6946 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6947 | comparison is performed according to the IEC/IEEE Standard for Binary
6948 | Floating-Point Arithmetic.
6949 *----------------------------------------------------------------------------*/
6950 
6951 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6952 {
6953     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6954               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6955          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6956               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6957        ) {
6958         if (float128_is_signaling_nan(a, status)
6959          || float128_is_signaling_nan(b, status)) {
6960             float_raise(float_flag_invalid, status);
6961         }
6962         return 1;
6963     }
6964     return 0;
6965 }
6966 
6967 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6968                                             int is_quiet, float_status *status)
6969 {
6970     flag aSign, bSign;
6971 
6972     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6973         float_raise(float_flag_invalid, status);
6974         return float_relation_unordered;
6975     }
6976     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6977           ( extractFloatx80Frac( a )<<1 ) ) ||
6978         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6979           ( extractFloatx80Frac( b )<<1 ) )) {
6980         if (!is_quiet ||
6981             floatx80_is_signaling_nan(a, status) ||
6982             floatx80_is_signaling_nan(b, status)) {
6983             float_raise(float_flag_invalid, status);
6984         }
6985         return float_relation_unordered;
6986     }
6987     aSign = extractFloatx80Sign( a );
6988     bSign = extractFloatx80Sign( b );
6989     if ( aSign != bSign ) {
6990 
6991         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6992              ( ( a.low | b.low ) == 0 ) ) {
6993             /* zero case */
6994             return float_relation_equal;
6995         } else {
6996             return 1 - (2 * aSign);
6997         }
6998     } else {
6999         if (a.low == b.low && a.high == b.high) {
7000             return float_relation_equal;
7001         } else {
7002             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7003         }
7004     }
7005 }
7006 
7007 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7008 {
7009     return floatx80_compare_internal(a, b, 0, status);
7010 }
7011 
7012 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7013 {
7014     return floatx80_compare_internal(a, b, 1, status);
7015 }
7016 
7017 static inline int float128_compare_internal(float128 a, float128 b,
7018                                             int is_quiet, float_status *status)
7019 {
7020     flag aSign, bSign;
7021 
7022     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7023           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7024         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7025           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7026         if (!is_quiet ||
7027             float128_is_signaling_nan(a, status) ||
7028             float128_is_signaling_nan(b, status)) {
7029             float_raise(float_flag_invalid, status);
7030         }
7031         return float_relation_unordered;
7032     }
7033     aSign = extractFloat128Sign( a );
7034     bSign = extractFloat128Sign( b );
7035     if ( aSign != bSign ) {
7036         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7037             /* zero case */
7038             return float_relation_equal;
7039         } else {
7040             return 1 - (2 * aSign);
7041         }
7042     } else {
7043         if (a.low == b.low && a.high == b.high) {
7044             return float_relation_equal;
7045         } else {
7046             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7047         }
7048     }
7049 }
7050 
7051 int float128_compare(float128 a, float128 b, float_status *status)
7052 {
7053     return float128_compare_internal(a, b, 0, status);
7054 }
7055 
7056 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7057 {
7058     return float128_compare_internal(a, b, 1, status);
7059 }
7060 
7061 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7062 {
7063     flag aSign;
7064     int32_t aExp;
7065     uint64_t aSig;
7066 
7067     if (floatx80_invalid_encoding(a)) {
7068         float_raise(float_flag_invalid, status);
7069         return floatx80_default_nan(status);
7070     }
7071     aSig = extractFloatx80Frac( a );
7072     aExp = extractFloatx80Exp( a );
7073     aSign = extractFloatx80Sign( a );
7074 
7075     if ( aExp == 0x7FFF ) {
7076         if ( aSig<<1 ) {
7077             return propagateFloatx80NaN(a, a, status);
7078         }
7079         return a;
7080     }
7081 
7082     if (aExp == 0) {
7083         if (aSig == 0) {
7084             return a;
7085         }
7086         aExp++;
7087     }
7088 
7089     if (n > 0x10000) {
7090         n = 0x10000;
7091     } else if (n < -0x10000) {
7092         n = -0x10000;
7093     }
7094 
7095     aExp += n;
7096     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7097                                          aSign, aExp, aSig, 0, status);
7098 }
7099 
7100 float128 float128_scalbn(float128 a, int n, float_status *status)
7101 {
7102     flag aSign;
7103     int32_t aExp;
7104     uint64_t aSig0, aSig1;
7105 
7106     aSig1 = extractFloat128Frac1( a );
7107     aSig0 = extractFloat128Frac0( a );
7108     aExp = extractFloat128Exp( a );
7109     aSign = extractFloat128Sign( a );
7110     if ( aExp == 0x7FFF ) {
7111         if ( aSig0 | aSig1 ) {
7112             return propagateFloat128NaN(a, a, status);
7113         }
7114         return a;
7115     }
7116     if (aExp != 0) {
7117         aSig0 |= LIT64( 0x0001000000000000 );
7118     } else if (aSig0 == 0 && aSig1 == 0) {
7119         return a;
7120     } else {
7121         aExp++;
7122     }
7123 
7124     if (n > 0x10000) {
7125         n = 0x10000;
7126     } else if (n < -0x10000) {
7127         n = -0x10000;
7128     }
7129 
7130     aExp += n - 1;
7131     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7132                                          , status);
7133 
7134 }
7135