xref: /qemu/fpu/softfloat.c (revision 0bfc9f195209593e91a98cf2233753f56a2e5c02)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
88 
89 /* We only need stdlib for abort() */
90 
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations.  (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "softfloat-macros.h"
97 
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine:  (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output.  These details are target-
104 | specific.
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
107 
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
111 
112 static inline uint32_t extractFloat16Frac(float16 a)
113 {
114     return float16_val(a) & 0x3ff;
115 }
116 
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
120 
121 static inline int extractFloat16Exp(float16 a)
122 {
123     return (float16_val(a) >> 10) & 0x1f;
124 }
125 
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
129 
130 static inline flag extractFloat16Sign(float16 a)
131 {
132     return float16_val(a)>>15;
133 }
134 
135 /*----------------------------------------------------------------------------
136 | Returns the fraction bits of the single-precision floating-point value `a'.
137 *----------------------------------------------------------------------------*/
138 
139 static inline uint32_t extractFloat32Frac(float32 a)
140 {
141     return float32_val(a) & 0x007FFFFF;
142 }
143 
144 /*----------------------------------------------------------------------------
145 | Returns the exponent bits of the single-precision floating-point value `a'.
146 *----------------------------------------------------------------------------*/
147 
148 static inline int extractFloat32Exp(float32 a)
149 {
150     return (float32_val(a) >> 23) & 0xFF;
151 }
152 
153 /*----------------------------------------------------------------------------
154 | Returns the sign bit of the single-precision floating-point value `a'.
155 *----------------------------------------------------------------------------*/
156 
157 static inline flag extractFloat32Sign(float32 a)
158 {
159     return float32_val(a) >> 31;
160 }
161 
162 /*----------------------------------------------------------------------------
163 | Returns the fraction bits of the double-precision floating-point value `a'.
164 *----------------------------------------------------------------------------*/
165 
166 static inline uint64_t extractFloat64Frac(float64 a)
167 {
168     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
169 }
170 
171 /*----------------------------------------------------------------------------
172 | Returns the exponent bits of the double-precision floating-point value `a'.
173 *----------------------------------------------------------------------------*/
174 
175 static inline int extractFloat64Exp(float64 a)
176 {
177     return (float64_val(a) >> 52) & 0x7FF;
178 }
179 
180 /*----------------------------------------------------------------------------
181 | Returns the sign bit of the double-precision floating-point value `a'.
182 *----------------------------------------------------------------------------*/
183 
184 static inline flag extractFloat64Sign(float64 a)
185 {
186     return float64_val(a) >> 63;
187 }
188 
189 /*
190  * Classify a floating point number. Everything above float_class_qnan
191  * is a NaN so cls >= float_class_qnan is any NaN.
192  */
193 
194 typedef enum __attribute__ ((__packed__)) {
195     float_class_unclassified,
196     float_class_zero,
197     float_class_normal,
198     float_class_inf,
199     float_class_qnan,  /* all NaNs from here */
200     float_class_snan,
201     float_class_dnan,
202     float_class_msnan, /* maybe silenced */
203 } FloatClass;
204 
205 /*
206  * Structure holding all of the decomposed parts of a float. The
207  * exponent is unbiased and the fraction is normalized. All
208  * calculations are done with a 64 bit fraction and then rounded as
209  * appropriate for the final format.
210  *
211  * Thanks to the packed FloatClass a decent compiler should be able to
212  * fit the whole structure into registers and avoid using the stack
213  * for parameter passing.
214  */
215 
216 typedef struct {
217     uint64_t frac;
218     int32_t  exp;
219     FloatClass cls;
220     bool sign;
221 } FloatParts;
222 
223 #define DECOMPOSED_BINARY_POINT    (64 - 2)
224 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
225 #define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
226 
227 /* Structure holding all of the relevant parameters for a format.
228  *   exp_size: the size of the exponent field
229  *   exp_bias: the offset applied to the exponent field
230  *   exp_max: the maximum normalised exponent
231  *   frac_size: the size of the fraction field
232  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
233  * The following are computed based the size of fraction
234  *   frac_lsb: least significant bit of fraction
235  *   fram_lsbm1: the bit bellow the least significant bit (for rounding)
236  *   round_mask/roundeven_mask: masks used for rounding
237  */
238 typedef struct {
239     int exp_size;
240     int exp_bias;
241     int exp_max;
242     int frac_size;
243     int frac_shift;
244     uint64_t frac_lsb;
245     uint64_t frac_lsbm1;
246     uint64_t round_mask;
247     uint64_t roundeven_mask;
248 } FloatFmt;
249 
250 /* Expand fields based on the size of exponent and fraction */
251 #define FLOAT_PARAMS(E, F)                                           \
252     .exp_size       = E,                                             \
253     .exp_bias       = ((1 << E) - 1) >> 1,                           \
254     .exp_max        = (1 << E) - 1,                                  \
255     .frac_size      = F,                                             \
256     .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
257     .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
258     .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
259     .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
260     .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
261 
262 static const FloatFmt float16_params = {
263     FLOAT_PARAMS(5, 10)
264 };
265 
266 static const FloatFmt float32_params = {
267     FLOAT_PARAMS(8, 23)
268 };
269 
270 static const FloatFmt float64_params = {
271     FLOAT_PARAMS(11, 52)
272 };
273 
274 /* Unpack a float to parts, but do not canonicalize.  */
275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
276 {
277     const int sign_pos = fmt.frac_size + fmt.exp_size;
278 
279     return (FloatParts) {
280         .cls = float_class_unclassified,
281         .sign = extract64(raw, sign_pos, 1),
282         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
283         .frac = extract64(raw, 0, fmt.frac_size),
284     };
285 }
286 
287 static inline FloatParts float16_unpack_raw(float16 f)
288 {
289     return unpack_raw(float16_params, f);
290 }
291 
292 static inline FloatParts float32_unpack_raw(float32 f)
293 {
294     return unpack_raw(float32_params, f);
295 }
296 
297 static inline FloatParts float64_unpack_raw(float64 f)
298 {
299     return unpack_raw(float64_params, f);
300 }
301 
302 /* Pack a float from parts, but do not canonicalize.  */
303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
304 {
305     const int sign_pos = fmt.frac_size + fmt.exp_size;
306     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
307     return deposit64(ret, sign_pos, 1, p.sign);
308 }
309 
310 static inline float16 float16_pack_raw(FloatParts p)
311 {
312     return make_float16(pack_raw(float16_params, p));
313 }
314 
315 static inline float32 float32_pack_raw(FloatParts p)
316 {
317     return make_float32(pack_raw(float32_params, p));
318 }
319 
320 static inline float64 float64_pack_raw(FloatParts p)
321 {
322     return make_float64(pack_raw(float64_params, p));
323 }
324 
325 /* Canonicalize EXP and FRAC, setting CLS.  */
326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
327                                float_status *status)
328 {
329     if (part.exp == parm->exp_max) {
330         if (part.frac == 0) {
331             part.cls = float_class_inf;
332         } else {
333 #ifdef NO_SIGNALING_NANS
334             part.cls = float_class_qnan;
335 #else
336             int64_t msb = part.frac << (parm->frac_shift + 2);
337             if ((msb < 0) == status->snan_bit_is_one) {
338                 part.cls = float_class_snan;
339             } else {
340                 part.cls = float_class_qnan;
341             }
342 #endif
343         }
344     } else if (part.exp == 0) {
345         if (likely(part.frac == 0)) {
346             part.cls = float_class_zero;
347         } else if (status->flush_inputs_to_zero) {
348             float_raise(float_flag_input_denormal, status);
349             part.cls = float_class_zero;
350             part.frac = 0;
351         } else {
352             int shift = clz64(part.frac) - 1;
353             part.cls = float_class_normal;
354             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
355             part.frac <<= shift;
356         }
357     } else {
358         part.cls = float_class_normal;
359         part.exp -= parm->exp_bias;
360         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
361     }
362     return part;
363 }
364 
365 /* Round and uncanonicalize a floating-point number by parts. There
366  * are FRAC_SHIFT bits that may require rounding at the bottom of the
367  * fraction; these bits will be removed. The exponent will be biased
368  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
369  */
370 
371 static FloatParts round_canonical(FloatParts p, float_status *s,
372                                   const FloatFmt *parm)
373 {
374     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
375     const uint64_t round_mask = parm->round_mask;
376     const uint64_t roundeven_mask = parm->roundeven_mask;
377     const int exp_max = parm->exp_max;
378     const int frac_shift = parm->frac_shift;
379     uint64_t frac, inc;
380     int exp, flags = 0;
381     bool overflow_norm;
382 
383     frac = p.frac;
384     exp = p.exp;
385 
386     switch (p.cls) {
387     case float_class_normal:
388         switch (s->float_rounding_mode) {
389         case float_round_nearest_even:
390             overflow_norm = false;
391             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
392             break;
393         case float_round_ties_away:
394             overflow_norm = false;
395             inc = frac_lsbm1;
396             break;
397         case float_round_to_zero:
398             overflow_norm = true;
399             inc = 0;
400             break;
401         case float_round_up:
402             inc = p.sign ? 0 : round_mask;
403             overflow_norm = p.sign;
404             break;
405         case float_round_down:
406             inc = p.sign ? round_mask : 0;
407             overflow_norm = !p.sign;
408             break;
409         default:
410             g_assert_not_reached();
411         }
412 
413         exp += parm->exp_bias;
414         if (likely(exp > 0)) {
415             if (frac & round_mask) {
416                 flags |= float_flag_inexact;
417                 frac += inc;
418                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
419                     frac >>= 1;
420                     exp++;
421                 }
422             }
423             frac >>= frac_shift;
424 
425             if (unlikely(exp >= exp_max)) {
426                 flags |= float_flag_overflow | float_flag_inexact;
427                 if (overflow_norm) {
428                     exp = exp_max - 1;
429                     frac = -1;
430                 } else {
431                     p.cls = float_class_inf;
432                     goto do_inf;
433                 }
434             }
435         } else if (s->flush_to_zero) {
436             flags |= float_flag_output_denormal;
437             p.cls = float_class_zero;
438             goto do_zero;
439         } else {
440             bool is_tiny = (s->float_detect_tininess
441                             == float_tininess_before_rounding)
442                         || (exp < 0)
443                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
444 
445             shift64RightJamming(frac, 1 - exp, &frac);
446             if (frac & round_mask) {
447                 /* Need to recompute round-to-even.  */
448                 if (s->float_rounding_mode == float_round_nearest_even) {
449                     inc = ((frac & roundeven_mask) != frac_lsbm1
450                            ? frac_lsbm1 : 0);
451                 }
452                 flags |= float_flag_inexact;
453                 frac += inc;
454             }
455 
456             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
457             frac >>= frac_shift;
458 
459             if (is_tiny && (flags & float_flag_inexact)) {
460                 flags |= float_flag_underflow;
461             }
462             if (exp == 0 && frac == 0) {
463                 p.cls = float_class_zero;
464             }
465         }
466         break;
467 
468     case float_class_zero:
469     do_zero:
470         exp = 0;
471         frac = 0;
472         break;
473 
474     case float_class_inf:
475     do_inf:
476         exp = exp_max;
477         frac = 0;
478         break;
479 
480     case float_class_qnan:
481     case float_class_snan:
482         exp = exp_max;
483         break;
484 
485     default:
486         g_assert_not_reached();
487     }
488 
489     float_raise(flags, s);
490     p.exp = exp;
491     p.frac = frac;
492     return p;
493 }
494 
495 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
496 {
497     return canonicalize(float16_unpack_raw(f), &float16_params, s);
498 }
499 
500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
501 {
502     switch (p.cls) {
503     case float_class_dnan:
504         return float16_default_nan(s);
505     case float_class_msnan:
506         return float16_maybe_silence_nan(float16_pack_raw(p), s);
507     default:
508         p = round_canonical(p, s, &float16_params);
509         return float16_pack_raw(p);
510     }
511 }
512 
513 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
514 {
515     return canonicalize(float32_unpack_raw(f), &float32_params, s);
516 }
517 
518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
519 {
520     switch (p.cls) {
521     case float_class_dnan:
522         return float32_default_nan(s);
523     case float_class_msnan:
524         return float32_maybe_silence_nan(float32_pack_raw(p), s);
525     default:
526         p = round_canonical(p, s, &float32_params);
527         return float32_pack_raw(p);
528     }
529 }
530 
531 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
532 {
533     return canonicalize(float64_unpack_raw(f), &float64_params, s);
534 }
535 
536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
537 {
538     switch (p.cls) {
539     case float_class_dnan:
540         return float64_default_nan(s);
541     case float_class_msnan:
542         return float64_maybe_silence_nan(float64_pack_raw(p), s);
543     default:
544         p = round_canonical(p, s, &float64_params);
545         return float64_pack_raw(p);
546     }
547 }
548 
549 /* Simple helpers for checking if what NaN we have */
550 static bool is_nan(FloatClass c)
551 {
552     return unlikely(c >= float_class_qnan);
553 }
554 static bool is_snan(FloatClass c)
555 {
556     return c == float_class_snan;
557 }
558 static bool is_qnan(FloatClass c)
559 {
560     return c == float_class_qnan;
561 }
562 
563 static FloatParts return_nan(FloatParts a, float_status *s)
564 {
565     switch (a.cls) {
566     case float_class_snan:
567         s->float_exception_flags |= float_flag_invalid;
568         a.cls = float_class_msnan;
569         /* fall through */
570     case float_class_qnan:
571         if (s->default_nan_mode) {
572             a.cls = float_class_dnan;
573         }
574         break;
575 
576     default:
577         g_assert_not_reached();
578     }
579     return a;
580 }
581 
582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
583 {
584     if (is_snan(a.cls) || is_snan(b.cls)) {
585         s->float_exception_flags |= float_flag_invalid;
586     }
587 
588     if (s->default_nan_mode) {
589         a.cls = float_class_dnan;
590     } else {
591         if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
592                     is_qnan(b.cls), is_snan(b.cls),
593                     a.frac > b.frac ||
594                     (a.frac == b.frac && a.sign < b.sign))) {
595             a = b;
596         }
597         a.cls = float_class_msnan;
598     }
599     return a;
600 }
601 
602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
603                                   bool inf_zero, float_status *s)
604 {
605     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
606         s->float_exception_flags |= float_flag_invalid;
607     }
608 
609     if (s->default_nan_mode) {
610         a.cls = float_class_dnan;
611     } else {
612         switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
613                               is_qnan(b.cls), is_snan(b.cls),
614                               is_qnan(c.cls), is_snan(c.cls),
615                               inf_zero, s)) {
616         case 0:
617             break;
618         case 1:
619             a = b;
620             break;
621         case 2:
622             a = c;
623             break;
624         case 3:
625             a.cls = float_class_dnan;
626             return a;
627         default:
628             g_assert_not_reached();
629         }
630 
631         a.cls = float_class_msnan;
632     }
633     return a;
634 }
635 
636 /*
637  * Returns the result of adding or subtracting the values of the
638  * floating-point values `a' and `b'. The operation is performed
639  * according to the IEC/IEEE Standard for Binary Floating-Point
640  * Arithmetic.
641  */
642 
643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
644                                 float_status *s)
645 {
646     bool a_sign = a.sign;
647     bool b_sign = b.sign ^ subtract;
648 
649     if (a_sign != b_sign) {
650         /* Subtraction */
651 
652         if (a.cls == float_class_normal && b.cls == float_class_normal) {
653             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
654                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
655                 a.frac = a.frac - b.frac;
656             } else {
657                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
658                 a.frac = b.frac - a.frac;
659                 a.exp = b.exp;
660                 a_sign ^= 1;
661             }
662 
663             if (a.frac == 0) {
664                 a.cls = float_class_zero;
665                 a.sign = s->float_rounding_mode == float_round_down;
666             } else {
667                 int shift = clz64(a.frac) - 1;
668                 a.frac = a.frac << shift;
669                 a.exp = a.exp - shift;
670                 a.sign = a_sign;
671             }
672             return a;
673         }
674         if (is_nan(a.cls) || is_nan(b.cls)) {
675             return pick_nan(a, b, s);
676         }
677         if (a.cls == float_class_inf) {
678             if (b.cls == float_class_inf) {
679                 float_raise(float_flag_invalid, s);
680                 a.cls = float_class_dnan;
681             }
682             return a;
683         }
684         if (a.cls == float_class_zero && b.cls == float_class_zero) {
685             a.sign = s->float_rounding_mode == float_round_down;
686             return a;
687         }
688         if (a.cls == float_class_zero || b.cls == float_class_inf) {
689             b.sign = a_sign ^ 1;
690             return b;
691         }
692         if (b.cls == float_class_zero) {
693             return a;
694         }
695     } else {
696         /* Addition */
697         if (a.cls == float_class_normal && b.cls == float_class_normal) {
698             if (a.exp > b.exp) {
699                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
700             } else if (a.exp < b.exp) {
701                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
702                 a.exp = b.exp;
703             }
704             a.frac += b.frac;
705             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
706                 a.frac >>= 1;
707                 a.exp += 1;
708             }
709             return a;
710         }
711         if (is_nan(a.cls) || is_nan(b.cls)) {
712             return pick_nan(a, b, s);
713         }
714         if (a.cls == float_class_inf || b.cls == float_class_zero) {
715             return a;
716         }
717         if (b.cls == float_class_inf || a.cls == float_class_zero) {
718             b.sign = b_sign;
719             return b;
720         }
721     }
722     g_assert_not_reached();
723 }
724 
725 /*
726  * Returns the result of adding or subtracting the floating-point
727  * values `a' and `b'. The operation is performed according to the
728  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
729  */
730 
731 float16  __attribute__((flatten)) float16_add(float16 a, float16 b,
732                                               float_status *status)
733 {
734     FloatParts pa = float16_unpack_canonical(a, status);
735     FloatParts pb = float16_unpack_canonical(b, status);
736     FloatParts pr = addsub_floats(pa, pb, false, status);
737 
738     return float16_round_pack_canonical(pr, status);
739 }
740 
741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
742                                              float_status *status)
743 {
744     FloatParts pa = float32_unpack_canonical(a, status);
745     FloatParts pb = float32_unpack_canonical(b, status);
746     FloatParts pr = addsub_floats(pa, pb, false, status);
747 
748     return float32_round_pack_canonical(pr, status);
749 }
750 
751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
752                                              float_status *status)
753 {
754     FloatParts pa = float64_unpack_canonical(a, status);
755     FloatParts pb = float64_unpack_canonical(b, status);
756     FloatParts pr = addsub_floats(pa, pb, false, status);
757 
758     return float64_round_pack_canonical(pr, status);
759 }
760 
761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
762                                              float_status *status)
763 {
764     FloatParts pa = float16_unpack_canonical(a, status);
765     FloatParts pb = float16_unpack_canonical(b, status);
766     FloatParts pr = addsub_floats(pa, pb, true, status);
767 
768     return float16_round_pack_canonical(pr, status);
769 }
770 
771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
772                                              float_status *status)
773 {
774     FloatParts pa = float32_unpack_canonical(a, status);
775     FloatParts pb = float32_unpack_canonical(b, status);
776     FloatParts pr = addsub_floats(pa, pb, true, status);
777 
778     return float32_round_pack_canonical(pr, status);
779 }
780 
781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
782                                              float_status *status)
783 {
784     FloatParts pa = float64_unpack_canonical(a, status);
785     FloatParts pb = float64_unpack_canonical(b, status);
786     FloatParts pr = addsub_floats(pa, pb, true, status);
787 
788     return float64_round_pack_canonical(pr, status);
789 }
790 
791 /*
792  * Returns the result of multiplying the floating-point values `a' and
793  * `b'. The operation is performed according to the IEC/IEEE Standard
794  * for Binary Floating-Point Arithmetic.
795  */
796 
797 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
798 {
799     bool sign = a.sign ^ b.sign;
800 
801     if (a.cls == float_class_normal && b.cls == float_class_normal) {
802         uint64_t hi, lo;
803         int exp = a.exp + b.exp;
804 
805         mul64To128(a.frac, b.frac, &hi, &lo);
806         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
807         if (lo & DECOMPOSED_OVERFLOW_BIT) {
808             shift64RightJamming(lo, 1, &lo);
809             exp += 1;
810         }
811 
812         /* Re-use a */
813         a.exp = exp;
814         a.sign = sign;
815         a.frac = lo;
816         return a;
817     }
818     /* handle all the NaN cases */
819     if (is_nan(a.cls) || is_nan(b.cls)) {
820         return pick_nan(a, b, s);
821     }
822     /* Inf * Zero == NaN */
823     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
824         (a.cls == float_class_zero && b.cls == float_class_inf)) {
825         s->float_exception_flags |= float_flag_invalid;
826         a.cls = float_class_dnan;
827         a.sign = sign;
828         return a;
829     }
830     /* Multiply by 0 or Inf */
831     if (a.cls == float_class_inf || a.cls == float_class_zero) {
832         a.sign = sign;
833         return a;
834     }
835     if (b.cls == float_class_inf || b.cls == float_class_zero) {
836         b.sign = sign;
837         return b;
838     }
839     g_assert_not_reached();
840 }
841 
842 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
843                                              float_status *status)
844 {
845     FloatParts pa = float16_unpack_canonical(a, status);
846     FloatParts pb = float16_unpack_canonical(b, status);
847     FloatParts pr = mul_floats(pa, pb, status);
848 
849     return float16_round_pack_canonical(pr, status);
850 }
851 
852 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
853                                              float_status *status)
854 {
855     FloatParts pa = float32_unpack_canonical(a, status);
856     FloatParts pb = float32_unpack_canonical(b, status);
857     FloatParts pr = mul_floats(pa, pb, status);
858 
859     return float32_round_pack_canonical(pr, status);
860 }
861 
862 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
863                                              float_status *status)
864 {
865     FloatParts pa = float64_unpack_canonical(a, status);
866     FloatParts pb = float64_unpack_canonical(b, status);
867     FloatParts pr = mul_floats(pa, pb, status);
868 
869     return float64_round_pack_canonical(pr, status);
870 }
871 
872 /*
873  * Returns the result of multiplying the floating-point values `a' and
874  * `b' then adding 'c', with no intermediate rounding step after the
875  * multiplication. The operation is performed according to the
876  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
877  * The flags argument allows the caller to select negation of the
878  * addend, the intermediate product, or the final result. (The
879  * difference between this and having the caller do a separate
880  * negation is that negating externally will flip the sign bit on
881  * NaNs.)
882  */
883 
884 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
885                                 int flags, float_status *s)
886 {
887     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
888                     ((1 << float_class_inf) | (1 << float_class_zero));
889     bool p_sign;
890     bool sign_flip = flags & float_muladd_negate_result;
891     FloatClass p_class;
892     uint64_t hi, lo;
893     int p_exp;
894 
895     /* It is implementation-defined whether the cases of (0,inf,qnan)
896      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
897      * they return if they do), so we have to hand this information
898      * off to the target-specific pick-a-NaN routine.
899      */
900     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
901         return pick_nan_muladd(a, b, c, inf_zero, s);
902     }
903 
904     if (inf_zero) {
905         s->float_exception_flags |= float_flag_invalid;
906         a.cls = float_class_dnan;
907         return a;
908     }
909 
910     if (flags & float_muladd_negate_c) {
911         c.sign ^= 1;
912     }
913 
914     p_sign = a.sign ^ b.sign;
915 
916     if (flags & float_muladd_negate_product) {
917         p_sign ^= 1;
918     }
919 
920     if (a.cls == float_class_inf || b.cls == float_class_inf) {
921         p_class = float_class_inf;
922     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
923         p_class = float_class_zero;
924     } else {
925         p_class = float_class_normal;
926     }
927 
928     if (c.cls == float_class_inf) {
929         if (p_class == float_class_inf && p_sign != c.sign) {
930             s->float_exception_flags |= float_flag_invalid;
931             a.cls = float_class_dnan;
932         } else {
933             a.cls = float_class_inf;
934             a.sign = c.sign ^ sign_flip;
935         }
936         return a;
937     }
938 
939     if (p_class == float_class_inf) {
940         a.cls = float_class_inf;
941         a.sign = p_sign ^ sign_flip;
942         return a;
943     }
944 
945     if (p_class == float_class_zero) {
946         if (c.cls == float_class_zero) {
947             if (p_sign != c.sign) {
948                 p_sign = s->float_rounding_mode == float_round_down;
949             }
950             c.sign = p_sign;
951         } else if (flags & float_muladd_halve_result) {
952             c.exp -= 1;
953         }
954         c.sign ^= sign_flip;
955         return c;
956     }
957 
958     /* a & b should be normals now... */
959     assert(a.cls == float_class_normal &&
960            b.cls == float_class_normal);
961 
962     p_exp = a.exp + b.exp;
963 
964     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
965      * result.
966      */
967     mul64To128(a.frac, b.frac, &hi, &lo);
968     /* binary point now at bit 124 */
969 
970     /* check for overflow */
971     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
972         shift128RightJamming(hi, lo, 1, &hi, &lo);
973         p_exp += 1;
974     }
975 
976     /* + add/sub */
977     if (c.cls == float_class_zero) {
978         /* move binary point back to 62 */
979         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
980     } else {
981         int exp_diff = p_exp - c.exp;
982         if (p_sign == c.sign) {
983             /* Addition */
984             if (exp_diff <= 0) {
985                 shift128RightJamming(hi, lo,
986                                      DECOMPOSED_BINARY_POINT - exp_diff,
987                                      &hi, &lo);
988                 lo += c.frac;
989                 p_exp = c.exp;
990             } else {
991                 uint64_t c_hi, c_lo;
992                 /* shift c to the same binary point as the product (124) */
993                 c_hi = c.frac >> 2;
994                 c_lo = 0;
995                 shift128RightJamming(c_hi, c_lo,
996                                      exp_diff,
997                                      &c_hi, &c_lo);
998                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
999                 /* move binary point back to 62 */
1000                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1001             }
1002 
1003             if (lo & DECOMPOSED_OVERFLOW_BIT) {
1004                 shift64RightJamming(lo, 1, &lo);
1005                 p_exp += 1;
1006             }
1007 
1008         } else {
1009             /* Subtraction */
1010             uint64_t c_hi, c_lo;
1011             /* make C binary point match product at bit 124 */
1012             c_hi = c.frac >> 2;
1013             c_lo = 0;
1014 
1015             if (exp_diff <= 0) {
1016                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1017                 if (exp_diff == 0
1018                     &&
1019                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1020                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1021                 } else {
1022                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1023                     p_sign ^= 1;
1024                     p_exp = c.exp;
1025                 }
1026             } else {
1027                 shift128RightJamming(c_hi, c_lo,
1028                                      exp_diff,
1029                                      &c_hi, &c_lo);
1030                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1031             }
1032 
1033             if (hi == 0 && lo == 0) {
1034                 a.cls = float_class_zero;
1035                 a.sign = s->float_rounding_mode == float_round_down;
1036                 a.sign ^= sign_flip;
1037                 return a;
1038             } else {
1039                 int shift;
1040                 if (hi != 0) {
1041                     shift = clz64(hi);
1042                 } else {
1043                     shift = clz64(lo) + 64;
1044                 }
1045                 /* Normalizing to a binary point of 124 is the
1046                    correct adjust for the exponent.  However since we're
1047                    shifting, we might as well put the binary point back
1048                    at 62 where we really want it.  Therefore shift as
1049                    if we're leaving 1 bit at the top of the word, but
1050                    adjust the exponent as if we're leaving 3 bits.  */
1051                 shift -= 1;
1052                 if (shift >= 64) {
1053                     lo = lo << (shift - 64);
1054                 } else {
1055                     hi = (hi << shift) | (lo >> (64 - shift));
1056                     lo = hi | ((lo << shift) != 0);
1057                 }
1058                 p_exp -= shift - 2;
1059             }
1060         }
1061     }
1062 
1063     if (flags & float_muladd_halve_result) {
1064         p_exp -= 1;
1065     }
1066 
1067     /* finally prepare our result */
1068     a.cls = float_class_normal;
1069     a.sign = p_sign ^ sign_flip;
1070     a.exp = p_exp;
1071     a.frac = lo;
1072 
1073     return a;
1074 }
1075 
1076 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1077                                                 int flags, float_status *status)
1078 {
1079     FloatParts pa = float16_unpack_canonical(a, status);
1080     FloatParts pb = float16_unpack_canonical(b, status);
1081     FloatParts pc = float16_unpack_canonical(c, status);
1082     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1083 
1084     return float16_round_pack_canonical(pr, status);
1085 }
1086 
1087 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1088                                                 int flags, float_status *status)
1089 {
1090     FloatParts pa = float32_unpack_canonical(a, status);
1091     FloatParts pb = float32_unpack_canonical(b, status);
1092     FloatParts pc = float32_unpack_canonical(c, status);
1093     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1094 
1095     return float32_round_pack_canonical(pr, status);
1096 }
1097 
1098 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1099                                                 int flags, float_status *status)
1100 {
1101     FloatParts pa = float64_unpack_canonical(a, status);
1102     FloatParts pb = float64_unpack_canonical(b, status);
1103     FloatParts pc = float64_unpack_canonical(c, status);
1104     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1105 
1106     return float64_round_pack_canonical(pr, status);
1107 }
1108 
1109 /*
1110  * Returns the result of dividing the floating-point value `a' by the
1111  * corresponding value `b'. The operation is performed according to
1112  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1113  */
1114 
1115 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1116 {
1117     bool sign = a.sign ^ b.sign;
1118 
1119     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1120         uint64_t temp_lo, temp_hi;
1121         int exp = a.exp - b.exp;
1122         if (a.frac < b.frac) {
1123             exp -= 1;
1124             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1125                               &temp_hi, &temp_lo);
1126         } else {
1127             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1128                               &temp_hi, &temp_lo);
1129         }
1130         /* LSB of quot is set if inexact which roundandpack will use
1131          * to set flags. Yet again we re-use a for the result */
1132         a.frac = div128To64(temp_lo, temp_hi, b.frac);
1133         a.sign = sign;
1134         a.exp = exp;
1135         return a;
1136     }
1137     /* handle all the NaN cases */
1138     if (is_nan(a.cls) || is_nan(b.cls)) {
1139         return pick_nan(a, b, s);
1140     }
1141     /* 0/0 or Inf/Inf */
1142     if (a.cls == b.cls
1143         &&
1144         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1145         s->float_exception_flags |= float_flag_invalid;
1146         a.cls = float_class_dnan;
1147         return a;
1148     }
1149     /* Div 0 => Inf */
1150     if (b.cls == float_class_zero) {
1151         s->float_exception_flags |= float_flag_divbyzero;
1152         a.cls = float_class_inf;
1153         a.sign = sign;
1154         return a;
1155     }
1156     /* Inf / x or 0 / x */
1157     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1158         a.sign = sign;
1159         return a;
1160     }
1161     /* Div by Inf */
1162     if (b.cls == float_class_inf) {
1163         a.cls = float_class_zero;
1164         a.sign = sign;
1165         return a;
1166     }
1167     g_assert_not_reached();
1168 }
1169 
1170 float16 float16_div(float16 a, float16 b, float_status *status)
1171 {
1172     FloatParts pa = float16_unpack_canonical(a, status);
1173     FloatParts pb = float16_unpack_canonical(b, status);
1174     FloatParts pr = div_floats(pa, pb, status);
1175 
1176     return float16_round_pack_canonical(pr, status);
1177 }
1178 
1179 float32 float32_div(float32 a, float32 b, float_status *status)
1180 {
1181     FloatParts pa = float32_unpack_canonical(a, status);
1182     FloatParts pb = float32_unpack_canonical(b, status);
1183     FloatParts pr = div_floats(pa, pb, status);
1184 
1185     return float32_round_pack_canonical(pr, status);
1186 }
1187 
1188 float64 float64_div(float64 a, float64 b, float_status *status)
1189 {
1190     FloatParts pa = float64_unpack_canonical(a, status);
1191     FloatParts pb = float64_unpack_canonical(b, status);
1192     FloatParts pr = div_floats(pa, pb, status);
1193 
1194     return float64_round_pack_canonical(pr, status);
1195 }
1196 
1197 /*
1198  * Rounds the floating-point value `a' to an integer, and returns the
1199  * result as a floating-point value. The operation is performed
1200  * according to the IEC/IEEE Standard for Binary Floating-Point
1201  * Arithmetic.
1202  */
1203 
1204 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1205 {
1206     if (is_nan(a.cls)) {
1207         return return_nan(a, s);
1208     }
1209 
1210     switch (a.cls) {
1211     case float_class_zero:
1212     case float_class_inf:
1213     case float_class_qnan:
1214         /* already "integral" */
1215         break;
1216     case float_class_normal:
1217         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1218             /* already integral */
1219             break;
1220         }
1221         if (a.exp < 0) {
1222             bool one;
1223             /* all fractional */
1224             s->float_exception_flags |= float_flag_inexact;
1225             switch (rounding_mode) {
1226             case float_round_nearest_even:
1227                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1228                 break;
1229             case float_round_ties_away:
1230                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1231                 break;
1232             case float_round_to_zero:
1233                 one = false;
1234                 break;
1235             case float_round_up:
1236                 one = !a.sign;
1237                 break;
1238             case float_round_down:
1239                 one = a.sign;
1240                 break;
1241             default:
1242                 g_assert_not_reached();
1243             }
1244 
1245             if (one) {
1246                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1247                 a.exp = 0;
1248             } else {
1249                 a.cls = float_class_zero;
1250             }
1251         } else {
1252             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1253             uint64_t frac_lsbm1 = frac_lsb >> 1;
1254             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1255             uint64_t rnd_mask = rnd_even_mask >> 1;
1256             uint64_t inc;
1257 
1258             switch (rounding_mode) {
1259             case float_round_nearest_even:
1260                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1261                 break;
1262             case float_round_ties_away:
1263                 inc = frac_lsbm1;
1264                 break;
1265             case float_round_to_zero:
1266                 inc = 0;
1267                 break;
1268             case float_round_up:
1269                 inc = a.sign ? 0 : rnd_mask;
1270                 break;
1271             case float_round_down:
1272                 inc = a.sign ? rnd_mask : 0;
1273                 break;
1274             default:
1275                 g_assert_not_reached();
1276             }
1277 
1278             if (a.frac & rnd_mask) {
1279                 s->float_exception_flags |= float_flag_inexact;
1280                 a.frac += inc;
1281                 a.frac &= ~rnd_mask;
1282                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1283                     a.frac >>= 1;
1284                     a.exp++;
1285                 }
1286             }
1287         }
1288         break;
1289     default:
1290         g_assert_not_reached();
1291     }
1292     return a;
1293 }
1294 
1295 float16 float16_round_to_int(float16 a, float_status *s)
1296 {
1297     FloatParts pa = float16_unpack_canonical(a, s);
1298     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1299     return float16_round_pack_canonical(pr, s);
1300 }
1301 
1302 float32 float32_round_to_int(float32 a, float_status *s)
1303 {
1304     FloatParts pa = float32_unpack_canonical(a, s);
1305     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1306     return float32_round_pack_canonical(pr, s);
1307 }
1308 
1309 float64 float64_round_to_int(float64 a, float_status *s)
1310 {
1311     FloatParts pa = float64_unpack_canonical(a, s);
1312     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1313     return float64_round_pack_canonical(pr, s);
1314 }
1315 
1316 float64 float64_trunc_to_int(float64 a, float_status *s)
1317 {
1318     FloatParts pa = float64_unpack_canonical(a, s);
1319     FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1320     return float64_round_pack_canonical(pr, s);
1321 }
1322 
1323 /*
1324  * Returns the result of converting the floating-point value `a' to
1325  * the two's complement integer format. The conversion is performed
1326  * according to the IEC/IEEE Standard for Binary Floating-Point
1327  * Arithmetic---which means in particular that the conversion is
1328  * rounded according to the current rounding mode. If `a' is a NaN,
1329  * the largest positive integer is returned. Otherwise, if the
1330  * conversion overflows, the largest integer with the same sign as `a'
1331  * is returned.
1332 */
1333 
1334 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1335                                      int64_t min, int64_t max,
1336                                      float_status *s)
1337 {
1338     uint64_t r;
1339     int orig_flags = get_float_exception_flags(s);
1340     FloatParts p = round_to_int(in, rmode, s);
1341 
1342     switch (p.cls) {
1343     case float_class_snan:
1344     case float_class_qnan:
1345         return max;
1346     case float_class_inf:
1347         return p.sign ? min : max;
1348     case float_class_zero:
1349         return 0;
1350     case float_class_normal:
1351         if (p.exp < DECOMPOSED_BINARY_POINT) {
1352             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1353         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1354             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1355         } else {
1356             r = UINT64_MAX;
1357         }
1358         if (p.sign) {
1359             if (r < -(uint64_t) min) {
1360                 return -r;
1361             } else {
1362                 s->float_exception_flags = orig_flags | float_flag_invalid;
1363                 return min;
1364             }
1365         } else {
1366             if (r < max) {
1367                 return r;
1368             } else {
1369                 s->float_exception_flags = orig_flags | float_flag_invalid;
1370                 return max;
1371             }
1372         }
1373     default:
1374         g_assert_not_reached();
1375     }
1376 }
1377 
1378 #define FLOAT_TO_INT(fsz, isz)                                          \
1379 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a,         \
1380                                                 float_status *s)        \
1381 {                                                                       \
1382     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1383     return round_to_int_and_pack(p, s->float_rounding_mode,             \
1384                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1385                                  s);                                    \
1386 }                                                                       \
1387                                                                         \
1388 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero       \
1389  (float ## fsz a, float_status *s)                                      \
1390 {                                                                       \
1391     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1392     return round_to_int_and_pack(p, float_round_to_zero,                \
1393                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1394                                  s);                                    \
1395 }
1396 
1397 FLOAT_TO_INT(16, 16)
1398 FLOAT_TO_INT(16, 32)
1399 FLOAT_TO_INT(16, 64)
1400 
1401 FLOAT_TO_INT(32, 16)
1402 FLOAT_TO_INT(32, 32)
1403 FLOAT_TO_INT(32, 64)
1404 
1405 FLOAT_TO_INT(64, 16)
1406 FLOAT_TO_INT(64, 32)
1407 FLOAT_TO_INT(64, 64)
1408 
1409 #undef FLOAT_TO_INT
1410 
1411 /*
1412  *  Returns the result of converting the floating-point value `a' to
1413  *  the unsigned integer format. The conversion is performed according
1414  *  to the IEC/IEEE Standard for Binary Floating-Point
1415  *  Arithmetic---which means in particular that the conversion is
1416  *  rounded according to the current rounding mode. If `a' is a NaN,
1417  *  the largest unsigned integer is returned. Otherwise, if the
1418  *  conversion overflows, the largest unsigned integer is returned. If
1419  *  the 'a' is negative, the result is rounded and zero is returned;
1420  *  values that do not round to zero will raise the inexact exception
1421  *  flag.
1422  */
1423 
1424 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1425                                        float_status *s)
1426 {
1427     int orig_flags = get_float_exception_flags(s);
1428     FloatParts p = round_to_int(in, rmode, s);
1429 
1430     switch (p.cls) {
1431     case float_class_snan:
1432     case float_class_qnan:
1433         s->float_exception_flags = orig_flags | float_flag_invalid;
1434         return max;
1435     case float_class_inf:
1436         return p.sign ? 0 : max;
1437     case float_class_zero:
1438         return 0;
1439     case float_class_normal:
1440     {
1441         uint64_t r;
1442         if (p.sign) {
1443             s->float_exception_flags = orig_flags | float_flag_invalid;
1444             return 0;
1445         }
1446 
1447         if (p.exp < DECOMPOSED_BINARY_POINT) {
1448             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1449         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1450             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1451         } else {
1452             s->float_exception_flags = orig_flags | float_flag_invalid;
1453             return max;
1454         }
1455 
1456         /* For uint64 this will never trip, but if p.exp is too large
1457          * to shift a decomposed fraction we shall have exited via the
1458          * 3rd leg above.
1459          */
1460         if (r > max) {
1461             s->float_exception_flags = orig_flags | float_flag_invalid;
1462             return max;
1463         } else {
1464             return r;
1465         }
1466     }
1467     default:
1468         g_assert_not_reached();
1469     }
1470 }
1471 
1472 #define FLOAT_TO_UINT(fsz, isz) \
1473 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a,       \
1474                                                   float_status *s)      \
1475 {                                                                       \
1476     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1477     return round_to_uint_and_pack(p, s->float_rounding_mode,            \
1478                                  UINT ## isz ## _MAX, s);               \
1479 }                                                                       \
1480                                                                         \
1481 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero     \
1482  (float ## fsz a, float_status *s)                                      \
1483 {                                                                       \
1484     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1485     return round_to_uint_and_pack(p, s->float_rounding_mode,            \
1486                                  UINT ## isz ## _MAX, s);               \
1487 }
1488 
1489 FLOAT_TO_UINT(16, 16)
1490 FLOAT_TO_UINT(16, 32)
1491 FLOAT_TO_UINT(16, 64)
1492 
1493 FLOAT_TO_UINT(32, 16)
1494 FLOAT_TO_UINT(32, 32)
1495 FLOAT_TO_UINT(32, 64)
1496 
1497 FLOAT_TO_UINT(64, 16)
1498 FLOAT_TO_UINT(64, 32)
1499 FLOAT_TO_UINT(64, 64)
1500 
1501 #undef FLOAT_TO_UINT
1502 
1503 /*
1504  * Integer to float conversions
1505  *
1506  * Returns the result of converting the two's complement integer `a'
1507  * to the floating-point format. The conversion is performed according
1508  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1509  */
1510 
1511 static FloatParts int_to_float(int64_t a, float_status *status)
1512 {
1513     FloatParts r;
1514     if (a == 0) {
1515         r.cls = float_class_zero;
1516         r.sign = false;
1517     } else if (a == (1ULL << 63)) {
1518         r.cls = float_class_normal;
1519         r.sign = true;
1520         r.frac = DECOMPOSED_IMPLICIT_BIT;
1521         r.exp = 63;
1522     } else {
1523         uint64_t f;
1524         if (a < 0) {
1525             f = -a;
1526             r.sign = true;
1527         } else {
1528             f = a;
1529             r.sign = false;
1530         }
1531         int shift = clz64(f) - 1;
1532         r.cls = float_class_normal;
1533         r.exp = (DECOMPOSED_BINARY_POINT - shift);
1534         r.frac = f << shift;
1535     }
1536 
1537     return r;
1538 }
1539 
1540 float16 int64_to_float16(int64_t a, float_status *status)
1541 {
1542     FloatParts pa = int_to_float(a, status);
1543     return float16_round_pack_canonical(pa, status);
1544 }
1545 
1546 float16 int32_to_float16(int32_t a, float_status *status)
1547 {
1548     return int64_to_float16(a, status);
1549 }
1550 
1551 float16 int16_to_float16(int16_t a, float_status *status)
1552 {
1553     return int64_to_float16(a, status);
1554 }
1555 
1556 float32 int64_to_float32(int64_t a, float_status *status)
1557 {
1558     FloatParts pa = int_to_float(a, status);
1559     return float32_round_pack_canonical(pa, status);
1560 }
1561 
1562 float32 int32_to_float32(int32_t a, float_status *status)
1563 {
1564     return int64_to_float32(a, status);
1565 }
1566 
1567 float32 int16_to_float32(int16_t a, float_status *status)
1568 {
1569     return int64_to_float32(a, status);
1570 }
1571 
1572 float64 int64_to_float64(int64_t a, float_status *status)
1573 {
1574     FloatParts pa = int_to_float(a, status);
1575     return float64_round_pack_canonical(pa, status);
1576 }
1577 
1578 float64 int32_to_float64(int32_t a, float_status *status)
1579 {
1580     return int64_to_float64(a, status);
1581 }
1582 
1583 float64 int16_to_float64(int16_t a, float_status *status)
1584 {
1585     return int64_to_float64(a, status);
1586 }
1587 
1588 
1589 /*
1590  * Unsigned Integer to float conversions
1591  *
1592  * Returns the result of converting the unsigned integer `a' to the
1593  * floating-point format. The conversion is performed according to the
1594  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1595  */
1596 
1597 static FloatParts uint_to_float(uint64_t a, float_status *status)
1598 {
1599     FloatParts r = { .sign = false};
1600 
1601     if (a == 0) {
1602         r.cls = float_class_zero;
1603     } else {
1604         int spare_bits = clz64(a) - 1;
1605         r.cls = float_class_normal;
1606         r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1607         if (spare_bits < 0) {
1608             shift64RightJamming(a, -spare_bits, &a);
1609             r.frac = a;
1610         } else {
1611             r.frac = a << spare_bits;
1612         }
1613     }
1614 
1615     return r;
1616 }
1617 
1618 float16 uint64_to_float16(uint64_t a, float_status *status)
1619 {
1620     FloatParts pa = uint_to_float(a, status);
1621     return float16_round_pack_canonical(pa, status);
1622 }
1623 
1624 float16 uint32_to_float16(uint32_t a, float_status *status)
1625 {
1626     return uint64_to_float16(a, status);
1627 }
1628 
1629 float16 uint16_to_float16(uint16_t a, float_status *status)
1630 {
1631     return uint64_to_float16(a, status);
1632 }
1633 
1634 float32 uint64_to_float32(uint64_t a, float_status *status)
1635 {
1636     FloatParts pa = uint_to_float(a, status);
1637     return float32_round_pack_canonical(pa, status);
1638 }
1639 
1640 float32 uint32_to_float32(uint32_t a, float_status *status)
1641 {
1642     return uint64_to_float32(a, status);
1643 }
1644 
1645 float32 uint16_to_float32(uint16_t a, float_status *status)
1646 {
1647     return uint64_to_float32(a, status);
1648 }
1649 
1650 float64 uint64_to_float64(uint64_t a, float_status *status)
1651 {
1652     FloatParts pa = uint_to_float(a, status);
1653     return float64_round_pack_canonical(pa, status);
1654 }
1655 
1656 float64 uint32_to_float64(uint32_t a, float_status *status)
1657 {
1658     return uint64_to_float64(a, status);
1659 }
1660 
1661 float64 uint16_to_float64(uint16_t a, float_status *status)
1662 {
1663     return uint64_to_float64(a, status);
1664 }
1665 
1666 /* Multiply A by 2 raised to the power N.  */
1667 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1668 {
1669     if (unlikely(is_nan(a.cls))) {
1670         return return_nan(a, s);
1671     }
1672     if (a.cls == float_class_normal) {
1673         a.exp += n;
1674     }
1675     return a;
1676 }
1677 
1678 float16 float16_scalbn(float16 a, int n, float_status *status)
1679 {
1680     FloatParts pa = float16_unpack_canonical(a, status);
1681     FloatParts pr = scalbn_decomposed(pa, n, status);
1682     return float16_round_pack_canonical(pr, status);
1683 }
1684 
1685 float32 float32_scalbn(float32 a, int n, float_status *status)
1686 {
1687     FloatParts pa = float32_unpack_canonical(a, status);
1688     FloatParts pr = scalbn_decomposed(pa, n, status);
1689     return float32_round_pack_canonical(pr, status);
1690 }
1691 
1692 float64 float64_scalbn(float64 a, int n, float_status *status)
1693 {
1694     FloatParts pa = float64_unpack_canonical(a, status);
1695     FloatParts pr = scalbn_decomposed(pa, n, status);
1696     return float64_round_pack_canonical(pr, status);
1697 }
1698 
1699 /*----------------------------------------------------------------------------
1700 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
1701 | and 7, and returns the properly rounded 32-bit integer corresponding to the
1702 | input.  If `zSign' is 1, the input is negated before being converted to an
1703 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
1704 | is simply rounded to an integer, with the inexact exception raised if the
1705 | input cannot be represented exactly as an integer.  However, if the fixed-
1706 | point input is too large, the invalid exception is raised and the largest
1707 | positive or negative integer is returned.
1708 *----------------------------------------------------------------------------*/
1709 
1710 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
1711 {
1712     int8_t roundingMode;
1713     flag roundNearestEven;
1714     int8_t roundIncrement, roundBits;
1715     int32_t z;
1716 
1717     roundingMode = status->float_rounding_mode;
1718     roundNearestEven = ( roundingMode == float_round_nearest_even );
1719     switch (roundingMode) {
1720     case float_round_nearest_even:
1721     case float_round_ties_away:
1722         roundIncrement = 0x40;
1723         break;
1724     case float_round_to_zero:
1725         roundIncrement = 0;
1726         break;
1727     case float_round_up:
1728         roundIncrement = zSign ? 0 : 0x7f;
1729         break;
1730     case float_round_down:
1731         roundIncrement = zSign ? 0x7f : 0;
1732         break;
1733     default:
1734         abort();
1735     }
1736     roundBits = absZ & 0x7F;
1737     absZ = ( absZ + roundIncrement )>>7;
1738     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
1739     z = absZ;
1740     if ( zSign ) z = - z;
1741     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
1742         float_raise(float_flag_invalid, status);
1743         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
1744     }
1745     if (roundBits) {
1746         status->float_exception_flags |= float_flag_inexact;
1747     }
1748     return z;
1749 
1750 }
1751 
1752 /*----------------------------------------------------------------------------
1753 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
1754 | `absZ1', with binary point between bits 63 and 64 (between the input words),
1755 | and returns the properly rounded 64-bit integer corresponding to the input.
1756 | If `zSign' is 1, the input is negated before being converted to an integer.
1757 | Ordinarily, the fixed-point input is simply rounded to an integer, with
1758 | the inexact exception raised if the input cannot be represented exactly as
1759 | an integer.  However, if the fixed-point input is too large, the invalid
1760 | exception is raised and the largest positive or negative integer is
1761 | returned.
1762 *----------------------------------------------------------------------------*/
1763 
1764 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
1765                                float_status *status)
1766 {
1767     int8_t roundingMode;
1768     flag roundNearestEven, increment;
1769     int64_t z;
1770 
1771     roundingMode = status->float_rounding_mode;
1772     roundNearestEven = ( roundingMode == float_round_nearest_even );
1773     switch (roundingMode) {
1774     case float_round_nearest_even:
1775     case float_round_ties_away:
1776         increment = ((int64_t) absZ1 < 0);
1777         break;
1778     case float_round_to_zero:
1779         increment = 0;
1780         break;
1781     case float_round_up:
1782         increment = !zSign && absZ1;
1783         break;
1784     case float_round_down:
1785         increment = zSign && absZ1;
1786         break;
1787     default:
1788         abort();
1789     }
1790     if ( increment ) {
1791         ++absZ0;
1792         if ( absZ0 == 0 ) goto overflow;
1793         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
1794     }
1795     z = absZ0;
1796     if ( zSign ) z = - z;
1797     if ( z && ( ( z < 0 ) ^ zSign ) ) {
1798  overflow:
1799         float_raise(float_flag_invalid, status);
1800         return
1801               zSign ? (int64_t) LIT64( 0x8000000000000000 )
1802             : LIT64( 0x7FFFFFFFFFFFFFFF );
1803     }
1804     if (absZ1) {
1805         status->float_exception_flags |= float_flag_inexact;
1806     }
1807     return z;
1808 
1809 }
1810 
1811 /*----------------------------------------------------------------------------
1812 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
1813 | `absZ1', with binary point between bits 63 and 64 (between the input words),
1814 | and returns the properly rounded 64-bit unsigned integer corresponding to the
1815 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
1816 | with the inexact exception raised if the input cannot be represented exactly
1817 | as an integer.  However, if the fixed-point input is too large, the invalid
1818 | exception is raised and the largest unsigned integer is returned.
1819 *----------------------------------------------------------------------------*/
1820 
1821 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
1822                                 uint64_t absZ1, float_status *status)
1823 {
1824     int8_t roundingMode;
1825     flag roundNearestEven, increment;
1826 
1827     roundingMode = status->float_rounding_mode;
1828     roundNearestEven = (roundingMode == float_round_nearest_even);
1829     switch (roundingMode) {
1830     case float_round_nearest_even:
1831     case float_round_ties_away:
1832         increment = ((int64_t)absZ1 < 0);
1833         break;
1834     case float_round_to_zero:
1835         increment = 0;
1836         break;
1837     case float_round_up:
1838         increment = !zSign && absZ1;
1839         break;
1840     case float_round_down:
1841         increment = zSign && absZ1;
1842         break;
1843     default:
1844         abort();
1845     }
1846     if (increment) {
1847         ++absZ0;
1848         if (absZ0 == 0) {
1849             float_raise(float_flag_invalid, status);
1850             return LIT64(0xFFFFFFFFFFFFFFFF);
1851         }
1852         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
1853     }
1854 
1855     if (zSign && absZ0) {
1856         float_raise(float_flag_invalid, status);
1857         return 0;
1858     }
1859 
1860     if (absZ1) {
1861         status->float_exception_flags |= float_flag_inexact;
1862     }
1863     return absZ0;
1864 }
1865 
1866 /*----------------------------------------------------------------------------
1867 | If `a' is denormal and we are in flush-to-zero mode then set the
1868 | input-denormal exception and return zero. Otherwise just return the value.
1869 *----------------------------------------------------------------------------*/
1870 float32 float32_squash_input_denormal(float32 a, float_status *status)
1871 {
1872     if (status->flush_inputs_to_zero) {
1873         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
1874             float_raise(float_flag_input_denormal, status);
1875             return make_float32(float32_val(a) & 0x80000000);
1876         }
1877     }
1878     return a;
1879 }
1880 
1881 /*----------------------------------------------------------------------------
1882 | Normalizes the subnormal single-precision floating-point value represented
1883 | by the denormalized significand `aSig'.  The normalized exponent and
1884 | significand are stored at the locations pointed to by `zExpPtr' and
1885 | `zSigPtr', respectively.
1886 *----------------------------------------------------------------------------*/
1887 
1888 static void
1889  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
1890 {
1891     int8_t shiftCount;
1892 
1893     shiftCount = countLeadingZeros32( aSig ) - 8;
1894     *zSigPtr = aSig<<shiftCount;
1895     *zExpPtr = 1 - shiftCount;
1896 
1897 }
1898 
1899 /*----------------------------------------------------------------------------
1900 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
1901 | single-precision floating-point value, returning the result.  After being
1902 | shifted into the proper positions, the three fields are simply added
1903 | together to form the result.  This means that any integer portion of `zSig'
1904 | will be added into the exponent.  Since a properly normalized significand
1905 | will have an integer portion equal to 1, the `zExp' input should be 1 less
1906 | than the desired result exponent whenever `zSig' is a complete, normalized
1907 | significand.
1908 *----------------------------------------------------------------------------*/
1909 
1910 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
1911 {
1912 
1913     return make_float32(
1914           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
1915 
1916 }
1917 
1918 /*----------------------------------------------------------------------------
1919 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1920 | and significand `zSig', and returns the proper single-precision floating-
1921 | point value corresponding to the abstract input.  Ordinarily, the abstract
1922 | value is simply rounded and packed into the single-precision format, with
1923 | the inexact exception raised if the abstract input cannot be represented
1924 | exactly.  However, if the abstract value is too large, the overflow and
1925 | inexact exceptions are raised and an infinity or maximal finite value is
1926 | returned.  If the abstract value is too small, the input value is rounded to
1927 | a subnormal number, and the underflow and inexact exceptions are raised if
1928 | the abstract input cannot be represented exactly as a subnormal single-
1929 | precision floating-point number.
1930 |     The input significand `zSig' has its binary point between bits 30
1931 | and 29, which is 7 bits to the left of the usual location.  This shifted
1932 | significand must be normalized or smaller.  If `zSig' is not normalized,
1933 | `zExp' must be 0; in that case, the result returned is a subnormal number,
1934 | and it must not require rounding.  In the usual case that `zSig' is
1935 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
1936 | The handling of underflow and overflow follows the IEC/IEEE Standard for
1937 | Binary Floating-Point Arithmetic.
1938 *----------------------------------------------------------------------------*/
1939 
1940 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
1941                                    float_status *status)
1942 {
1943     int8_t roundingMode;
1944     flag roundNearestEven;
1945     int8_t roundIncrement, roundBits;
1946     flag isTiny;
1947 
1948     roundingMode = status->float_rounding_mode;
1949     roundNearestEven = ( roundingMode == float_round_nearest_even );
1950     switch (roundingMode) {
1951     case float_round_nearest_even:
1952     case float_round_ties_away:
1953         roundIncrement = 0x40;
1954         break;
1955     case float_round_to_zero:
1956         roundIncrement = 0;
1957         break;
1958     case float_round_up:
1959         roundIncrement = zSign ? 0 : 0x7f;
1960         break;
1961     case float_round_down:
1962         roundIncrement = zSign ? 0x7f : 0;
1963         break;
1964     default:
1965         abort();
1966         break;
1967     }
1968     roundBits = zSig & 0x7F;
1969     if ( 0xFD <= (uint16_t) zExp ) {
1970         if (    ( 0xFD < zExp )
1971              || (    ( zExp == 0xFD )
1972                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
1973            ) {
1974             float_raise(float_flag_overflow | float_flag_inexact, status);
1975             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
1976         }
1977         if ( zExp < 0 ) {
1978             if (status->flush_to_zero) {
1979                 float_raise(float_flag_output_denormal, status);
1980                 return packFloat32(zSign, 0, 0);
1981             }
1982             isTiny =
1983                 (status->float_detect_tininess
1984                  == float_tininess_before_rounding)
1985                 || ( zExp < -1 )
1986                 || ( zSig + roundIncrement < 0x80000000 );
1987             shift32RightJamming( zSig, - zExp, &zSig );
1988             zExp = 0;
1989             roundBits = zSig & 0x7F;
1990             if (isTiny && roundBits) {
1991                 float_raise(float_flag_underflow, status);
1992             }
1993         }
1994     }
1995     if (roundBits) {
1996         status->float_exception_flags |= float_flag_inexact;
1997     }
1998     zSig = ( zSig + roundIncrement )>>7;
1999     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2000     if ( zSig == 0 ) zExp = 0;
2001     return packFloat32( zSign, zExp, zSig );
2002 
2003 }
2004 
2005 /*----------------------------------------------------------------------------
2006 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2007 | and significand `zSig', and returns the proper single-precision floating-
2008 | point value corresponding to the abstract input.  This routine is just like
2009 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2010 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2011 | floating-point exponent.
2012 *----------------------------------------------------------------------------*/
2013 
2014 static float32
2015  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2016                               float_status *status)
2017 {
2018     int8_t shiftCount;
2019 
2020     shiftCount = countLeadingZeros32( zSig ) - 1;
2021     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2022                                status);
2023 
2024 }
2025 
2026 /*----------------------------------------------------------------------------
2027 | If `a' is denormal and we are in flush-to-zero mode then set the
2028 | input-denormal exception and return zero. Otherwise just return the value.
2029 *----------------------------------------------------------------------------*/
2030 float64 float64_squash_input_denormal(float64 a, float_status *status)
2031 {
2032     if (status->flush_inputs_to_zero) {
2033         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2034             float_raise(float_flag_input_denormal, status);
2035             return make_float64(float64_val(a) & (1ULL << 63));
2036         }
2037     }
2038     return a;
2039 }
2040 
2041 /*----------------------------------------------------------------------------
2042 | Normalizes the subnormal double-precision floating-point value represented
2043 | by the denormalized significand `aSig'.  The normalized exponent and
2044 | significand are stored at the locations pointed to by `zExpPtr' and
2045 | `zSigPtr', respectively.
2046 *----------------------------------------------------------------------------*/
2047 
2048 static void
2049  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2050 {
2051     int8_t shiftCount;
2052 
2053     shiftCount = countLeadingZeros64( aSig ) - 11;
2054     *zSigPtr = aSig<<shiftCount;
2055     *zExpPtr = 1 - shiftCount;
2056 
2057 }
2058 
2059 /*----------------------------------------------------------------------------
2060 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2061 | double-precision floating-point value, returning the result.  After being
2062 | shifted into the proper positions, the three fields are simply added
2063 | together to form the result.  This means that any integer portion of `zSig'
2064 | will be added into the exponent.  Since a properly normalized significand
2065 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2066 | than the desired result exponent whenever `zSig' is a complete, normalized
2067 | significand.
2068 *----------------------------------------------------------------------------*/
2069 
2070 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2071 {
2072 
2073     return make_float64(
2074         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2075 
2076 }
2077 
2078 /*----------------------------------------------------------------------------
2079 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2080 | and significand `zSig', and returns the proper double-precision floating-
2081 | point value corresponding to the abstract input.  Ordinarily, the abstract
2082 | value is simply rounded and packed into the double-precision format, with
2083 | the inexact exception raised if the abstract input cannot be represented
2084 | exactly.  However, if the abstract value is too large, the overflow and
2085 | inexact exceptions are raised and an infinity or maximal finite value is
2086 | returned.  If the abstract value is too small, the input value is rounded to
2087 | a subnormal number, and the underflow and inexact exceptions are raised if
2088 | the abstract input cannot be represented exactly as a subnormal double-
2089 | precision floating-point number.
2090 |     The input significand `zSig' has its binary point between bits 62
2091 | and 61, which is 10 bits to the left of the usual location.  This shifted
2092 | significand must be normalized or smaller.  If `zSig' is not normalized,
2093 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2094 | and it must not require rounding.  In the usual case that `zSig' is
2095 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2096 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2097 | Binary Floating-Point Arithmetic.
2098 *----------------------------------------------------------------------------*/
2099 
2100 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2101                                    float_status *status)
2102 {
2103     int8_t roundingMode;
2104     flag roundNearestEven;
2105     int roundIncrement, roundBits;
2106     flag isTiny;
2107 
2108     roundingMode = status->float_rounding_mode;
2109     roundNearestEven = ( roundingMode == float_round_nearest_even );
2110     switch (roundingMode) {
2111     case float_round_nearest_even:
2112     case float_round_ties_away:
2113         roundIncrement = 0x200;
2114         break;
2115     case float_round_to_zero:
2116         roundIncrement = 0;
2117         break;
2118     case float_round_up:
2119         roundIncrement = zSign ? 0 : 0x3ff;
2120         break;
2121     case float_round_down:
2122         roundIncrement = zSign ? 0x3ff : 0;
2123         break;
2124     case float_round_to_odd:
2125         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2126         break;
2127     default:
2128         abort();
2129     }
2130     roundBits = zSig & 0x3FF;
2131     if ( 0x7FD <= (uint16_t) zExp ) {
2132         if (    ( 0x7FD < zExp )
2133              || (    ( zExp == 0x7FD )
2134                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2135            ) {
2136             bool overflow_to_inf = roundingMode != float_round_to_odd &&
2137                                    roundIncrement != 0;
2138             float_raise(float_flag_overflow | float_flag_inexact, status);
2139             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2140         }
2141         if ( zExp < 0 ) {
2142             if (status->flush_to_zero) {
2143                 float_raise(float_flag_output_denormal, status);
2144                 return packFloat64(zSign, 0, 0);
2145             }
2146             isTiny =
2147                    (status->float_detect_tininess
2148                     == float_tininess_before_rounding)
2149                 || ( zExp < -1 )
2150                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2151             shift64RightJamming( zSig, - zExp, &zSig );
2152             zExp = 0;
2153             roundBits = zSig & 0x3FF;
2154             if (isTiny && roundBits) {
2155                 float_raise(float_flag_underflow, status);
2156             }
2157             if (roundingMode == float_round_to_odd) {
2158                 /*
2159                  * For round-to-odd case, the roundIncrement depends on
2160                  * zSig which just changed.
2161                  */
2162                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2163             }
2164         }
2165     }
2166     if (roundBits) {
2167         status->float_exception_flags |= float_flag_inexact;
2168     }
2169     zSig = ( zSig + roundIncrement )>>10;
2170     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2171     if ( zSig == 0 ) zExp = 0;
2172     return packFloat64( zSign, zExp, zSig );
2173 
2174 }
2175 
2176 /*----------------------------------------------------------------------------
2177 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2178 | and significand `zSig', and returns the proper double-precision floating-
2179 | point value corresponding to the abstract input.  This routine is just like
2180 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2181 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2182 | floating-point exponent.
2183 *----------------------------------------------------------------------------*/
2184 
2185 static float64
2186  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2187                               float_status *status)
2188 {
2189     int8_t shiftCount;
2190 
2191     shiftCount = countLeadingZeros64( zSig ) - 1;
2192     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2193                                status);
2194 
2195 }
2196 
2197 /*----------------------------------------------------------------------------
2198 | Returns the fraction bits of the extended double-precision floating-point
2199 | value `a'.
2200 *----------------------------------------------------------------------------*/
2201 
2202 static inline uint64_t extractFloatx80Frac( floatx80 a )
2203 {
2204 
2205     return a.low;
2206 
2207 }
2208 
2209 /*----------------------------------------------------------------------------
2210 | Returns the exponent bits of the extended double-precision floating-point
2211 | value `a'.
2212 *----------------------------------------------------------------------------*/
2213 
2214 static inline int32_t extractFloatx80Exp( floatx80 a )
2215 {
2216 
2217     return a.high & 0x7FFF;
2218 
2219 }
2220 
2221 /*----------------------------------------------------------------------------
2222 | Returns the sign bit of the extended double-precision floating-point value
2223 | `a'.
2224 *----------------------------------------------------------------------------*/
2225 
2226 static inline flag extractFloatx80Sign( floatx80 a )
2227 {
2228 
2229     return a.high>>15;
2230 
2231 }
2232 
2233 /*----------------------------------------------------------------------------
2234 | Normalizes the subnormal extended double-precision floating-point value
2235 | represented by the denormalized significand `aSig'.  The normalized exponent
2236 | and significand are stored at the locations pointed to by `zExpPtr' and
2237 | `zSigPtr', respectively.
2238 *----------------------------------------------------------------------------*/
2239 
2240 static void
2241  normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr )
2242 {
2243     int8_t shiftCount;
2244 
2245     shiftCount = countLeadingZeros64( aSig );
2246     *zSigPtr = aSig<<shiftCount;
2247     *zExpPtr = 1 - shiftCount;
2248 
2249 }
2250 
2251 /*----------------------------------------------------------------------------
2252 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
2253 | extended double-precision floating-point value, returning the result.
2254 *----------------------------------------------------------------------------*/
2255 
2256 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
2257 {
2258     floatx80 z;
2259 
2260     z.low = zSig;
2261     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
2262     return z;
2263 
2264 }
2265 
2266 /*----------------------------------------------------------------------------
2267 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2268 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2269 | and returns the proper extended double-precision floating-point value
2270 | corresponding to the abstract input.  Ordinarily, the abstract value is
2271 | rounded and packed into the extended double-precision format, with the
2272 | inexact exception raised if the abstract input cannot be represented
2273 | exactly.  However, if the abstract value is too large, the overflow and
2274 | inexact exceptions are raised and an infinity or maximal finite value is
2275 | returned.  If the abstract value is too small, the input value is rounded to
2276 | a subnormal number, and the underflow and inexact exceptions are raised if
2277 | the abstract input cannot be represented exactly as a subnormal extended
2278 | double-precision floating-point number.
2279 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
2280 | number of bits as single or double precision, respectively.  Otherwise, the
2281 | result is rounded to the full precision of the extended double-precision
2282 | format.
2283 |     The input significand must be normalized or smaller.  If the input
2284 | significand is not normalized, `zExp' must be 0; in that case, the result
2285 | returned is a subnormal number, and it must not require rounding.  The
2286 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2287 | Floating-Point Arithmetic.
2288 *----------------------------------------------------------------------------*/
2289 
2290 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2291                                      int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2292                                      float_status *status)
2293 {
2294     int8_t roundingMode;
2295     flag roundNearestEven, increment, isTiny;
2296     int64_t roundIncrement, roundMask, roundBits;
2297 
2298     roundingMode = status->float_rounding_mode;
2299     roundNearestEven = ( roundingMode == float_round_nearest_even );
2300     if ( roundingPrecision == 80 ) goto precision80;
2301     if ( roundingPrecision == 64 ) {
2302         roundIncrement = LIT64( 0x0000000000000400 );
2303         roundMask = LIT64( 0x00000000000007FF );
2304     }
2305     else if ( roundingPrecision == 32 ) {
2306         roundIncrement = LIT64( 0x0000008000000000 );
2307         roundMask = LIT64( 0x000000FFFFFFFFFF );
2308     }
2309     else {
2310         goto precision80;
2311     }
2312     zSig0 |= ( zSig1 != 0 );
2313     switch (roundingMode) {
2314     case float_round_nearest_even:
2315     case float_round_ties_away:
2316         break;
2317     case float_round_to_zero:
2318         roundIncrement = 0;
2319         break;
2320     case float_round_up:
2321         roundIncrement = zSign ? 0 : roundMask;
2322         break;
2323     case float_round_down:
2324         roundIncrement = zSign ? roundMask : 0;
2325         break;
2326     default:
2327         abort();
2328     }
2329     roundBits = zSig0 & roundMask;
2330     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2331         if (    ( 0x7FFE < zExp )
2332              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2333            ) {
2334             goto overflow;
2335         }
2336         if ( zExp <= 0 ) {
2337             if (status->flush_to_zero) {
2338                 float_raise(float_flag_output_denormal, status);
2339                 return packFloatx80(zSign, 0, 0);
2340             }
2341             isTiny =
2342                    (status->float_detect_tininess
2343                     == float_tininess_before_rounding)
2344                 || ( zExp < 0 )
2345                 || ( zSig0 <= zSig0 + roundIncrement );
2346             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2347             zExp = 0;
2348             roundBits = zSig0 & roundMask;
2349             if (isTiny && roundBits) {
2350                 float_raise(float_flag_underflow, status);
2351             }
2352             if (roundBits) {
2353                 status->float_exception_flags |= float_flag_inexact;
2354             }
2355             zSig0 += roundIncrement;
2356             if ( (int64_t) zSig0 < 0 ) zExp = 1;
2357             roundIncrement = roundMask + 1;
2358             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2359                 roundMask |= roundIncrement;
2360             }
2361             zSig0 &= ~ roundMask;
2362             return packFloatx80( zSign, zExp, zSig0 );
2363         }
2364     }
2365     if (roundBits) {
2366         status->float_exception_flags |= float_flag_inexact;
2367     }
2368     zSig0 += roundIncrement;
2369     if ( zSig0 < roundIncrement ) {
2370         ++zExp;
2371         zSig0 = LIT64( 0x8000000000000000 );
2372     }
2373     roundIncrement = roundMask + 1;
2374     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2375         roundMask |= roundIncrement;
2376     }
2377     zSig0 &= ~ roundMask;
2378     if ( zSig0 == 0 ) zExp = 0;
2379     return packFloatx80( zSign, zExp, zSig0 );
2380  precision80:
2381     switch (roundingMode) {
2382     case float_round_nearest_even:
2383     case float_round_ties_away:
2384         increment = ((int64_t)zSig1 < 0);
2385         break;
2386     case float_round_to_zero:
2387         increment = 0;
2388         break;
2389     case float_round_up:
2390         increment = !zSign && zSig1;
2391         break;
2392     case float_round_down:
2393         increment = zSign && zSig1;
2394         break;
2395     default:
2396         abort();
2397     }
2398     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2399         if (    ( 0x7FFE < zExp )
2400              || (    ( zExp == 0x7FFE )
2401                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2402                   && increment
2403                 )
2404            ) {
2405             roundMask = 0;
2406  overflow:
2407             float_raise(float_flag_overflow | float_flag_inexact, status);
2408             if (    ( roundingMode == float_round_to_zero )
2409                  || ( zSign && ( roundingMode == float_round_up ) )
2410                  || ( ! zSign && ( roundingMode == float_round_down ) )
2411                ) {
2412                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2413             }
2414             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2415         }
2416         if ( zExp <= 0 ) {
2417             isTiny =
2418                    (status->float_detect_tininess
2419                     == float_tininess_before_rounding)
2420                 || ( zExp < 0 )
2421                 || ! increment
2422                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2423             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2424             zExp = 0;
2425             if (isTiny && zSig1) {
2426                 float_raise(float_flag_underflow, status);
2427             }
2428             if (zSig1) {
2429                 status->float_exception_flags |= float_flag_inexact;
2430             }
2431             switch (roundingMode) {
2432             case float_round_nearest_even:
2433             case float_round_ties_away:
2434                 increment = ((int64_t)zSig1 < 0);
2435                 break;
2436             case float_round_to_zero:
2437                 increment = 0;
2438                 break;
2439             case float_round_up:
2440                 increment = !zSign && zSig1;
2441                 break;
2442             case float_round_down:
2443                 increment = zSign && zSig1;
2444                 break;
2445             default:
2446                 abort();
2447             }
2448             if ( increment ) {
2449                 ++zSig0;
2450                 zSig0 &=
2451                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2452                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2453             }
2454             return packFloatx80( zSign, zExp, zSig0 );
2455         }
2456     }
2457     if (zSig1) {
2458         status->float_exception_flags |= float_flag_inexact;
2459     }
2460     if ( increment ) {
2461         ++zSig0;
2462         if ( zSig0 == 0 ) {
2463             ++zExp;
2464             zSig0 = LIT64( 0x8000000000000000 );
2465         }
2466         else {
2467             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2468         }
2469     }
2470     else {
2471         if ( zSig0 == 0 ) zExp = 0;
2472     }
2473     return packFloatx80( zSign, zExp, zSig0 );
2474 
2475 }
2476 
2477 /*----------------------------------------------------------------------------
2478 | Takes an abstract floating-point value having sign `zSign', exponent
2479 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2480 | and returns the proper extended double-precision floating-point value
2481 | corresponding to the abstract input.  This routine is just like
2482 | `roundAndPackFloatx80' except that the input significand does not have to be
2483 | normalized.
2484 *----------------------------------------------------------------------------*/
2485 
2486 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2487                                               flag zSign, int32_t zExp,
2488                                               uint64_t zSig0, uint64_t zSig1,
2489                                               float_status *status)
2490 {
2491     int8_t shiftCount;
2492 
2493     if ( zSig0 == 0 ) {
2494         zSig0 = zSig1;
2495         zSig1 = 0;
2496         zExp -= 64;
2497     }
2498     shiftCount = countLeadingZeros64( zSig0 );
2499     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2500     zExp -= shiftCount;
2501     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2502                                 zSig0, zSig1, status);
2503 
2504 }
2505 
2506 /*----------------------------------------------------------------------------
2507 | Returns the least-significant 64 fraction bits of the quadruple-precision
2508 | floating-point value `a'.
2509 *----------------------------------------------------------------------------*/
2510 
2511 static inline uint64_t extractFloat128Frac1( float128 a )
2512 {
2513 
2514     return a.low;
2515 
2516 }
2517 
2518 /*----------------------------------------------------------------------------
2519 | Returns the most-significant 48 fraction bits of the quadruple-precision
2520 | floating-point value `a'.
2521 *----------------------------------------------------------------------------*/
2522 
2523 static inline uint64_t extractFloat128Frac0( float128 a )
2524 {
2525 
2526     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2527 
2528 }
2529 
2530 /*----------------------------------------------------------------------------
2531 | Returns the exponent bits of the quadruple-precision floating-point value
2532 | `a'.
2533 *----------------------------------------------------------------------------*/
2534 
2535 static inline int32_t extractFloat128Exp( float128 a )
2536 {
2537 
2538     return ( a.high>>48 ) & 0x7FFF;
2539 
2540 }
2541 
2542 /*----------------------------------------------------------------------------
2543 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2544 *----------------------------------------------------------------------------*/
2545 
2546 static inline flag extractFloat128Sign( float128 a )
2547 {
2548 
2549     return a.high>>63;
2550 
2551 }
2552 
2553 /*----------------------------------------------------------------------------
2554 | Normalizes the subnormal quadruple-precision floating-point value
2555 | represented by the denormalized significand formed by the concatenation of
2556 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
2557 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
2558 | significand are stored at the location pointed to by `zSig0Ptr', and the
2559 | least significant 64 bits of the normalized significand are stored at the
2560 | location pointed to by `zSig1Ptr'.
2561 *----------------------------------------------------------------------------*/
2562 
2563 static void
2564  normalizeFloat128Subnormal(
2565      uint64_t aSig0,
2566      uint64_t aSig1,
2567      int32_t *zExpPtr,
2568      uint64_t *zSig0Ptr,
2569      uint64_t *zSig1Ptr
2570  )
2571 {
2572     int8_t shiftCount;
2573 
2574     if ( aSig0 == 0 ) {
2575         shiftCount = countLeadingZeros64( aSig1 ) - 15;
2576         if ( shiftCount < 0 ) {
2577             *zSig0Ptr = aSig1>>( - shiftCount );
2578             *zSig1Ptr = aSig1<<( shiftCount & 63 );
2579         }
2580         else {
2581             *zSig0Ptr = aSig1<<shiftCount;
2582             *zSig1Ptr = 0;
2583         }
2584         *zExpPtr = - shiftCount - 63;
2585     }
2586     else {
2587         shiftCount = countLeadingZeros64( aSig0 ) - 15;
2588         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
2589         *zExpPtr = 1 - shiftCount;
2590     }
2591 
2592 }
2593 
2594 /*----------------------------------------------------------------------------
2595 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2596 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2597 | floating-point value, returning the result.  After being shifted into the
2598 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2599 | added together to form the most significant 32 bits of the result.  This
2600 | means that any integer portion of `zSig0' will be added into the exponent.
2601 | Since a properly normalized significand will have an integer portion equal
2602 | to 1, the `zExp' input should be 1 less than the desired result exponent
2603 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2604 | significand.
2605 *----------------------------------------------------------------------------*/
2606 
2607 static inline float128
2608  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
2609 {
2610     float128 z;
2611 
2612     z.low = zSig1;
2613     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
2614     return z;
2615 
2616 }
2617 
2618 /*----------------------------------------------------------------------------
2619 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2620 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2621 | and `zSig2', and returns the proper quadruple-precision floating-point value
2622 | corresponding to the abstract input.  Ordinarily, the abstract value is
2623 | simply rounded and packed into the quadruple-precision format, with the
2624 | inexact exception raised if the abstract input cannot be represented
2625 | exactly.  However, if the abstract value is too large, the overflow and
2626 | inexact exceptions are raised and an infinity or maximal finite value is
2627 | returned.  If the abstract value is too small, the input value is rounded to
2628 | a subnormal number, and the underflow and inexact exceptions are raised if
2629 | the abstract input cannot be represented exactly as a subnormal quadruple-
2630 | precision floating-point number.
2631 |     The input significand must be normalized or smaller.  If the input
2632 | significand is not normalized, `zExp' must be 0; in that case, the result
2633 | returned is a subnormal number, and it must not require rounding.  In the
2634 | usual case that the input significand is normalized, `zExp' must be 1 less
2635 | than the ``true'' floating-point exponent.  The handling of underflow and
2636 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2637 *----------------------------------------------------------------------------*/
2638 
2639 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
2640                                      uint64_t zSig0, uint64_t zSig1,
2641                                      uint64_t zSig2, float_status *status)
2642 {
2643     int8_t roundingMode;
2644     flag roundNearestEven, increment, isTiny;
2645 
2646     roundingMode = status->float_rounding_mode;
2647     roundNearestEven = ( roundingMode == float_round_nearest_even );
2648     switch (roundingMode) {
2649     case float_round_nearest_even:
2650     case float_round_ties_away:
2651         increment = ((int64_t)zSig2 < 0);
2652         break;
2653     case float_round_to_zero:
2654         increment = 0;
2655         break;
2656     case float_round_up:
2657         increment = !zSign && zSig2;
2658         break;
2659     case float_round_down:
2660         increment = zSign && zSig2;
2661         break;
2662     case float_round_to_odd:
2663         increment = !(zSig1 & 0x1) && zSig2;
2664         break;
2665     default:
2666         abort();
2667     }
2668     if ( 0x7FFD <= (uint32_t) zExp ) {
2669         if (    ( 0x7FFD < zExp )
2670              || (    ( zExp == 0x7FFD )
2671                   && eq128(
2672                          LIT64( 0x0001FFFFFFFFFFFF ),
2673                          LIT64( 0xFFFFFFFFFFFFFFFF ),
2674                          zSig0,
2675                          zSig1
2676                      )
2677                   && increment
2678                 )
2679            ) {
2680             float_raise(float_flag_overflow | float_flag_inexact, status);
2681             if (    ( roundingMode == float_round_to_zero )
2682                  || ( zSign && ( roundingMode == float_round_up ) )
2683                  || ( ! zSign && ( roundingMode == float_round_down ) )
2684                  || (roundingMode == float_round_to_odd)
2685                ) {
2686                 return
2687                     packFloat128(
2688                         zSign,
2689                         0x7FFE,
2690                         LIT64( 0x0000FFFFFFFFFFFF ),
2691                         LIT64( 0xFFFFFFFFFFFFFFFF )
2692                     );
2693             }
2694             return packFloat128( zSign, 0x7FFF, 0, 0 );
2695         }
2696         if ( zExp < 0 ) {
2697             if (status->flush_to_zero) {
2698                 float_raise(float_flag_output_denormal, status);
2699                 return packFloat128(zSign, 0, 0, 0);
2700             }
2701             isTiny =
2702                    (status->float_detect_tininess
2703                     == float_tininess_before_rounding)
2704                 || ( zExp < -1 )
2705                 || ! increment
2706                 || lt128(
2707                        zSig0,
2708                        zSig1,
2709                        LIT64( 0x0001FFFFFFFFFFFF ),
2710                        LIT64( 0xFFFFFFFFFFFFFFFF )
2711                    );
2712             shift128ExtraRightJamming(
2713                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
2714             zExp = 0;
2715             if (isTiny && zSig2) {
2716                 float_raise(float_flag_underflow, status);
2717             }
2718             switch (roundingMode) {
2719             case float_round_nearest_even:
2720             case float_round_ties_away:
2721                 increment = ((int64_t)zSig2 < 0);
2722                 break;
2723             case float_round_to_zero:
2724                 increment = 0;
2725                 break;
2726             case float_round_up:
2727                 increment = !zSign && zSig2;
2728                 break;
2729             case float_round_down:
2730                 increment = zSign && zSig2;
2731                 break;
2732             case float_round_to_odd:
2733                 increment = !(zSig1 & 0x1) && zSig2;
2734                 break;
2735             default:
2736                 abort();
2737             }
2738         }
2739     }
2740     if (zSig2) {
2741         status->float_exception_flags |= float_flag_inexact;
2742     }
2743     if ( increment ) {
2744         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
2745         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
2746     }
2747     else {
2748         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
2749     }
2750     return packFloat128( zSign, zExp, zSig0, zSig1 );
2751 
2752 }
2753 
2754 /*----------------------------------------------------------------------------
2755 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2756 | and significand formed by the concatenation of `zSig0' and `zSig1', and
2757 | returns the proper quadruple-precision floating-point value corresponding
2758 | to the abstract input.  This routine is just like `roundAndPackFloat128'
2759 | except that the input significand has fewer bits and does not have to be
2760 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
2761 | point exponent.
2762 *----------------------------------------------------------------------------*/
2763 
2764 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
2765                                               uint64_t zSig0, uint64_t zSig1,
2766                                               float_status *status)
2767 {
2768     int8_t shiftCount;
2769     uint64_t zSig2;
2770 
2771     if ( zSig0 == 0 ) {
2772         zSig0 = zSig1;
2773         zSig1 = 0;
2774         zExp -= 64;
2775     }
2776     shiftCount = countLeadingZeros64( zSig0 ) - 15;
2777     if ( 0 <= shiftCount ) {
2778         zSig2 = 0;
2779         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2780     }
2781     else {
2782         shift128ExtraRightJamming(
2783             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
2784     }
2785     zExp -= shiftCount;
2786     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
2787 
2788 }
2789 
2790 
2791 /*----------------------------------------------------------------------------
2792 | Returns the result of converting the 32-bit two's complement integer `a'
2793 | to the extended double-precision floating-point format.  The conversion
2794 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2795 | Arithmetic.
2796 *----------------------------------------------------------------------------*/
2797 
2798 floatx80 int32_to_floatx80(int32_t a, float_status *status)
2799 {
2800     flag zSign;
2801     uint32_t absA;
2802     int8_t shiftCount;
2803     uint64_t zSig;
2804 
2805     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
2806     zSign = ( a < 0 );
2807     absA = zSign ? - a : a;
2808     shiftCount = countLeadingZeros32( absA ) + 32;
2809     zSig = absA;
2810     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
2811 
2812 }
2813 
2814 /*----------------------------------------------------------------------------
2815 | Returns the result of converting the 32-bit two's complement integer `a' to
2816 | the quadruple-precision floating-point format.  The conversion is performed
2817 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2818 *----------------------------------------------------------------------------*/
2819 
2820 float128 int32_to_float128(int32_t a, float_status *status)
2821 {
2822     flag zSign;
2823     uint32_t absA;
2824     int8_t shiftCount;
2825     uint64_t zSig0;
2826 
2827     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
2828     zSign = ( a < 0 );
2829     absA = zSign ? - a : a;
2830     shiftCount = countLeadingZeros32( absA ) + 17;
2831     zSig0 = absA;
2832     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
2833 
2834 }
2835 
2836 /*----------------------------------------------------------------------------
2837 | Returns the result of converting the 64-bit two's complement integer `a'
2838 | to the extended double-precision floating-point format.  The conversion
2839 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2840 | Arithmetic.
2841 *----------------------------------------------------------------------------*/
2842 
2843 floatx80 int64_to_floatx80(int64_t a, float_status *status)
2844 {
2845     flag zSign;
2846     uint64_t absA;
2847     int8_t shiftCount;
2848 
2849     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
2850     zSign = ( a < 0 );
2851     absA = zSign ? - a : a;
2852     shiftCount = countLeadingZeros64( absA );
2853     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
2854 
2855 }
2856 
2857 /*----------------------------------------------------------------------------
2858 | Returns the result of converting the 64-bit two's complement integer `a' to
2859 | the quadruple-precision floating-point format.  The conversion is performed
2860 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2861 *----------------------------------------------------------------------------*/
2862 
2863 float128 int64_to_float128(int64_t a, float_status *status)
2864 {
2865     flag zSign;
2866     uint64_t absA;
2867     int8_t shiftCount;
2868     int32_t zExp;
2869     uint64_t zSig0, zSig1;
2870 
2871     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
2872     zSign = ( a < 0 );
2873     absA = zSign ? - a : a;
2874     shiftCount = countLeadingZeros64( absA ) + 49;
2875     zExp = 0x406E - shiftCount;
2876     if ( 64 <= shiftCount ) {
2877         zSig1 = 0;
2878         zSig0 = absA;
2879         shiftCount -= 64;
2880     }
2881     else {
2882         zSig1 = absA;
2883         zSig0 = 0;
2884     }
2885     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2886     return packFloat128( zSign, zExp, zSig0, zSig1 );
2887 
2888 }
2889 
2890 /*----------------------------------------------------------------------------
2891 | Returns the result of converting the 64-bit unsigned integer `a'
2892 | to the quadruple-precision floating-point format.  The conversion is performed
2893 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2894 *----------------------------------------------------------------------------*/
2895 
2896 float128 uint64_to_float128(uint64_t a, float_status *status)
2897 {
2898     if (a == 0) {
2899         return float128_zero;
2900     }
2901     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
2902 }
2903 
2904 
2905 
2906 
2907 /*----------------------------------------------------------------------------
2908 | Returns the result of converting the single-precision floating-point value
2909 | `a' to the double-precision floating-point format.  The conversion is
2910 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2911 | Arithmetic.
2912 *----------------------------------------------------------------------------*/
2913 
2914 float64 float32_to_float64(float32 a, float_status *status)
2915 {
2916     flag aSign;
2917     int aExp;
2918     uint32_t aSig;
2919     a = float32_squash_input_denormal(a, status);
2920 
2921     aSig = extractFloat32Frac( a );
2922     aExp = extractFloat32Exp( a );
2923     aSign = extractFloat32Sign( a );
2924     if ( aExp == 0xFF ) {
2925         if (aSig) {
2926             return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
2927         }
2928         return packFloat64( aSign, 0x7FF, 0 );
2929     }
2930     if ( aExp == 0 ) {
2931         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
2932         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2933         --aExp;
2934     }
2935     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
2936 
2937 }
2938 
2939 /*----------------------------------------------------------------------------
2940 | Returns the result of converting the single-precision floating-point value
2941 | `a' to the extended double-precision floating-point format.  The conversion
2942 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2943 | Arithmetic.
2944 *----------------------------------------------------------------------------*/
2945 
2946 floatx80 float32_to_floatx80(float32 a, float_status *status)
2947 {
2948     flag aSign;
2949     int aExp;
2950     uint32_t aSig;
2951 
2952     a = float32_squash_input_denormal(a, status);
2953     aSig = extractFloat32Frac( a );
2954     aExp = extractFloat32Exp( a );
2955     aSign = extractFloat32Sign( a );
2956     if ( aExp == 0xFF ) {
2957         if (aSig) {
2958             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
2959         }
2960         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2961     }
2962     if ( aExp == 0 ) {
2963         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2964         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2965     }
2966     aSig |= 0x00800000;
2967     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
2968 
2969 }
2970 
2971 /*----------------------------------------------------------------------------
2972 | Returns the result of converting the single-precision floating-point value
2973 | `a' to the double-precision floating-point format.  The conversion is
2974 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2975 | Arithmetic.
2976 *----------------------------------------------------------------------------*/
2977 
2978 float128 float32_to_float128(float32 a, float_status *status)
2979 {
2980     flag aSign;
2981     int aExp;
2982     uint32_t aSig;
2983 
2984     a = float32_squash_input_denormal(a, status);
2985     aSig = extractFloat32Frac( a );
2986     aExp = extractFloat32Exp( a );
2987     aSign = extractFloat32Sign( a );
2988     if ( aExp == 0xFF ) {
2989         if (aSig) {
2990             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
2991         }
2992         return packFloat128( aSign, 0x7FFF, 0, 0 );
2993     }
2994     if ( aExp == 0 ) {
2995         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2996         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2997         --aExp;
2998     }
2999     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3000 
3001 }
3002 
3003 /*----------------------------------------------------------------------------
3004 | Returns the remainder of the single-precision floating-point value `a'
3005 | with respect to the corresponding value `b'.  The operation is performed
3006 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3007 *----------------------------------------------------------------------------*/
3008 
3009 float32 float32_rem(float32 a, float32 b, float_status *status)
3010 {
3011     flag aSign, zSign;
3012     int aExp, bExp, expDiff;
3013     uint32_t aSig, bSig;
3014     uint32_t q;
3015     uint64_t aSig64, bSig64, q64;
3016     uint32_t alternateASig;
3017     int32_t sigMean;
3018     a = float32_squash_input_denormal(a, status);
3019     b = float32_squash_input_denormal(b, status);
3020 
3021     aSig = extractFloat32Frac( a );
3022     aExp = extractFloat32Exp( a );
3023     aSign = extractFloat32Sign( a );
3024     bSig = extractFloat32Frac( b );
3025     bExp = extractFloat32Exp( b );
3026     if ( aExp == 0xFF ) {
3027         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3028             return propagateFloat32NaN(a, b, status);
3029         }
3030         float_raise(float_flag_invalid, status);
3031         return float32_default_nan(status);
3032     }
3033     if ( bExp == 0xFF ) {
3034         if (bSig) {
3035             return propagateFloat32NaN(a, b, status);
3036         }
3037         return a;
3038     }
3039     if ( bExp == 0 ) {
3040         if ( bSig == 0 ) {
3041             float_raise(float_flag_invalid, status);
3042             return float32_default_nan(status);
3043         }
3044         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3045     }
3046     if ( aExp == 0 ) {
3047         if ( aSig == 0 ) return a;
3048         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3049     }
3050     expDiff = aExp - bExp;
3051     aSig |= 0x00800000;
3052     bSig |= 0x00800000;
3053     if ( expDiff < 32 ) {
3054         aSig <<= 8;
3055         bSig <<= 8;
3056         if ( expDiff < 0 ) {
3057             if ( expDiff < -1 ) return a;
3058             aSig >>= 1;
3059         }
3060         q = ( bSig <= aSig );
3061         if ( q ) aSig -= bSig;
3062         if ( 0 < expDiff ) {
3063             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3064             q >>= 32 - expDiff;
3065             bSig >>= 2;
3066             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3067         }
3068         else {
3069             aSig >>= 2;
3070             bSig >>= 2;
3071         }
3072     }
3073     else {
3074         if ( bSig <= aSig ) aSig -= bSig;
3075         aSig64 = ( (uint64_t) aSig )<<40;
3076         bSig64 = ( (uint64_t) bSig )<<40;
3077         expDiff -= 64;
3078         while ( 0 < expDiff ) {
3079             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3080             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3081             aSig64 = - ( ( bSig * q64 )<<38 );
3082             expDiff -= 62;
3083         }
3084         expDiff += 64;
3085         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3086         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3087         q = q64>>( 64 - expDiff );
3088         bSig <<= 6;
3089         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3090     }
3091     do {
3092         alternateASig = aSig;
3093         ++q;
3094         aSig -= bSig;
3095     } while ( 0 <= (int32_t) aSig );
3096     sigMean = aSig + alternateASig;
3097     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3098         aSig = alternateASig;
3099     }
3100     zSign = ( (int32_t) aSig < 0 );
3101     if ( zSign ) aSig = - aSig;
3102     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3103 }
3104 
3105 
3106 /*----------------------------------------------------------------------------
3107 | Returns the square root of the single-precision floating-point value `a'.
3108 | The operation is performed according to the IEC/IEEE Standard for Binary
3109 | Floating-Point Arithmetic.
3110 *----------------------------------------------------------------------------*/
3111 
3112 float32 float32_sqrt(float32 a, float_status *status)
3113 {
3114     flag aSign;
3115     int aExp, zExp;
3116     uint32_t aSig, zSig;
3117     uint64_t rem, term;
3118     a = float32_squash_input_denormal(a, status);
3119 
3120     aSig = extractFloat32Frac( a );
3121     aExp = extractFloat32Exp( a );
3122     aSign = extractFloat32Sign( a );
3123     if ( aExp == 0xFF ) {
3124         if (aSig) {
3125             return propagateFloat32NaN(a, float32_zero, status);
3126         }
3127         if ( ! aSign ) return a;
3128         float_raise(float_flag_invalid, status);
3129         return float32_default_nan(status);
3130     }
3131     if ( aSign ) {
3132         if ( ( aExp | aSig ) == 0 ) return a;
3133         float_raise(float_flag_invalid, status);
3134         return float32_default_nan(status);
3135     }
3136     if ( aExp == 0 ) {
3137         if ( aSig == 0 ) return float32_zero;
3138         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3139     }
3140     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
3141     aSig = ( aSig | 0x00800000 )<<8;
3142     zSig = estimateSqrt32( aExp, aSig ) + 2;
3143     if ( ( zSig & 0x7F ) <= 5 ) {
3144         if ( zSig < 2 ) {
3145             zSig = 0x7FFFFFFF;
3146             goto roundAndPack;
3147         }
3148         aSig >>= aExp & 1;
3149         term = ( (uint64_t) zSig ) * zSig;
3150         rem = ( ( (uint64_t) aSig )<<32 ) - term;
3151         while ( (int64_t) rem < 0 ) {
3152             --zSig;
3153             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
3154         }
3155         zSig |= ( rem != 0 );
3156     }
3157     shift32RightJamming( zSig, 1, &zSig );
3158  roundAndPack:
3159     return roundAndPackFloat32(0, zExp, zSig, status);
3160 
3161 }
3162 
3163 /*----------------------------------------------------------------------------
3164 | Returns the binary exponential of the single-precision floating-point value
3165 | `a'. The operation is performed according to the IEC/IEEE Standard for
3166 | Binary Floating-Point Arithmetic.
3167 |
3168 | Uses the following identities:
3169 |
3170 | 1. -------------------------------------------------------------------------
3171 |      x    x*ln(2)
3172 |     2  = e
3173 |
3174 | 2. -------------------------------------------------------------------------
3175 |                      2     3     4     5           n
3176 |      x        x     x     x     x     x           x
3177 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3178 |               1!    2!    3!    4!    5!          n!
3179 *----------------------------------------------------------------------------*/
3180 
3181 static const float64 float32_exp2_coefficients[15] =
3182 {
3183     const_float64( 0x3ff0000000000000ll ), /*  1 */
3184     const_float64( 0x3fe0000000000000ll ), /*  2 */
3185     const_float64( 0x3fc5555555555555ll ), /*  3 */
3186     const_float64( 0x3fa5555555555555ll ), /*  4 */
3187     const_float64( 0x3f81111111111111ll ), /*  5 */
3188     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
3189     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
3190     const_float64( 0x3efa01a01a01a01all ), /*  8 */
3191     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
3192     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3193     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3194     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3195     const_float64( 0x3de6124613a86d09ll ), /* 13 */
3196     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3197     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3198 };
3199 
3200 float32 float32_exp2(float32 a, float_status *status)
3201 {
3202     flag aSign;
3203     int aExp;
3204     uint32_t aSig;
3205     float64 r, x, xn;
3206     int i;
3207     a = float32_squash_input_denormal(a, status);
3208 
3209     aSig = extractFloat32Frac( a );
3210     aExp = extractFloat32Exp( a );
3211     aSign = extractFloat32Sign( a );
3212 
3213     if ( aExp == 0xFF) {
3214         if (aSig) {
3215             return propagateFloat32NaN(a, float32_zero, status);
3216         }
3217         return (aSign) ? float32_zero : a;
3218     }
3219     if (aExp == 0) {
3220         if (aSig == 0) return float32_one;
3221     }
3222 
3223     float_raise(float_flag_inexact, status);
3224 
3225     /* ******************************* */
3226     /* using float64 for approximation */
3227     /* ******************************* */
3228     x = float32_to_float64(a, status);
3229     x = float64_mul(x, float64_ln2, status);
3230 
3231     xn = x;
3232     r = float64_one;
3233     for (i = 0 ; i < 15 ; i++) {
3234         float64 f;
3235 
3236         f = float64_mul(xn, float32_exp2_coefficients[i], status);
3237         r = float64_add(r, f, status);
3238 
3239         xn = float64_mul(xn, x, status);
3240     }
3241 
3242     return float64_to_float32(r, status);
3243 }
3244 
3245 /*----------------------------------------------------------------------------
3246 | Returns the binary log of the single-precision floating-point value `a'.
3247 | The operation is performed according to the IEC/IEEE Standard for Binary
3248 | Floating-Point Arithmetic.
3249 *----------------------------------------------------------------------------*/
3250 float32 float32_log2(float32 a, float_status *status)
3251 {
3252     flag aSign, zSign;
3253     int aExp;
3254     uint32_t aSig, zSig, i;
3255 
3256     a = float32_squash_input_denormal(a, status);
3257     aSig = extractFloat32Frac( a );
3258     aExp = extractFloat32Exp( a );
3259     aSign = extractFloat32Sign( a );
3260 
3261     if ( aExp == 0 ) {
3262         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3263         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3264     }
3265     if ( aSign ) {
3266         float_raise(float_flag_invalid, status);
3267         return float32_default_nan(status);
3268     }
3269     if ( aExp == 0xFF ) {
3270         if (aSig) {
3271             return propagateFloat32NaN(a, float32_zero, status);
3272         }
3273         return a;
3274     }
3275 
3276     aExp -= 0x7F;
3277     aSig |= 0x00800000;
3278     zSign = aExp < 0;
3279     zSig = aExp << 23;
3280 
3281     for (i = 1 << 22; i > 0; i >>= 1) {
3282         aSig = ( (uint64_t)aSig * aSig ) >> 23;
3283         if ( aSig & 0x01000000 ) {
3284             aSig >>= 1;
3285             zSig |= i;
3286         }
3287     }
3288 
3289     if ( zSign )
3290         zSig = -zSig;
3291 
3292     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3293 }
3294 
3295 /*----------------------------------------------------------------------------
3296 | Returns 1 if the single-precision floating-point value `a' is equal to
3297 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3298 | raised if either operand is a NaN.  Otherwise, the comparison is performed
3299 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3300 *----------------------------------------------------------------------------*/
3301 
3302 int float32_eq(float32 a, float32 b, float_status *status)
3303 {
3304     uint32_t av, bv;
3305     a = float32_squash_input_denormal(a, status);
3306     b = float32_squash_input_denormal(b, status);
3307 
3308     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3309          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3310        ) {
3311         float_raise(float_flag_invalid, status);
3312         return 0;
3313     }
3314     av = float32_val(a);
3315     bv = float32_val(b);
3316     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3317 }
3318 
3319 /*----------------------------------------------------------------------------
3320 | Returns 1 if the single-precision floating-point value `a' is less than
3321 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
3322 | exception is raised if either operand is a NaN.  The comparison is performed
3323 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3324 *----------------------------------------------------------------------------*/
3325 
3326 int float32_le(float32 a, float32 b, float_status *status)
3327 {
3328     flag aSign, bSign;
3329     uint32_t av, bv;
3330     a = float32_squash_input_denormal(a, status);
3331     b = float32_squash_input_denormal(b, status);
3332 
3333     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3334          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3335        ) {
3336         float_raise(float_flag_invalid, status);
3337         return 0;
3338     }
3339     aSign = extractFloat32Sign( a );
3340     bSign = extractFloat32Sign( b );
3341     av = float32_val(a);
3342     bv = float32_val(b);
3343     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3344     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3345 
3346 }
3347 
3348 /*----------------------------------------------------------------------------
3349 | Returns 1 if the single-precision floating-point value `a' is less than
3350 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3351 | raised if either operand is a NaN.  The comparison is performed according
3352 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3353 *----------------------------------------------------------------------------*/
3354 
3355 int float32_lt(float32 a, float32 b, float_status *status)
3356 {
3357     flag aSign, bSign;
3358     uint32_t av, bv;
3359     a = float32_squash_input_denormal(a, status);
3360     b = float32_squash_input_denormal(b, status);
3361 
3362     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3363          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3364        ) {
3365         float_raise(float_flag_invalid, status);
3366         return 0;
3367     }
3368     aSign = extractFloat32Sign( a );
3369     bSign = extractFloat32Sign( b );
3370     av = float32_val(a);
3371     bv = float32_val(b);
3372     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3373     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3374 
3375 }
3376 
3377 /*----------------------------------------------------------------------------
3378 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3379 | be compared, and 0 otherwise.  The invalid exception is raised if either
3380 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
3381 | Standard for Binary Floating-Point Arithmetic.
3382 *----------------------------------------------------------------------------*/
3383 
3384 int float32_unordered(float32 a, float32 b, float_status *status)
3385 {
3386     a = float32_squash_input_denormal(a, status);
3387     b = float32_squash_input_denormal(b, status);
3388 
3389     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3390          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3391        ) {
3392         float_raise(float_flag_invalid, status);
3393         return 1;
3394     }
3395     return 0;
3396 }
3397 
3398 /*----------------------------------------------------------------------------
3399 | Returns 1 if the single-precision floating-point value `a' is equal to
3400 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3401 | exception.  The comparison is performed according to the IEC/IEEE Standard
3402 | for Binary Floating-Point Arithmetic.
3403 *----------------------------------------------------------------------------*/
3404 
3405 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3406 {
3407     a = float32_squash_input_denormal(a, status);
3408     b = float32_squash_input_denormal(b, status);
3409 
3410     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3411          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3412        ) {
3413         if (float32_is_signaling_nan(a, status)
3414          || float32_is_signaling_nan(b, status)) {
3415             float_raise(float_flag_invalid, status);
3416         }
3417         return 0;
3418     }
3419     return ( float32_val(a) == float32_val(b) ) ||
3420             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3421 }
3422 
3423 /*----------------------------------------------------------------------------
3424 | Returns 1 if the single-precision floating-point value `a' is less than or
3425 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3426 | cause an exception.  Otherwise, the comparison is performed according to the
3427 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3428 *----------------------------------------------------------------------------*/
3429 
3430 int float32_le_quiet(float32 a, float32 b, float_status *status)
3431 {
3432     flag aSign, bSign;
3433     uint32_t av, bv;
3434     a = float32_squash_input_denormal(a, status);
3435     b = float32_squash_input_denormal(b, status);
3436 
3437     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3438          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3439        ) {
3440         if (float32_is_signaling_nan(a, status)
3441          || float32_is_signaling_nan(b, status)) {
3442             float_raise(float_flag_invalid, status);
3443         }
3444         return 0;
3445     }
3446     aSign = extractFloat32Sign( a );
3447     bSign = extractFloat32Sign( b );
3448     av = float32_val(a);
3449     bv = float32_val(b);
3450     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3451     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3452 
3453 }
3454 
3455 /*----------------------------------------------------------------------------
3456 | Returns 1 if the single-precision floating-point value `a' is less than
3457 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3458 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3459 | Standard for Binary Floating-Point Arithmetic.
3460 *----------------------------------------------------------------------------*/
3461 
3462 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3463 {
3464     flag aSign, bSign;
3465     uint32_t av, bv;
3466     a = float32_squash_input_denormal(a, status);
3467     b = float32_squash_input_denormal(b, status);
3468 
3469     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3470          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3471        ) {
3472         if (float32_is_signaling_nan(a, status)
3473          || float32_is_signaling_nan(b, status)) {
3474             float_raise(float_flag_invalid, status);
3475         }
3476         return 0;
3477     }
3478     aSign = extractFloat32Sign( a );
3479     bSign = extractFloat32Sign( b );
3480     av = float32_val(a);
3481     bv = float32_val(b);
3482     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3483     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3484 
3485 }
3486 
3487 /*----------------------------------------------------------------------------
3488 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3489 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3490 | comparison is performed according to the IEC/IEEE Standard for Binary
3491 | Floating-Point Arithmetic.
3492 *----------------------------------------------------------------------------*/
3493 
3494 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3495 {
3496     a = float32_squash_input_denormal(a, status);
3497     b = float32_squash_input_denormal(b, status);
3498 
3499     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3500          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3501        ) {
3502         if (float32_is_signaling_nan(a, status)
3503          || float32_is_signaling_nan(b, status)) {
3504             float_raise(float_flag_invalid, status);
3505         }
3506         return 1;
3507     }
3508     return 0;
3509 }
3510 
3511 
3512 /*----------------------------------------------------------------------------
3513 | Returns the result of converting the double-precision floating-point value
3514 | `a' to the single-precision floating-point format.  The conversion is
3515 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3516 | Arithmetic.
3517 *----------------------------------------------------------------------------*/
3518 
3519 float32 float64_to_float32(float64 a, float_status *status)
3520 {
3521     flag aSign;
3522     int aExp;
3523     uint64_t aSig;
3524     uint32_t zSig;
3525     a = float64_squash_input_denormal(a, status);
3526 
3527     aSig = extractFloat64Frac( a );
3528     aExp = extractFloat64Exp( a );
3529     aSign = extractFloat64Sign( a );
3530     if ( aExp == 0x7FF ) {
3531         if (aSig) {
3532             return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3533         }
3534         return packFloat32( aSign, 0xFF, 0 );
3535     }
3536     shift64RightJamming( aSig, 22, &aSig );
3537     zSig = aSig;
3538     if ( aExp || zSig ) {
3539         zSig |= 0x40000000;
3540         aExp -= 0x381;
3541     }
3542     return roundAndPackFloat32(aSign, aExp, zSig, status);
3543 
3544 }
3545 
3546 
3547 /*----------------------------------------------------------------------------
3548 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3549 | half-precision floating-point value, returning the result.  After being
3550 | shifted into the proper positions, the three fields are simply added
3551 | together to form the result.  This means that any integer portion of `zSig'
3552 | will be added into the exponent.  Since a properly normalized significand
3553 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3554 | than the desired result exponent whenever `zSig' is a complete, normalized
3555 | significand.
3556 *----------------------------------------------------------------------------*/
3557 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3558 {
3559     return make_float16(
3560         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3561 }
3562 
3563 /*----------------------------------------------------------------------------
3564 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3565 | and significand `zSig', and returns the proper half-precision floating-
3566 | point value corresponding to the abstract input.  Ordinarily, the abstract
3567 | value is simply rounded and packed into the half-precision format, with
3568 | the inexact exception raised if the abstract input cannot be represented
3569 | exactly.  However, if the abstract value is too large, the overflow and
3570 | inexact exceptions are raised and an infinity or maximal finite value is
3571 | returned.  If the abstract value is too small, the input value is rounded to
3572 | a subnormal number, and the underflow and inexact exceptions are raised if
3573 | the abstract input cannot be represented exactly as a subnormal half-
3574 | precision floating-point number.
3575 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3576 | ARM-style "alternative representation", which omits the NaN and Inf
3577 | encodings in order to raise the maximum representable exponent by one.
3578 |     The input significand `zSig' has its binary point between bits 22
3579 | and 23, which is 13 bits to the left of the usual location.  This shifted
3580 | significand must be normalized or smaller.  If `zSig' is not normalized,
3581 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3582 | and it must not require rounding.  In the usual case that `zSig' is
3583 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3584 | Note the slightly odd position of the binary point in zSig compared with the
3585 | other roundAndPackFloat functions. This should probably be fixed if we
3586 | need to implement more float16 routines than just conversion.
3587 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3588 | Binary Floating-Point Arithmetic.
3589 *----------------------------------------------------------------------------*/
3590 
3591 static float16 roundAndPackFloat16(flag zSign, int zExp,
3592                                    uint32_t zSig, flag ieee,
3593                                    float_status *status)
3594 {
3595     int maxexp = ieee ? 29 : 30;
3596     uint32_t mask;
3597     uint32_t increment;
3598     bool rounding_bumps_exp;
3599     bool is_tiny = false;
3600 
3601     /* Calculate the mask of bits of the mantissa which are not
3602      * representable in half-precision and will be lost.
3603      */
3604     if (zExp < 1) {
3605         /* Will be denormal in halfprec */
3606         mask = 0x00ffffff;
3607         if (zExp >= -11) {
3608             mask >>= 11 + zExp;
3609         }
3610     } else {
3611         /* Normal number in halfprec */
3612         mask = 0x00001fff;
3613     }
3614 
3615     switch (status->float_rounding_mode) {
3616     case float_round_nearest_even:
3617         increment = (mask + 1) >> 1;
3618         if ((zSig & mask) == increment) {
3619             increment = zSig & (increment << 1);
3620         }
3621         break;
3622     case float_round_ties_away:
3623         increment = (mask + 1) >> 1;
3624         break;
3625     case float_round_up:
3626         increment = zSign ? 0 : mask;
3627         break;
3628     case float_round_down:
3629         increment = zSign ? mask : 0;
3630         break;
3631     default: /* round_to_zero */
3632         increment = 0;
3633         break;
3634     }
3635 
3636     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3637 
3638     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3639         if (ieee) {
3640             float_raise(float_flag_overflow | float_flag_inexact, status);
3641             return packFloat16(zSign, 0x1f, 0);
3642         } else {
3643             float_raise(float_flag_invalid, status);
3644             return packFloat16(zSign, 0x1f, 0x3ff);
3645         }
3646     }
3647 
3648     if (zExp < 0) {
3649         /* Note that flush-to-zero does not affect half-precision results */
3650         is_tiny =
3651             (status->float_detect_tininess == float_tininess_before_rounding)
3652             || (zExp < -1)
3653             || (!rounding_bumps_exp);
3654     }
3655     if (zSig & mask) {
3656         float_raise(float_flag_inexact, status);
3657         if (is_tiny) {
3658             float_raise(float_flag_underflow, status);
3659         }
3660     }
3661 
3662     zSig += increment;
3663     if (rounding_bumps_exp) {
3664         zSig >>= 1;
3665         zExp++;
3666     }
3667 
3668     if (zExp < -10) {
3669         return packFloat16(zSign, 0, 0);
3670     }
3671     if (zExp < 0) {
3672         zSig >>= -zExp;
3673         zExp = 0;
3674     }
3675     return packFloat16(zSign, zExp, zSig >> 13);
3676 }
3677 
3678 /*----------------------------------------------------------------------------
3679 | If `a' is denormal and we are in flush-to-zero mode then set the
3680 | input-denormal exception and return zero. Otherwise just return the value.
3681 *----------------------------------------------------------------------------*/
3682 float16 float16_squash_input_denormal(float16 a, float_status *status)
3683 {
3684     if (status->flush_inputs_to_zero) {
3685         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3686             float_raise(float_flag_input_denormal, status);
3687             return make_float16(float16_val(a) & 0x8000);
3688         }
3689     }
3690     return a;
3691 }
3692 
3693 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
3694                                       uint32_t *zSigPtr)
3695 {
3696     int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3697     *zSigPtr = aSig << shiftCount;
3698     *zExpPtr = 1 - shiftCount;
3699 }
3700 
3701 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3702    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3703 
3704 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3705 {
3706     flag aSign;
3707     int aExp;
3708     uint32_t aSig;
3709 
3710     aSign = extractFloat16Sign(a);
3711     aExp = extractFloat16Exp(a);
3712     aSig = extractFloat16Frac(a);
3713 
3714     if (aExp == 0x1f && ieee) {
3715         if (aSig) {
3716             return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3717         }
3718         return packFloat32(aSign, 0xff, 0);
3719     }
3720     if (aExp == 0) {
3721         if (aSig == 0) {
3722             return packFloat32(aSign, 0, 0);
3723         }
3724 
3725         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3726         aExp--;
3727     }
3728     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3729 }
3730 
3731 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3732 {
3733     flag aSign;
3734     int aExp;
3735     uint32_t aSig;
3736 
3737     a = float32_squash_input_denormal(a, status);
3738 
3739     aSig = extractFloat32Frac( a );
3740     aExp = extractFloat32Exp( a );
3741     aSign = extractFloat32Sign( a );
3742     if ( aExp == 0xFF ) {
3743         if (aSig) {
3744             /* Input is a NaN */
3745             if (!ieee) {
3746                 float_raise(float_flag_invalid, status);
3747                 return packFloat16(aSign, 0, 0);
3748             }
3749             return commonNaNToFloat16(
3750                 float32ToCommonNaN(a, status), status);
3751         }
3752         /* Infinity */
3753         if (!ieee) {
3754             float_raise(float_flag_invalid, status);
3755             return packFloat16(aSign, 0x1f, 0x3ff);
3756         }
3757         return packFloat16(aSign, 0x1f, 0);
3758     }
3759     if (aExp == 0 && aSig == 0) {
3760         return packFloat16(aSign, 0, 0);
3761     }
3762     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3763      * even if the input is denormal; however this is harmless because
3764      * the largest possible single-precision denormal is still smaller
3765      * than the smallest representable half-precision denormal, and so we
3766      * will end up ignoring aSig and returning via the "always return zero"
3767      * codepath.
3768      */
3769     aSig |= 0x00800000;
3770     aExp -= 0x71;
3771 
3772     return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3773 }
3774 
3775 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3776 {
3777     flag aSign;
3778     int aExp;
3779     uint32_t aSig;
3780 
3781     aSign = extractFloat16Sign(a);
3782     aExp = extractFloat16Exp(a);
3783     aSig = extractFloat16Frac(a);
3784 
3785     if (aExp == 0x1f && ieee) {
3786         if (aSig) {
3787             return commonNaNToFloat64(
3788                 float16ToCommonNaN(a, status), status);
3789         }
3790         return packFloat64(aSign, 0x7ff, 0);
3791     }
3792     if (aExp == 0) {
3793         if (aSig == 0) {
3794             return packFloat64(aSign, 0, 0);
3795         }
3796 
3797         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3798         aExp--;
3799     }
3800     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3801 }
3802 
3803 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3804 {
3805     flag aSign;
3806     int aExp;
3807     uint64_t aSig;
3808     uint32_t zSig;
3809 
3810     a = float64_squash_input_denormal(a, status);
3811 
3812     aSig = extractFloat64Frac(a);
3813     aExp = extractFloat64Exp(a);
3814     aSign = extractFloat64Sign(a);
3815     if (aExp == 0x7FF) {
3816         if (aSig) {
3817             /* Input is a NaN */
3818             if (!ieee) {
3819                 float_raise(float_flag_invalid, status);
3820                 return packFloat16(aSign, 0, 0);
3821             }
3822             return commonNaNToFloat16(
3823                 float64ToCommonNaN(a, status), status);
3824         }
3825         /* Infinity */
3826         if (!ieee) {
3827             float_raise(float_flag_invalid, status);
3828             return packFloat16(aSign, 0x1f, 0x3ff);
3829         }
3830         return packFloat16(aSign, 0x1f, 0);
3831     }
3832     shift64RightJamming(aSig, 29, &aSig);
3833     zSig = aSig;
3834     if (aExp == 0 && zSig == 0) {
3835         return packFloat16(aSign, 0, 0);
3836     }
3837     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3838      * even if the input is denormal; however this is harmless because
3839      * the largest possible single-precision denormal is still smaller
3840      * than the smallest representable half-precision denormal, and so we
3841      * will end up ignoring aSig and returning via the "always return zero"
3842      * codepath.
3843      */
3844     zSig |= 0x00800000;
3845     aExp -= 0x3F1;
3846 
3847     return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
3848 }
3849 
3850 /*----------------------------------------------------------------------------
3851 | Returns the result of converting the double-precision floating-point value
3852 | `a' to the extended double-precision floating-point format.  The conversion
3853 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3854 | Arithmetic.
3855 *----------------------------------------------------------------------------*/
3856 
3857 floatx80 float64_to_floatx80(float64 a, float_status *status)
3858 {
3859     flag aSign;
3860     int aExp;
3861     uint64_t aSig;
3862 
3863     a = float64_squash_input_denormal(a, status);
3864     aSig = extractFloat64Frac( a );
3865     aExp = extractFloat64Exp( a );
3866     aSign = extractFloat64Sign( a );
3867     if ( aExp == 0x7FF ) {
3868         if (aSig) {
3869             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3870         }
3871         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3872     }
3873     if ( aExp == 0 ) {
3874         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3875         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3876     }
3877     return
3878         packFloatx80(
3879             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3880 
3881 }
3882 
3883 /*----------------------------------------------------------------------------
3884 | Returns the result of converting the double-precision floating-point value
3885 | `a' to the quadruple-precision floating-point format.  The conversion is
3886 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3887 | Arithmetic.
3888 *----------------------------------------------------------------------------*/
3889 
3890 float128 float64_to_float128(float64 a, float_status *status)
3891 {
3892     flag aSign;
3893     int aExp;
3894     uint64_t aSig, zSig0, zSig1;
3895 
3896     a = float64_squash_input_denormal(a, status);
3897     aSig = extractFloat64Frac( a );
3898     aExp = extractFloat64Exp( a );
3899     aSign = extractFloat64Sign( a );
3900     if ( aExp == 0x7FF ) {
3901         if (aSig) {
3902             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3903         }
3904         return packFloat128( aSign, 0x7FFF, 0, 0 );
3905     }
3906     if ( aExp == 0 ) {
3907         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3908         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3909         --aExp;
3910     }
3911     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3912     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3913 
3914 }
3915 
3916 
3917 /*----------------------------------------------------------------------------
3918 | Returns the remainder of the double-precision floating-point value `a'
3919 | with respect to the corresponding value `b'.  The operation is performed
3920 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3921 *----------------------------------------------------------------------------*/
3922 
3923 float64 float64_rem(float64 a, float64 b, float_status *status)
3924 {
3925     flag aSign, zSign;
3926     int aExp, bExp, expDiff;
3927     uint64_t aSig, bSig;
3928     uint64_t q, alternateASig;
3929     int64_t sigMean;
3930 
3931     a = float64_squash_input_denormal(a, status);
3932     b = float64_squash_input_denormal(b, status);
3933     aSig = extractFloat64Frac( a );
3934     aExp = extractFloat64Exp( a );
3935     aSign = extractFloat64Sign( a );
3936     bSig = extractFloat64Frac( b );
3937     bExp = extractFloat64Exp( b );
3938     if ( aExp == 0x7FF ) {
3939         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3940             return propagateFloat64NaN(a, b, status);
3941         }
3942         float_raise(float_flag_invalid, status);
3943         return float64_default_nan(status);
3944     }
3945     if ( bExp == 0x7FF ) {
3946         if (bSig) {
3947             return propagateFloat64NaN(a, b, status);
3948         }
3949         return a;
3950     }
3951     if ( bExp == 0 ) {
3952         if ( bSig == 0 ) {
3953             float_raise(float_flag_invalid, status);
3954             return float64_default_nan(status);
3955         }
3956         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3957     }
3958     if ( aExp == 0 ) {
3959         if ( aSig == 0 ) return a;
3960         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3961     }
3962     expDiff = aExp - bExp;
3963     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3964     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3965     if ( expDiff < 0 ) {
3966         if ( expDiff < -1 ) return a;
3967         aSig >>= 1;
3968     }
3969     q = ( bSig <= aSig );
3970     if ( q ) aSig -= bSig;
3971     expDiff -= 64;
3972     while ( 0 < expDiff ) {
3973         q = estimateDiv128To64( aSig, 0, bSig );
3974         q = ( 2 < q ) ? q - 2 : 0;
3975         aSig = - ( ( bSig>>2 ) * q );
3976         expDiff -= 62;
3977     }
3978     expDiff += 64;
3979     if ( 0 < expDiff ) {
3980         q = estimateDiv128To64( aSig, 0, bSig );
3981         q = ( 2 < q ) ? q - 2 : 0;
3982         q >>= 64 - expDiff;
3983         bSig >>= 2;
3984         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3985     }
3986     else {
3987         aSig >>= 2;
3988         bSig >>= 2;
3989     }
3990     do {
3991         alternateASig = aSig;
3992         ++q;
3993         aSig -= bSig;
3994     } while ( 0 <= (int64_t) aSig );
3995     sigMean = aSig + alternateASig;
3996     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3997         aSig = alternateASig;
3998     }
3999     zSign = ( (int64_t) aSig < 0 );
4000     if ( zSign ) aSig = - aSig;
4001     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4002 
4003 }
4004 
4005 
4006 /*----------------------------------------------------------------------------
4007 | Returns the square root of the double-precision floating-point value `a'.
4008 | The operation is performed according to the IEC/IEEE Standard for Binary
4009 | Floating-Point Arithmetic.
4010 *----------------------------------------------------------------------------*/
4011 
4012 float64 float64_sqrt(float64 a, float_status *status)
4013 {
4014     flag aSign;
4015     int aExp, zExp;
4016     uint64_t aSig, zSig, doubleZSig;
4017     uint64_t rem0, rem1, term0, term1;
4018     a = float64_squash_input_denormal(a, status);
4019 
4020     aSig = extractFloat64Frac( a );
4021     aExp = extractFloat64Exp( a );
4022     aSign = extractFloat64Sign( a );
4023     if ( aExp == 0x7FF ) {
4024         if (aSig) {
4025             return propagateFloat64NaN(a, a, status);
4026         }
4027         if ( ! aSign ) return a;
4028         float_raise(float_flag_invalid, status);
4029         return float64_default_nan(status);
4030     }
4031     if ( aSign ) {
4032         if ( ( aExp | aSig ) == 0 ) return a;
4033         float_raise(float_flag_invalid, status);
4034         return float64_default_nan(status);
4035     }
4036     if ( aExp == 0 ) {
4037         if ( aSig == 0 ) return float64_zero;
4038         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4039     }
4040     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4041     aSig |= LIT64( 0x0010000000000000 );
4042     zSig = estimateSqrt32( aExp, aSig>>21 );
4043     aSig <<= 9 - ( aExp & 1 );
4044     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4045     if ( ( zSig & 0x1FF ) <= 5 ) {
4046         doubleZSig = zSig<<1;
4047         mul64To128( zSig, zSig, &term0, &term1 );
4048         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4049         while ( (int64_t) rem0 < 0 ) {
4050             --zSig;
4051             doubleZSig -= 2;
4052             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4053         }
4054         zSig |= ( ( rem0 | rem1 ) != 0 );
4055     }
4056     return roundAndPackFloat64(0, zExp, zSig, status);
4057 
4058 }
4059 
4060 /*----------------------------------------------------------------------------
4061 | Returns the binary log of the double-precision floating-point value `a'.
4062 | The operation is performed according to the IEC/IEEE Standard for Binary
4063 | Floating-Point Arithmetic.
4064 *----------------------------------------------------------------------------*/
4065 float64 float64_log2(float64 a, float_status *status)
4066 {
4067     flag aSign, zSign;
4068     int aExp;
4069     uint64_t aSig, aSig0, aSig1, zSig, i;
4070     a = float64_squash_input_denormal(a, status);
4071 
4072     aSig = extractFloat64Frac( a );
4073     aExp = extractFloat64Exp( a );
4074     aSign = extractFloat64Sign( a );
4075 
4076     if ( aExp == 0 ) {
4077         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4078         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4079     }
4080     if ( aSign ) {
4081         float_raise(float_flag_invalid, status);
4082         return float64_default_nan(status);
4083     }
4084     if ( aExp == 0x7FF ) {
4085         if (aSig) {
4086             return propagateFloat64NaN(a, float64_zero, status);
4087         }
4088         return a;
4089     }
4090 
4091     aExp -= 0x3FF;
4092     aSig |= LIT64( 0x0010000000000000 );
4093     zSign = aExp < 0;
4094     zSig = (uint64_t)aExp << 52;
4095     for (i = 1LL << 51; i > 0; i >>= 1) {
4096         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4097         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4098         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4099             aSig >>= 1;
4100             zSig |= i;
4101         }
4102     }
4103 
4104     if ( zSign )
4105         zSig = -zSig;
4106     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4107 }
4108 
4109 /*----------------------------------------------------------------------------
4110 | Returns 1 if the double-precision floating-point value `a' is equal to the
4111 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4112 | if either operand is a NaN.  Otherwise, the comparison is performed
4113 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4114 *----------------------------------------------------------------------------*/
4115 
4116 int float64_eq(float64 a, float64 b, float_status *status)
4117 {
4118     uint64_t av, bv;
4119     a = float64_squash_input_denormal(a, status);
4120     b = float64_squash_input_denormal(b, status);
4121 
4122     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4123          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4124        ) {
4125         float_raise(float_flag_invalid, status);
4126         return 0;
4127     }
4128     av = float64_val(a);
4129     bv = float64_val(b);
4130     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4131 
4132 }
4133 
4134 /*----------------------------------------------------------------------------
4135 | Returns 1 if the double-precision floating-point value `a' is less than or
4136 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4137 | exception is raised if either operand is a NaN.  The comparison is performed
4138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4139 *----------------------------------------------------------------------------*/
4140 
4141 int float64_le(float64 a, float64 b, float_status *status)
4142 {
4143     flag aSign, bSign;
4144     uint64_t av, bv;
4145     a = float64_squash_input_denormal(a, status);
4146     b = float64_squash_input_denormal(b, status);
4147 
4148     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4149          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4150        ) {
4151         float_raise(float_flag_invalid, status);
4152         return 0;
4153     }
4154     aSign = extractFloat64Sign( a );
4155     bSign = extractFloat64Sign( b );
4156     av = float64_val(a);
4157     bv = float64_val(b);
4158     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4159     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4160 
4161 }
4162 
4163 /*----------------------------------------------------------------------------
4164 | Returns 1 if the double-precision floating-point value `a' is less than
4165 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4166 | raised if either operand is a NaN.  The comparison is performed according
4167 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4168 *----------------------------------------------------------------------------*/
4169 
4170 int float64_lt(float64 a, float64 b, float_status *status)
4171 {
4172     flag aSign, bSign;
4173     uint64_t av, bv;
4174 
4175     a = float64_squash_input_denormal(a, status);
4176     b = float64_squash_input_denormal(b, status);
4177     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4178          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4179        ) {
4180         float_raise(float_flag_invalid, status);
4181         return 0;
4182     }
4183     aSign = extractFloat64Sign( a );
4184     bSign = extractFloat64Sign( b );
4185     av = float64_val(a);
4186     bv = float64_val(b);
4187     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4188     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4189 
4190 }
4191 
4192 /*----------------------------------------------------------------------------
4193 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4194 | be compared, and 0 otherwise.  The invalid exception is raised if either
4195 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4196 | Standard for Binary Floating-Point Arithmetic.
4197 *----------------------------------------------------------------------------*/
4198 
4199 int float64_unordered(float64 a, float64 b, float_status *status)
4200 {
4201     a = float64_squash_input_denormal(a, status);
4202     b = float64_squash_input_denormal(b, status);
4203 
4204     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4205          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4206        ) {
4207         float_raise(float_flag_invalid, status);
4208         return 1;
4209     }
4210     return 0;
4211 }
4212 
4213 /*----------------------------------------------------------------------------
4214 | Returns 1 if the double-precision floating-point value `a' is equal to the
4215 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4216 | exception.The comparison is performed according to the IEC/IEEE Standard
4217 | for Binary Floating-Point Arithmetic.
4218 *----------------------------------------------------------------------------*/
4219 
4220 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4221 {
4222     uint64_t av, bv;
4223     a = float64_squash_input_denormal(a, status);
4224     b = float64_squash_input_denormal(b, status);
4225 
4226     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4227          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4228        ) {
4229         if (float64_is_signaling_nan(a, status)
4230          || float64_is_signaling_nan(b, status)) {
4231             float_raise(float_flag_invalid, status);
4232         }
4233         return 0;
4234     }
4235     av = float64_val(a);
4236     bv = float64_val(b);
4237     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4238 
4239 }
4240 
4241 /*----------------------------------------------------------------------------
4242 | Returns 1 if the double-precision floating-point value `a' is less than or
4243 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4244 | cause an exception.  Otherwise, the comparison is performed according to the
4245 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4246 *----------------------------------------------------------------------------*/
4247 
4248 int float64_le_quiet(float64 a, float64 b, float_status *status)
4249 {
4250     flag aSign, bSign;
4251     uint64_t av, bv;
4252     a = float64_squash_input_denormal(a, status);
4253     b = float64_squash_input_denormal(b, status);
4254 
4255     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4256          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4257        ) {
4258         if (float64_is_signaling_nan(a, status)
4259          || float64_is_signaling_nan(b, status)) {
4260             float_raise(float_flag_invalid, status);
4261         }
4262         return 0;
4263     }
4264     aSign = extractFloat64Sign( a );
4265     bSign = extractFloat64Sign( b );
4266     av = float64_val(a);
4267     bv = float64_val(b);
4268     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4269     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4270 
4271 }
4272 
4273 /*----------------------------------------------------------------------------
4274 | Returns 1 if the double-precision floating-point value `a' is less than
4275 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4276 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4277 | Standard for Binary Floating-Point Arithmetic.
4278 *----------------------------------------------------------------------------*/
4279 
4280 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4281 {
4282     flag aSign, bSign;
4283     uint64_t av, bv;
4284     a = float64_squash_input_denormal(a, status);
4285     b = float64_squash_input_denormal(b, status);
4286 
4287     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4288          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4289        ) {
4290         if (float64_is_signaling_nan(a, status)
4291          || float64_is_signaling_nan(b, status)) {
4292             float_raise(float_flag_invalid, status);
4293         }
4294         return 0;
4295     }
4296     aSign = extractFloat64Sign( a );
4297     bSign = extractFloat64Sign( b );
4298     av = float64_val(a);
4299     bv = float64_val(b);
4300     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4301     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4302 
4303 }
4304 
4305 /*----------------------------------------------------------------------------
4306 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4307 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4308 | comparison is performed according to the IEC/IEEE Standard for Binary
4309 | Floating-Point Arithmetic.
4310 *----------------------------------------------------------------------------*/
4311 
4312 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4313 {
4314     a = float64_squash_input_denormal(a, status);
4315     b = float64_squash_input_denormal(b, status);
4316 
4317     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4318          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4319        ) {
4320         if (float64_is_signaling_nan(a, status)
4321          || float64_is_signaling_nan(b, status)) {
4322             float_raise(float_flag_invalid, status);
4323         }
4324         return 1;
4325     }
4326     return 0;
4327 }
4328 
4329 /*----------------------------------------------------------------------------
4330 | Returns the result of converting the extended double-precision floating-
4331 | point value `a' to the 32-bit two's complement integer format.  The
4332 | conversion is performed according to the IEC/IEEE Standard for Binary
4333 | Floating-Point Arithmetic---which means in particular that the conversion
4334 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4335 | largest positive integer is returned.  Otherwise, if the conversion
4336 | overflows, the largest integer with the same sign as `a' is returned.
4337 *----------------------------------------------------------------------------*/
4338 
4339 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4340 {
4341     flag aSign;
4342     int32_t aExp, shiftCount;
4343     uint64_t aSig;
4344 
4345     if (floatx80_invalid_encoding(a)) {
4346         float_raise(float_flag_invalid, status);
4347         return 1 << 31;
4348     }
4349     aSig = extractFloatx80Frac( a );
4350     aExp = extractFloatx80Exp( a );
4351     aSign = extractFloatx80Sign( a );
4352     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4353     shiftCount = 0x4037 - aExp;
4354     if ( shiftCount <= 0 ) shiftCount = 1;
4355     shift64RightJamming( aSig, shiftCount, &aSig );
4356     return roundAndPackInt32(aSign, aSig, status);
4357 
4358 }
4359 
4360 /*----------------------------------------------------------------------------
4361 | Returns the result of converting the extended double-precision floating-
4362 | point value `a' to the 32-bit two's complement integer format.  The
4363 | conversion is performed according to the IEC/IEEE Standard for Binary
4364 | Floating-Point Arithmetic, except that the conversion is always rounded
4365 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4366 | Otherwise, if the conversion overflows, the largest integer with the same
4367 | sign as `a' is returned.
4368 *----------------------------------------------------------------------------*/
4369 
4370 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4371 {
4372     flag aSign;
4373     int32_t aExp, shiftCount;
4374     uint64_t aSig, savedASig;
4375     int32_t z;
4376 
4377     if (floatx80_invalid_encoding(a)) {
4378         float_raise(float_flag_invalid, status);
4379         return 1 << 31;
4380     }
4381     aSig = extractFloatx80Frac( a );
4382     aExp = extractFloatx80Exp( a );
4383     aSign = extractFloatx80Sign( a );
4384     if ( 0x401E < aExp ) {
4385         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4386         goto invalid;
4387     }
4388     else if ( aExp < 0x3FFF ) {
4389         if (aExp || aSig) {
4390             status->float_exception_flags |= float_flag_inexact;
4391         }
4392         return 0;
4393     }
4394     shiftCount = 0x403E - aExp;
4395     savedASig = aSig;
4396     aSig >>= shiftCount;
4397     z = aSig;
4398     if ( aSign ) z = - z;
4399     if ( ( z < 0 ) ^ aSign ) {
4400  invalid:
4401         float_raise(float_flag_invalid, status);
4402         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4403     }
4404     if ( ( aSig<<shiftCount ) != savedASig ) {
4405         status->float_exception_flags |= float_flag_inexact;
4406     }
4407     return z;
4408 
4409 }
4410 
4411 /*----------------------------------------------------------------------------
4412 | Returns the result of converting the extended double-precision floating-
4413 | point value `a' to the 64-bit two's complement integer format.  The
4414 | conversion is performed according to the IEC/IEEE Standard for Binary
4415 | Floating-Point Arithmetic---which means in particular that the conversion
4416 | is rounded according to the current rounding mode.  If `a' is a NaN,
4417 | the largest positive integer is returned.  Otherwise, if the conversion
4418 | overflows, the largest integer with the same sign as `a' is returned.
4419 *----------------------------------------------------------------------------*/
4420 
4421 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4422 {
4423     flag aSign;
4424     int32_t aExp, shiftCount;
4425     uint64_t aSig, aSigExtra;
4426 
4427     if (floatx80_invalid_encoding(a)) {
4428         float_raise(float_flag_invalid, status);
4429         return 1ULL << 63;
4430     }
4431     aSig = extractFloatx80Frac( a );
4432     aExp = extractFloatx80Exp( a );
4433     aSign = extractFloatx80Sign( a );
4434     shiftCount = 0x403E - aExp;
4435     if ( shiftCount <= 0 ) {
4436         if ( shiftCount ) {
4437             float_raise(float_flag_invalid, status);
4438             if (    ! aSign
4439                  || (    ( aExp == 0x7FFF )
4440                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
4441                ) {
4442                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4443             }
4444             return (int64_t) LIT64( 0x8000000000000000 );
4445         }
4446         aSigExtra = 0;
4447     }
4448     else {
4449         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4450     }
4451     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4452 
4453 }
4454 
4455 /*----------------------------------------------------------------------------
4456 | Returns the result of converting the extended double-precision floating-
4457 | point value `a' to the 64-bit two's complement integer format.  The
4458 | conversion is performed according to the IEC/IEEE Standard for Binary
4459 | Floating-Point Arithmetic, except that the conversion is always rounded
4460 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4461 | Otherwise, if the conversion overflows, the largest integer with the same
4462 | sign as `a' is returned.
4463 *----------------------------------------------------------------------------*/
4464 
4465 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4466 {
4467     flag aSign;
4468     int32_t aExp, shiftCount;
4469     uint64_t aSig;
4470     int64_t z;
4471 
4472     if (floatx80_invalid_encoding(a)) {
4473         float_raise(float_flag_invalid, status);
4474         return 1ULL << 63;
4475     }
4476     aSig = extractFloatx80Frac( a );
4477     aExp = extractFloatx80Exp( a );
4478     aSign = extractFloatx80Sign( a );
4479     shiftCount = aExp - 0x403E;
4480     if ( 0 <= shiftCount ) {
4481         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4482         if ( ( a.high != 0xC03E ) || aSig ) {
4483             float_raise(float_flag_invalid, status);
4484             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4485                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4486             }
4487         }
4488         return (int64_t) LIT64( 0x8000000000000000 );
4489     }
4490     else if ( aExp < 0x3FFF ) {
4491         if (aExp | aSig) {
4492             status->float_exception_flags |= float_flag_inexact;
4493         }
4494         return 0;
4495     }
4496     z = aSig>>( - shiftCount );
4497     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4498         status->float_exception_flags |= float_flag_inexact;
4499     }
4500     if ( aSign ) z = - z;
4501     return z;
4502 
4503 }
4504 
4505 /*----------------------------------------------------------------------------
4506 | Returns the result of converting the extended double-precision floating-
4507 | point value `a' to the single-precision floating-point format.  The
4508 | conversion is performed according to the IEC/IEEE Standard for Binary
4509 | Floating-Point Arithmetic.
4510 *----------------------------------------------------------------------------*/
4511 
4512 float32 floatx80_to_float32(floatx80 a, float_status *status)
4513 {
4514     flag aSign;
4515     int32_t aExp;
4516     uint64_t aSig;
4517 
4518     if (floatx80_invalid_encoding(a)) {
4519         float_raise(float_flag_invalid, status);
4520         return float32_default_nan(status);
4521     }
4522     aSig = extractFloatx80Frac( a );
4523     aExp = extractFloatx80Exp( a );
4524     aSign = extractFloatx80Sign( a );
4525     if ( aExp == 0x7FFF ) {
4526         if ( (uint64_t) ( aSig<<1 ) ) {
4527             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4528         }
4529         return packFloat32( aSign, 0xFF, 0 );
4530     }
4531     shift64RightJamming( aSig, 33, &aSig );
4532     if ( aExp || aSig ) aExp -= 0x3F81;
4533     return roundAndPackFloat32(aSign, aExp, aSig, status);
4534 
4535 }
4536 
4537 /*----------------------------------------------------------------------------
4538 | Returns the result of converting the extended double-precision floating-
4539 | point value `a' to the double-precision floating-point format.  The
4540 | conversion is performed according to the IEC/IEEE Standard for Binary
4541 | Floating-Point Arithmetic.
4542 *----------------------------------------------------------------------------*/
4543 
4544 float64 floatx80_to_float64(floatx80 a, float_status *status)
4545 {
4546     flag aSign;
4547     int32_t aExp;
4548     uint64_t aSig, zSig;
4549 
4550     if (floatx80_invalid_encoding(a)) {
4551         float_raise(float_flag_invalid, status);
4552         return float64_default_nan(status);
4553     }
4554     aSig = extractFloatx80Frac( a );
4555     aExp = extractFloatx80Exp( a );
4556     aSign = extractFloatx80Sign( a );
4557     if ( aExp == 0x7FFF ) {
4558         if ( (uint64_t) ( aSig<<1 ) ) {
4559             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4560         }
4561         return packFloat64( aSign, 0x7FF, 0 );
4562     }
4563     shift64RightJamming( aSig, 1, &zSig );
4564     if ( aExp || aSig ) aExp -= 0x3C01;
4565     return roundAndPackFloat64(aSign, aExp, zSig, status);
4566 
4567 }
4568 
4569 /*----------------------------------------------------------------------------
4570 | Returns the result of converting the extended double-precision floating-
4571 | point value `a' to the quadruple-precision floating-point format.  The
4572 | conversion is performed according to the IEC/IEEE Standard for Binary
4573 | Floating-Point Arithmetic.
4574 *----------------------------------------------------------------------------*/
4575 
4576 float128 floatx80_to_float128(floatx80 a, float_status *status)
4577 {
4578     flag aSign;
4579     int aExp;
4580     uint64_t aSig, zSig0, zSig1;
4581 
4582     if (floatx80_invalid_encoding(a)) {
4583         float_raise(float_flag_invalid, status);
4584         return float128_default_nan(status);
4585     }
4586     aSig = extractFloatx80Frac( a );
4587     aExp = extractFloatx80Exp( a );
4588     aSign = extractFloatx80Sign( a );
4589     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4590         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4591     }
4592     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4593     return packFloat128( aSign, aExp, zSig0, zSig1 );
4594 
4595 }
4596 
4597 /*----------------------------------------------------------------------------
4598 | Rounds the extended double-precision floating-point value `a'
4599 | to the precision provided by floatx80_rounding_precision and returns the
4600 | result as an extended double-precision floating-point value.
4601 | The operation is performed according to the IEC/IEEE Standard for Binary
4602 | Floating-Point Arithmetic.
4603 *----------------------------------------------------------------------------*/
4604 
4605 floatx80 floatx80_round(floatx80 a, float_status *status)
4606 {
4607     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4608                                 extractFloatx80Sign(a),
4609                                 extractFloatx80Exp(a),
4610                                 extractFloatx80Frac(a), 0, status);
4611 }
4612 
4613 /*----------------------------------------------------------------------------
4614 | Rounds the extended double-precision floating-point value `a' to an integer,
4615 | and returns the result as an extended quadruple-precision floating-point
4616 | value.  The operation is performed according to the IEC/IEEE Standard for
4617 | Binary Floating-Point Arithmetic.
4618 *----------------------------------------------------------------------------*/
4619 
4620 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4621 {
4622     flag aSign;
4623     int32_t aExp;
4624     uint64_t lastBitMask, roundBitsMask;
4625     floatx80 z;
4626 
4627     if (floatx80_invalid_encoding(a)) {
4628         float_raise(float_flag_invalid, status);
4629         return floatx80_default_nan(status);
4630     }
4631     aExp = extractFloatx80Exp( a );
4632     if ( 0x403E <= aExp ) {
4633         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4634             return propagateFloatx80NaN(a, a, status);
4635         }
4636         return a;
4637     }
4638     if ( aExp < 0x3FFF ) {
4639         if (    ( aExp == 0 )
4640              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4641             return a;
4642         }
4643         status->float_exception_flags |= float_flag_inexact;
4644         aSign = extractFloatx80Sign( a );
4645         switch (status->float_rounding_mode) {
4646          case float_round_nearest_even:
4647             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4648                ) {
4649                 return
4650                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4651             }
4652             break;
4653         case float_round_ties_away:
4654             if (aExp == 0x3FFE) {
4655                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4656             }
4657             break;
4658          case float_round_down:
4659             return
4660                   aSign ?
4661                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4662                 : packFloatx80( 0, 0, 0 );
4663          case float_round_up:
4664             return
4665                   aSign ? packFloatx80( 1, 0, 0 )
4666                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4667         }
4668         return packFloatx80( aSign, 0, 0 );
4669     }
4670     lastBitMask = 1;
4671     lastBitMask <<= 0x403E - aExp;
4672     roundBitsMask = lastBitMask - 1;
4673     z = a;
4674     switch (status->float_rounding_mode) {
4675     case float_round_nearest_even:
4676         z.low += lastBitMask>>1;
4677         if ((z.low & roundBitsMask) == 0) {
4678             z.low &= ~lastBitMask;
4679         }
4680         break;
4681     case float_round_ties_away:
4682         z.low += lastBitMask >> 1;
4683         break;
4684     case float_round_to_zero:
4685         break;
4686     case float_round_up:
4687         if (!extractFloatx80Sign(z)) {
4688             z.low += roundBitsMask;
4689         }
4690         break;
4691     case float_round_down:
4692         if (extractFloatx80Sign(z)) {
4693             z.low += roundBitsMask;
4694         }
4695         break;
4696     default:
4697         abort();
4698     }
4699     z.low &= ~ roundBitsMask;
4700     if ( z.low == 0 ) {
4701         ++z.high;
4702         z.low = LIT64( 0x8000000000000000 );
4703     }
4704     if (z.low != a.low) {
4705         status->float_exception_flags |= float_flag_inexact;
4706     }
4707     return z;
4708 
4709 }
4710 
4711 /*----------------------------------------------------------------------------
4712 | Returns the result of adding the absolute values of the extended double-
4713 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4714 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4715 | The addition is performed according to the IEC/IEEE Standard for Binary
4716 | Floating-Point Arithmetic.
4717 *----------------------------------------------------------------------------*/
4718 
4719 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4720                                 float_status *status)
4721 {
4722     int32_t aExp, bExp, zExp;
4723     uint64_t aSig, bSig, zSig0, zSig1;
4724     int32_t expDiff;
4725 
4726     aSig = extractFloatx80Frac( a );
4727     aExp = extractFloatx80Exp( a );
4728     bSig = extractFloatx80Frac( b );
4729     bExp = extractFloatx80Exp( b );
4730     expDiff = aExp - bExp;
4731     if ( 0 < expDiff ) {
4732         if ( aExp == 0x7FFF ) {
4733             if ((uint64_t)(aSig << 1)) {
4734                 return propagateFloatx80NaN(a, b, status);
4735             }
4736             return a;
4737         }
4738         if ( bExp == 0 ) --expDiff;
4739         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4740         zExp = aExp;
4741     }
4742     else if ( expDiff < 0 ) {
4743         if ( bExp == 0x7FFF ) {
4744             if ((uint64_t)(bSig << 1)) {
4745                 return propagateFloatx80NaN(a, b, status);
4746             }
4747             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4748         }
4749         if ( aExp == 0 ) ++expDiff;
4750         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4751         zExp = bExp;
4752     }
4753     else {
4754         if ( aExp == 0x7FFF ) {
4755             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4756                 return propagateFloatx80NaN(a, b, status);
4757             }
4758             return a;
4759         }
4760         zSig1 = 0;
4761         zSig0 = aSig + bSig;
4762         if ( aExp == 0 ) {
4763             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4764             goto roundAndPack;
4765         }
4766         zExp = aExp;
4767         goto shiftRight1;
4768     }
4769     zSig0 = aSig + bSig;
4770     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4771  shiftRight1:
4772     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4773     zSig0 |= LIT64( 0x8000000000000000 );
4774     ++zExp;
4775  roundAndPack:
4776     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4777                                 zSign, zExp, zSig0, zSig1, status);
4778 }
4779 
4780 /*----------------------------------------------------------------------------
4781 | Returns the result of subtracting the absolute values of the extended
4782 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4783 | difference is negated before being returned.  `zSign' is ignored if the
4784 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4785 | Standard for Binary Floating-Point Arithmetic.
4786 *----------------------------------------------------------------------------*/
4787 
4788 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4789                                 float_status *status)
4790 {
4791     int32_t aExp, bExp, zExp;
4792     uint64_t aSig, bSig, zSig0, zSig1;
4793     int32_t expDiff;
4794 
4795     aSig = extractFloatx80Frac( a );
4796     aExp = extractFloatx80Exp( a );
4797     bSig = extractFloatx80Frac( b );
4798     bExp = extractFloatx80Exp( b );
4799     expDiff = aExp - bExp;
4800     if ( 0 < expDiff ) goto aExpBigger;
4801     if ( expDiff < 0 ) goto bExpBigger;
4802     if ( aExp == 0x7FFF ) {
4803         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4804             return propagateFloatx80NaN(a, b, status);
4805         }
4806         float_raise(float_flag_invalid, status);
4807         return floatx80_default_nan(status);
4808     }
4809     if ( aExp == 0 ) {
4810         aExp = 1;
4811         bExp = 1;
4812     }
4813     zSig1 = 0;
4814     if ( bSig < aSig ) goto aBigger;
4815     if ( aSig < bSig ) goto bBigger;
4816     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
4817  bExpBigger:
4818     if ( bExp == 0x7FFF ) {
4819         if ((uint64_t)(bSig << 1)) {
4820             return propagateFloatx80NaN(a, b, status);
4821         }
4822         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4823     }
4824     if ( aExp == 0 ) ++expDiff;
4825     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4826  bBigger:
4827     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4828     zExp = bExp;
4829     zSign ^= 1;
4830     goto normalizeRoundAndPack;
4831  aExpBigger:
4832     if ( aExp == 0x7FFF ) {
4833         if ((uint64_t)(aSig << 1)) {
4834             return propagateFloatx80NaN(a, b, status);
4835         }
4836         return a;
4837     }
4838     if ( bExp == 0 ) --expDiff;
4839     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4840  aBigger:
4841     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4842     zExp = aExp;
4843  normalizeRoundAndPack:
4844     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
4845                                          zSign, zExp, zSig0, zSig1, status);
4846 }
4847 
4848 /*----------------------------------------------------------------------------
4849 | Returns the result of adding the extended double-precision floating-point
4850 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4851 | Standard for Binary Floating-Point Arithmetic.
4852 *----------------------------------------------------------------------------*/
4853 
4854 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
4855 {
4856     flag aSign, bSign;
4857 
4858     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4859         float_raise(float_flag_invalid, status);
4860         return floatx80_default_nan(status);
4861     }
4862     aSign = extractFloatx80Sign( a );
4863     bSign = extractFloatx80Sign( b );
4864     if ( aSign == bSign ) {
4865         return addFloatx80Sigs(a, b, aSign, status);
4866     }
4867     else {
4868         return subFloatx80Sigs(a, b, aSign, status);
4869     }
4870 
4871 }
4872 
4873 /*----------------------------------------------------------------------------
4874 | Returns the result of subtracting the extended double-precision floating-
4875 | point values `a' and `b'.  The operation is performed according to the
4876 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4877 *----------------------------------------------------------------------------*/
4878 
4879 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
4880 {
4881     flag aSign, bSign;
4882 
4883     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4884         float_raise(float_flag_invalid, status);
4885         return floatx80_default_nan(status);
4886     }
4887     aSign = extractFloatx80Sign( a );
4888     bSign = extractFloatx80Sign( b );
4889     if ( aSign == bSign ) {
4890         return subFloatx80Sigs(a, b, aSign, status);
4891     }
4892     else {
4893         return addFloatx80Sigs(a, b, aSign, status);
4894     }
4895 
4896 }
4897 
4898 /*----------------------------------------------------------------------------
4899 | Returns the result of multiplying the extended double-precision floating-
4900 | point values `a' and `b'.  The operation is performed according to the
4901 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4902 *----------------------------------------------------------------------------*/
4903 
4904 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
4905 {
4906     flag aSign, bSign, zSign;
4907     int32_t aExp, bExp, zExp;
4908     uint64_t aSig, bSig, zSig0, zSig1;
4909 
4910     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4911         float_raise(float_flag_invalid, status);
4912         return floatx80_default_nan(status);
4913     }
4914     aSig = extractFloatx80Frac( a );
4915     aExp = extractFloatx80Exp( a );
4916     aSign = extractFloatx80Sign( a );
4917     bSig = extractFloatx80Frac( b );
4918     bExp = extractFloatx80Exp( b );
4919     bSign = extractFloatx80Sign( b );
4920     zSign = aSign ^ bSign;
4921     if ( aExp == 0x7FFF ) {
4922         if (    (uint64_t) ( aSig<<1 )
4923              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4924             return propagateFloatx80NaN(a, b, status);
4925         }
4926         if ( ( bExp | bSig ) == 0 ) goto invalid;
4927         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4928     }
4929     if ( bExp == 0x7FFF ) {
4930         if ((uint64_t)(bSig << 1)) {
4931             return propagateFloatx80NaN(a, b, status);
4932         }
4933         if ( ( aExp | aSig ) == 0 ) {
4934  invalid:
4935             float_raise(float_flag_invalid, status);
4936             return floatx80_default_nan(status);
4937         }
4938         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4939     }
4940     if ( aExp == 0 ) {
4941         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4942         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4943     }
4944     if ( bExp == 0 ) {
4945         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4946         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4947     }
4948     zExp = aExp + bExp - 0x3FFE;
4949     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4950     if ( 0 < (int64_t) zSig0 ) {
4951         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4952         --zExp;
4953     }
4954     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4955                                 zSign, zExp, zSig0, zSig1, status);
4956 }
4957 
4958 /*----------------------------------------------------------------------------
4959 | Returns the result of dividing the extended double-precision floating-point
4960 | value `a' by the corresponding value `b'.  The operation is performed
4961 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4962 *----------------------------------------------------------------------------*/
4963 
4964 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
4965 {
4966     flag aSign, bSign, zSign;
4967     int32_t aExp, bExp, zExp;
4968     uint64_t aSig, bSig, zSig0, zSig1;
4969     uint64_t rem0, rem1, rem2, term0, term1, term2;
4970 
4971     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4972         float_raise(float_flag_invalid, status);
4973         return floatx80_default_nan(status);
4974     }
4975     aSig = extractFloatx80Frac( a );
4976     aExp = extractFloatx80Exp( a );
4977     aSign = extractFloatx80Sign( a );
4978     bSig = extractFloatx80Frac( b );
4979     bExp = extractFloatx80Exp( b );
4980     bSign = extractFloatx80Sign( b );
4981     zSign = aSign ^ bSign;
4982     if ( aExp == 0x7FFF ) {
4983         if ((uint64_t)(aSig << 1)) {
4984             return propagateFloatx80NaN(a, b, status);
4985         }
4986         if ( bExp == 0x7FFF ) {
4987             if ((uint64_t)(bSig << 1)) {
4988                 return propagateFloatx80NaN(a, b, status);
4989             }
4990             goto invalid;
4991         }
4992         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4993     }
4994     if ( bExp == 0x7FFF ) {
4995         if ((uint64_t)(bSig << 1)) {
4996             return propagateFloatx80NaN(a, b, status);
4997         }
4998         return packFloatx80( zSign, 0, 0 );
4999     }
5000     if ( bExp == 0 ) {
5001         if ( bSig == 0 ) {
5002             if ( ( aExp | aSig ) == 0 ) {
5003  invalid:
5004                 float_raise(float_flag_invalid, status);
5005                 return floatx80_default_nan(status);
5006             }
5007             float_raise(float_flag_divbyzero, status);
5008             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5009         }
5010         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5011     }
5012     if ( aExp == 0 ) {
5013         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5014         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5015     }
5016     zExp = aExp - bExp + 0x3FFE;
5017     rem1 = 0;
5018     if ( bSig <= aSig ) {
5019         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5020         ++zExp;
5021     }
5022     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5023     mul64To128( bSig, zSig0, &term0, &term1 );
5024     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5025     while ( (int64_t) rem0 < 0 ) {
5026         --zSig0;
5027         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5028     }
5029     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5030     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5031         mul64To128( bSig, zSig1, &term1, &term2 );
5032         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5033         while ( (int64_t) rem1 < 0 ) {
5034             --zSig1;
5035             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5036         }
5037         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5038     }
5039     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5040                                 zSign, zExp, zSig0, zSig1, status);
5041 }
5042 
5043 /*----------------------------------------------------------------------------
5044 | Returns the remainder of the extended double-precision floating-point value
5045 | `a' with respect to the corresponding value `b'.  The operation is performed
5046 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5047 *----------------------------------------------------------------------------*/
5048 
5049 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5050 {
5051     flag aSign, zSign;
5052     int32_t aExp, bExp, expDiff;
5053     uint64_t aSig0, aSig1, bSig;
5054     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5055 
5056     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5057         float_raise(float_flag_invalid, status);
5058         return floatx80_default_nan(status);
5059     }
5060     aSig0 = extractFloatx80Frac( a );
5061     aExp = extractFloatx80Exp( a );
5062     aSign = extractFloatx80Sign( a );
5063     bSig = extractFloatx80Frac( b );
5064     bExp = extractFloatx80Exp( b );
5065     if ( aExp == 0x7FFF ) {
5066         if (    (uint64_t) ( aSig0<<1 )
5067              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5068             return propagateFloatx80NaN(a, b, status);
5069         }
5070         goto invalid;
5071     }
5072     if ( bExp == 0x7FFF ) {
5073         if ((uint64_t)(bSig << 1)) {
5074             return propagateFloatx80NaN(a, b, status);
5075         }
5076         return a;
5077     }
5078     if ( bExp == 0 ) {
5079         if ( bSig == 0 ) {
5080  invalid:
5081             float_raise(float_flag_invalid, status);
5082             return floatx80_default_nan(status);
5083         }
5084         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5085     }
5086     if ( aExp == 0 ) {
5087         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5088         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5089     }
5090     bSig |= LIT64( 0x8000000000000000 );
5091     zSign = aSign;
5092     expDiff = aExp - bExp;
5093     aSig1 = 0;
5094     if ( expDiff < 0 ) {
5095         if ( expDiff < -1 ) return a;
5096         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5097         expDiff = 0;
5098     }
5099     q = ( bSig <= aSig0 );
5100     if ( q ) aSig0 -= bSig;
5101     expDiff -= 64;
5102     while ( 0 < expDiff ) {
5103         q = estimateDiv128To64( aSig0, aSig1, bSig );
5104         q = ( 2 < q ) ? q - 2 : 0;
5105         mul64To128( bSig, q, &term0, &term1 );
5106         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5107         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5108         expDiff -= 62;
5109     }
5110     expDiff += 64;
5111     if ( 0 < expDiff ) {
5112         q = estimateDiv128To64( aSig0, aSig1, bSig );
5113         q = ( 2 < q ) ? q - 2 : 0;
5114         q >>= 64 - expDiff;
5115         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5116         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5117         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5118         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5119             ++q;
5120             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5121         }
5122     }
5123     else {
5124         term1 = 0;
5125         term0 = bSig;
5126     }
5127     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5128     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5129          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5130               && ( q & 1 ) )
5131        ) {
5132         aSig0 = alternateASig0;
5133         aSig1 = alternateASig1;
5134         zSign = ! zSign;
5135     }
5136     return
5137         normalizeRoundAndPackFloatx80(
5138             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5139 
5140 }
5141 
5142 /*----------------------------------------------------------------------------
5143 | Returns the square root of the extended double-precision floating-point
5144 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5145 | for Binary Floating-Point Arithmetic.
5146 *----------------------------------------------------------------------------*/
5147 
5148 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5149 {
5150     flag aSign;
5151     int32_t aExp, zExp;
5152     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5153     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5154 
5155     if (floatx80_invalid_encoding(a)) {
5156         float_raise(float_flag_invalid, status);
5157         return floatx80_default_nan(status);
5158     }
5159     aSig0 = extractFloatx80Frac( a );
5160     aExp = extractFloatx80Exp( a );
5161     aSign = extractFloatx80Sign( a );
5162     if ( aExp == 0x7FFF ) {
5163         if ((uint64_t)(aSig0 << 1)) {
5164             return propagateFloatx80NaN(a, a, status);
5165         }
5166         if ( ! aSign ) return a;
5167         goto invalid;
5168     }
5169     if ( aSign ) {
5170         if ( ( aExp | aSig0 ) == 0 ) return a;
5171  invalid:
5172         float_raise(float_flag_invalid, status);
5173         return floatx80_default_nan(status);
5174     }
5175     if ( aExp == 0 ) {
5176         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5177         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5178     }
5179     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5180     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5181     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5182     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5183     doubleZSig0 = zSig0<<1;
5184     mul64To128( zSig0, zSig0, &term0, &term1 );
5185     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5186     while ( (int64_t) rem0 < 0 ) {
5187         --zSig0;
5188         doubleZSig0 -= 2;
5189         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5190     }
5191     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5192     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5193         if ( zSig1 == 0 ) zSig1 = 1;
5194         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5195         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5196         mul64To128( zSig1, zSig1, &term2, &term3 );
5197         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5198         while ( (int64_t) rem1 < 0 ) {
5199             --zSig1;
5200             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5201             term3 |= 1;
5202             term2 |= doubleZSig0;
5203             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5204         }
5205         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5206     }
5207     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5208     zSig0 |= doubleZSig0;
5209     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5210                                 0, zExp, zSig0, zSig1, status);
5211 }
5212 
5213 /*----------------------------------------------------------------------------
5214 | Returns 1 if the extended double-precision floating-point value `a' is equal
5215 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5216 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5217 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5218 *----------------------------------------------------------------------------*/
5219 
5220 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5221 {
5222 
5223     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5224         || (extractFloatx80Exp(a) == 0x7FFF
5225             && (uint64_t) (extractFloatx80Frac(a) << 1))
5226         || (extractFloatx80Exp(b) == 0x7FFF
5227             && (uint64_t) (extractFloatx80Frac(b) << 1))
5228        ) {
5229         float_raise(float_flag_invalid, status);
5230         return 0;
5231     }
5232     return
5233            ( a.low == b.low )
5234         && (    ( a.high == b.high )
5235              || (    ( a.low == 0 )
5236                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5237            );
5238 
5239 }
5240 
5241 /*----------------------------------------------------------------------------
5242 | Returns 1 if the extended double-precision floating-point value `a' is
5243 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5244 | invalid exception is raised if either operand is a NaN.  The comparison is
5245 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5246 | Arithmetic.
5247 *----------------------------------------------------------------------------*/
5248 
5249 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5250 {
5251     flag aSign, bSign;
5252 
5253     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5254         || (extractFloatx80Exp(a) == 0x7FFF
5255             && (uint64_t) (extractFloatx80Frac(a) << 1))
5256         || (extractFloatx80Exp(b) == 0x7FFF
5257             && (uint64_t) (extractFloatx80Frac(b) << 1))
5258        ) {
5259         float_raise(float_flag_invalid, status);
5260         return 0;
5261     }
5262     aSign = extractFloatx80Sign( a );
5263     bSign = extractFloatx80Sign( b );
5264     if ( aSign != bSign ) {
5265         return
5266                aSign
5267             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5268                  == 0 );
5269     }
5270     return
5271           aSign ? le128( b.high, b.low, a.high, a.low )
5272         : le128( a.high, a.low, b.high, b.low );
5273 
5274 }
5275 
5276 /*----------------------------------------------------------------------------
5277 | Returns 1 if the extended double-precision floating-point value `a' is
5278 | less than the corresponding value `b', and 0 otherwise.  The invalid
5279 | exception is raised if either operand is a NaN.  The comparison is performed
5280 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5281 *----------------------------------------------------------------------------*/
5282 
5283 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5284 {
5285     flag aSign, bSign;
5286 
5287     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5288         || (extractFloatx80Exp(a) == 0x7FFF
5289             && (uint64_t) (extractFloatx80Frac(a) << 1))
5290         || (extractFloatx80Exp(b) == 0x7FFF
5291             && (uint64_t) (extractFloatx80Frac(b) << 1))
5292        ) {
5293         float_raise(float_flag_invalid, status);
5294         return 0;
5295     }
5296     aSign = extractFloatx80Sign( a );
5297     bSign = extractFloatx80Sign( b );
5298     if ( aSign != bSign ) {
5299         return
5300                aSign
5301             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5302                  != 0 );
5303     }
5304     return
5305           aSign ? lt128( b.high, b.low, a.high, a.low )
5306         : lt128( a.high, a.low, b.high, b.low );
5307 
5308 }
5309 
5310 /*----------------------------------------------------------------------------
5311 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5312 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5313 | either operand is a NaN.   The comparison is performed according to the
5314 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5315 *----------------------------------------------------------------------------*/
5316 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5317 {
5318     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5319         || (extractFloatx80Exp(a) == 0x7FFF
5320             && (uint64_t) (extractFloatx80Frac(a) << 1))
5321         || (extractFloatx80Exp(b) == 0x7FFF
5322             && (uint64_t) (extractFloatx80Frac(b) << 1))
5323        ) {
5324         float_raise(float_flag_invalid, status);
5325         return 1;
5326     }
5327     return 0;
5328 }
5329 
5330 /*----------------------------------------------------------------------------
5331 | Returns 1 if the extended double-precision floating-point value `a' is
5332 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5333 | cause an exception.  The comparison is performed according to the IEC/IEEE
5334 | Standard for Binary Floating-Point Arithmetic.
5335 *----------------------------------------------------------------------------*/
5336 
5337 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5338 {
5339 
5340     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5341         float_raise(float_flag_invalid, status);
5342         return 0;
5343     }
5344     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5345               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5346          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5347               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5348        ) {
5349         if (floatx80_is_signaling_nan(a, status)
5350          || floatx80_is_signaling_nan(b, status)) {
5351             float_raise(float_flag_invalid, status);
5352         }
5353         return 0;
5354     }
5355     return
5356            ( a.low == b.low )
5357         && (    ( a.high == b.high )
5358              || (    ( a.low == 0 )
5359                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5360            );
5361 
5362 }
5363 
5364 /*----------------------------------------------------------------------------
5365 | Returns 1 if the extended double-precision floating-point value `a' is less
5366 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5367 | do not cause an exception.  Otherwise, the comparison is performed according
5368 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5369 *----------------------------------------------------------------------------*/
5370 
5371 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5372 {
5373     flag aSign, bSign;
5374 
5375     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5376         float_raise(float_flag_invalid, status);
5377         return 0;
5378     }
5379     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5380               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5381          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5382               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5383        ) {
5384         if (floatx80_is_signaling_nan(a, status)
5385          || floatx80_is_signaling_nan(b, status)) {
5386             float_raise(float_flag_invalid, status);
5387         }
5388         return 0;
5389     }
5390     aSign = extractFloatx80Sign( a );
5391     bSign = extractFloatx80Sign( b );
5392     if ( aSign != bSign ) {
5393         return
5394                aSign
5395             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5396                  == 0 );
5397     }
5398     return
5399           aSign ? le128( b.high, b.low, a.high, a.low )
5400         : le128( a.high, a.low, b.high, b.low );
5401 
5402 }
5403 
5404 /*----------------------------------------------------------------------------
5405 | Returns 1 if the extended double-precision floating-point value `a' is less
5406 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5407 | an exception.  Otherwise, the comparison is performed according to the
5408 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5409 *----------------------------------------------------------------------------*/
5410 
5411 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5412 {
5413     flag aSign, bSign;
5414 
5415     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5416         float_raise(float_flag_invalid, status);
5417         return 0;
5418     }
5419     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5420               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5421          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5422               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5423        ) {
5424         if (floatx80_is_signaling_nan(a, status)
5425          || floatx80_is_signaling_nan(b, status)) {
5426             float_raise(float_flag_invalid, status);
5427         }
5428         return 0;
5429     }
5430     aSign = extractFloatx80Sign( a );
5431     bSign = extractFloatx80Sign( b );
5432     if ( aSign != bSign ) {
5433         return
5434                aSign
5435             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5436                  != 0 );
5437     }
5438     return
5439           aSign ? lt128( b.high, b.low, a.high, a.low )
5440         : lt128( a.high, a.low, b.high, b.low );
5441 
5442 }
5443 
5444 /*----------------------------------------------------------------------------
5445 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5446 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5447 | The comparison is performed according to the IEC/IEEE Standard for Binary
5448 | Floating-Point Arithmetic.
5449 *----------------------------------------------------------------------------*/
5450 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5451 {
5452     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5453         float_raise(float_flag_invalid, status);
5454         return 1;
5455     }
5456     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5457               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5458          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5459               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5460        ) {
5461         if (floatx80_is_signaling_nan(a, status)
5462          || floatx80_is_signaling_nan(b, status)) {
5463             float_raise(float_flag_invalid, status);
5464         }
5465         return 1;
5466     }
5467     return 0;
5468 }
5469 
5470 /*----------------------------------------------------------------------------
5471 | Returns the result of converting the quadruple-precision floating-point
5472 | value `a' to the 32-bit two's complement integer format.  The conversion
5473 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5474 | Arithmetic---which means in particular that the conversion is rounded
5475 | according to the current rounding mode.  If `a' is a NaN, the largest
5476 | positive integer is returned.  Otherwise, if the conversion overflows, the
5477 | largest integer with the same sign as `a' is returned.
5478 *----------------------------------------------------------------------------*/
5479 
5480 int32_t float128_to_int32(float128 a, float_status *status)
5481 {
5482     flag aSign;
5483     int32_t aExp, shiftCount;
5484     uint64_t aSig0, aSig1;
5485 
5486     aSig1 = extractFloat128Frac1( a );
5487     aSig0 = extractFloat128Frac0( a );
5488     aExp = extractFloat128Exp( a );
5489     aSign = extractFloat128Sign( a );
5490     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5491     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5492     aSig0 |= ( aSig1 != 0 );
5493     shiftCount = 0x4028 - aExp;
5494     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5495     return roundAndPackInt32(aSign, aSig0, status);
5496 
5497 }
5498 
5499 /*----------------------------------------------------------------------------
5500 | Returns the result of converting the quadruple-precision floating-point
5501 | value `a' to the 32-bit two's complement integer format.  The conversion
5502 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5503 | Arithmetic, except that the conversion is always rounded toward zero.  If
5504 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5505 | conversion overflows, the largest integer with the same sign as `a' is
5506 | returned.
5507 *----------------------------------------------------------------------------*/
5508 
5509 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5510 {
5511     flag aSign;
5512     int32_t aExp, shiftCount;
5513     uint64_t aSig0, aSig1, savedASig;
5514     int32_t z;
5515 
5516     aSig1 = extractFloat128Frac1( a );
5517     aSig0 = extractFloat128Frac0( a );
5518     aExp = extractFloat128Exp( a );
5519     aSign = extractFloat128Sign( a );
5520     aSig0 |= ( aSig1 != 0 );
5521     if ( 0x401E < aExp ) {
5522         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5523         goto invalid;
5524     }
5525     else if ( aExp < 0x3FFF ) {
5526         if (aExp || aSig0) {
5527             status->float_exception_flags |= float_flag_inexact;
5528         }
5529         return 0;
5530     }
5531     aSig0 |= LIT64( 0x0001000000000000 );
5532     shiftCount = 0x402F - aExp;
5533     savedASig = aSig0;
5534     aSig0 >>= shiftCount;
5535     z = aSig0;
5536     if ( aSign ) z = - z;
5537     if ( ( z < 0 ) ^ aSign ) {
5538  invalid:
5539         float_raise(float_flag_invalid, status);
5540         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5541     }
5542     if ( ( aSig0<<shiftCount ) != savedASig ) {
5543         status->float_exception_flags |= float_flag_inexact;
5544     }
5545     return z;
5546 
5547 }
5548 
5549 /*----------------------------------------------------------------------------
5550 | Returns the result of converting the quadruple-precision floating-point
5551 | value `a' to the 64-bit two's complement integer format.  The conversion
5552 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5553 | Arithmetic---which means in particular that the conversion is rounded
5554 | according to the current rounding mode.  If `a' is a NaN, the largest
5555 | positive integer is returned.  Otherwise, if the conversion overflows, the
5556 | largest integer with the same sign as `a' is returned.
5557 *----------------------------------------------------------------------------*/
5558 
5559 int64_t float128_to_int64(float128 a, float_status *status)
5560 {
5561     flag aSign;
5562     int32_t aExp, shiftCount;
5563     uint64_t aSig0, aSig1;
5564 
5565     aSig1 = extractFloat128Frac1( a );
5566     aSig0 = extractFloat128Frac0( a );
5567     aExp = extractFloat128Exp( a );
5568     aSign = extractFloat128Sign( a );
5569     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5570     shiftCount = 0x402F - aExp;
5571     if ( shiftCount <= 0 ) {
5572         if ( 0x403E < aExp ) {
5573             float_raise(float_flag_invalid, status);
5574             if (    ! aSign
5575                  || (    ( aExp == 0x7FFF )
5576                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5577                     )
5578                ) {
5579                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5580             }
5581             return (int64_t) LIT64( 0x8000000000000000 );
5582         }
5583         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5584     }
5585     else {
5586         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5587     }
5588     return roundAndPackInt64(aSign, aSig0, aSig1, status);
5589 
5590 }
5591 
5592 /*----------------------------------------------------------------------------
5593 | Returns the result of converting the quadruple-precision floating-point
5594 | value `a' to the 64-bit two's complement integer format.  The conversion
5595 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5596 | Arithmetic, except that the conversion is always rounded toward zero.
5597 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5598 | the conversion overflows, the largest integer with the same sign as `a' is
5599 | returned.
5600 *----------------------------------------------------------------------------*/
5601 
5602 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5603 {
5604     flag aSign;
5605     int32_t aExp, shiftCount;
5606     uint64_t aSig0, aSig1;
5607     int64_t z;
5608 
5609     aSig1 = extractFloat128Frac1( a );
5610     aSig0 = extractFloat128Frac0( a );
5611     aExp = extractFloat128Exp( a );
5612     aSign = extractFloat128Sign( a );
5613     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5614     shiftCount = aExp - 0x402F;
5615     if ( 0 < shiftCount ) {
5616         if ( 0x403E <= aExp ) {
5617             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5618             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5619                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5620                 if (aSig1) {
5621                     status->float_exception_flags |= float_flag_inexact;
5622                 }
5623             }
5624             else {
5625                 float_raise(float_flag_invalid, status);
5626                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5627                     return LIT64( 0x7FFFFFFFFFFFFFFF );
5628                 }
5629             }
5630             return (int64_t) LIT64( 0x8000000000000000 );
5631         }
5632         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5633         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5634             status->float_exception_flags |= float_flag_inexact;
5635         }
5636     }
5637     else {
5638         if ( aExp < 0x3FFF ) {
5639             if ( aExp | aSig0 | aSig1 ) {
5640                 status->float_exception_flags |= float_flag_inexact;
5641             }
5642             return 0;
5643         }
5644         z = aSig0>>( - shiftCount );
5645         if (    aSig1
5646              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5647             status->float_exception_flags |= float_flag_inexact;
5648         }
5649     }
5650     if ( aSign ) z = - z;
5651     return z;
5652 
5653 }
5654 
5655 /*----------------------------------------------------------------------------
5656 | Returns the result of converting the quadruple-precision floating-point value
5657 | `a' to the 64-bit unsigned integer format.  The conversion is
5658 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5659 | Arithmetic---which means in particular that the conversion is rounded
5660 | according to the current rounding mode.  If `a' is a NaN, the largest
5661 | positive integer is returned.  If the conversion overflows, the
5662 | largest unsigned integer is returned.  If 'a' is negative, the value is
5663 | rounded and zero is returned; negative values that do not round to zero
5664 | will raise the inexact exception.
5665 *----------------------------------------------------------------------------*/
5666 
5667 uint64_t float128_to_uint64(float128 a, float_status *status)
5668 {
5669     flag aSign;
5670     int aExp;
5671     int shiftCount;
5672     uint64_t aSig0, aSig1;
5673 
5674     aSig0 = extractFloat128Frac0(a);
5675     aSig1 = extractFloat128Frac1(a);
5676     aExp = extractFloat128Exp(a);
5677     aSign = extractFloat128Sign(a);
5678     if (aSign && (aExp > 0x3FFE)) {
5679         float_raise(float_flag_invalid, status);
5680         if (float128_is_any_nan(a)) {
5681             return LIT64(0xFFFFFFFFFFFFFFFF);
5682         } else {
5683             return 0;
5684         }
5685     }
5686     if (aExp) {
5687         aSig0 |= LIT64(0x0001000000000000);
5688     }
5689     shiftCount = 0x402F - aExp;
5690     if (shiftCount <= 0) {
5691         if (0x403E < aExp) {
5692             float_raise(float_flag_invalid, status);
5693             return LIT64(0xFFFFFFFFFFFFFFFF);
5694         }
5695         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5696     } else {
5697         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5698     }
5699     return roundAndPackUint64(aSign, aSig0, aSig1, status);
5700 }
5701 
5702 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5703 {
5704     uint64_t v;
5705     signed char current_rounding_mode = status->float_rounding_mode;
5706 
5707     set_float_rounding_mode(float_round_to_zero, status);
5708     v = float128_to_uint64(a, status);
5709     set_float_rounding_mode(current_rounding_mode, status);
5710 
5711     return v;
5712 }
5713 
5714 /*----------------------------------------------------------------------------
5715 | Returns the result of converting the quadruple-precision floating-point
5716 | value `a' to the 32-bit unsigned integer format.  The conversion
5717 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5718 | Arithmetic except that the conversion is always rounded toward zero.
5719 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
5720 | if the conversion overflows, the largest unsigned integer is returned.
5721 | If 'a' is negative, the value is rounded and zero is returned; negative
5722 | values that do not round to zero will raise the inexact exception.
5723 *----------------------------------------------------------------------------*/
5724 
5725 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5726 {
5727     uint64_t v;
5728     uint32_t res;
5729     int old_exc_flags = get_float_exception_flags(status);
5730 
5731     v = float128_to_uint64_round_to_zero(a, status);
5732     if (v > 0xffffffff) {
5733         res = 0xffffffff;
5734     } else {
5735         return v;
5736     }
5737     set_float_exception_flags(old_exc_flags, status);
5738     float_raise(float_flag_invalid, status);
5739     return res;
5740 }
5741 
5742 /*----------------------------------------------------------------------------
5743 | Returns the result of converting the quadruple-precision floating-point
5744 | value `a' to the single-precision floating-point format.  The conversion
5745 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5746 | Arithmetic.
5747 *----------------------------------------------------------------------------*/
5748 
5749 float32 float128_to_float32(float128 a, float_status *status)
5750 {
5751     flag aSign;
5752     int32_t aExp;
5753     uint64_t aSig0, aSig1;
5754     uint32_t zSig;
5755 
5756     aSig1 = extractFloat128Frac1( a );
5757     aSig0 = extractFloat128Frac0( a );
5758     aExp = extractFloat128Exp( a );
5759     aSign = extractFloat128Sign( a );
5760     if ( aExp == 0x7FFF ) {
5761         if ( aSig0 | aSig1 ) {
5762             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5763         }
5764         return packFloat32( aSign, 0xFF, 0 );
5765     }
5766     aSig0 |= ( aSig1 != 0 );
5767     shift64RightJamming( aSig0, 18, &aSig0 );
5768     zSig = aSig0;
5769     if ( aExp || zSig ) {
5770         zSig |= 0x40000000;
5771         aExp -= 0x3F81;
5772     }
5773     return roundAndPackFloat32(aSign, aExp, zSig, status);
5774 
5775 }
5776 
5777 /*----------------------------------------------------------------------------
5778 | Returns the result of converting the quadruple-precision floating-point
5779 | value `a' to the double-precision floating-point format.  The conversion
5780 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5781 | Arithmetic.
5782 *----------------------------------------------------------------------------*/
5783 
5784 float64 float128_to_float64(float128 a, float_status *status)
5785 {
5786     flag aSign;
5787     int32_t aExp;
5788     uint64_t aSig0, aSig1;
5789 
5790     aSig1 = extractFloat128Frac1( a );
5791     aSig0 = extractFloat128Frac0( a );
5792     aExp = extractFloat128Exp( a );
5793     aSign = extractFloat128Sign( a );
5794     if ( aExp == 0x7FFF ) {
5795         if ( aSig0 | aSig1 ) {
5796             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5797         }
5798         return packFloat64( aSign, 0x7FF, 0 );
5799     }
5800     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5801     aSig0 |= ( aSig1 != 0 );
5802     if ( aExp || aSig0 ) {
5803         aSig0 |= LIT64( 0x4000000000000000 );
5804         aExp -= 0x3C01;
5805     }
5806     return roundAndPackFloat64(aSign, aExp, aSig0, status);
5807 
5808 }
5809 
5810 /*----------------------------------------------------------------------------
5811 | Returns the result of converting the quadruple-precision floating-point
5812 | value `a' to the extended double-precision floating-point format.  The
5813 | conversion is performed according to the IEC/IEEE Standard for Binary
5814 | Floating-Point Arithmetic.
5815 *----------------------------------------------------------------------------*/
5816 
5817 floatx80 float128_to_floatx80(float128 a, float_status *status)
5818 {
5819     flag aSign;
5820     int32_t aExp;
5821     uint64_t aSig0, aSig1;
5822 
5823     aSig1 = extractFloat128Frac1( a );
5824     aSig0 = extractFloat128Frac0( a );
5825     aExp = extractFloat128Exp( a );
5826     aSign = extractFloat128Sign( a );
5827     if ( aExp == 0x7FFF ) {
5828         if ( aSig0 | aSig1 ) {
5829             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
5830         }
5831         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5832     }
5833     if ( aExp == 0 ) {
5834         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5835         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5836     }
5837     else {
5838         aSig0 |= LIT64( 0x0001000000000000 );
5839     }
5840     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5841     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
5842 
5843 }
5844 
5845 /*----------------------------------------------------------------------------
5846 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5847 | returns the result as a quadruple-precision floating-point value.  The
5848 | operation is performed according to the IEC/IEEE Standard for Binary
5849 | Floating-Point Arithmetic.
5850 *----------------------------------------------------------------------------*/
5851 
5852 float128 float128_round_to_int(float128 a, float_status *status)
5853 {
5854     flag aSign;
5855     int32_t aExp;
5856     uint64_t lastBitMask, roundBitsMask;
5857     float128 z;
5858 
5859     aExp = extractFloat128Exp( a );
5860     if ( 0x402F <= aExp ) {
5861         if ( 0x406F <= aExp ) {
5862             if (    ( aExp == 0x7FFF )
5863                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5864                ) {
5865                 return propagateFloat128NaN(a, a, status);
5866             }
5867             return a;
5868         }
5869         lastBitMask = 1;
5870         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5871         roundBitsMask = lastBitMask - 1;
5872         z = a;
5873         switch (status->float_rounding_mode) {
5874         case float_round_nearest_even:
5875             if ( lastBitMask ) {
5876                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5877                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5878             }
5879             else {
5880                 if ( (int64_t) z.low < 0 ) {
5881                     ++z.high;
5882                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5883                 }
5884             }
5885             break;
5886         case float_round_ties_away:
5887             if (lastBitMask) {
5888                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
5889             } else {
5890                 if ((int64_t) z.low < 0) {
5891                     ++z.high;
5892                 }
5893             }
5894             break;
5895         case float_round_to_zero:
5896             break;
5897         case float_round_up:
5898             if (!extractFloat128Sign(z)) {
5899                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5900             }
5901             break;
5902         case float_round_down:
5903             if (extractFloat128Sign(z)) {
5904                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5905             }
5906             break;
5907         default:
5908             abort();
5909         }
5910         z.low &= ~ roundBitsMask;
5911     }
5912     else {
5913         if ( aExp < 0x3FFF ) {
5914             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5915             status->float_exception_flags |= float_flag_inexact;
5916             aSign = extractFloat128Sign( a );
5917             switch (status->float_rounding_mode) {
5918              case float_round_nearest_even:
5919                 if (    ( aExp == 0x3FFE )
5920                      && (   extractFloat128Frac0( a )
5921                           | extractFloat128Frac1( a ) )
5922                    ) {
5923                     return packFloat128( aSign, 0x3FFF, 0, 0 );
5924                 }
5925                 break;
5926             case float_round_ties_away:
5927                 if (aExp == 0x3FFE) {
5928                     return packFloat128(aSign, 0x3FFF, 0, 0);
5929                 }
5930                 break;
5931              case float_round_down:
5932                 return
5933                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5934                     : packFloat128( 0, 0, 0, 0 );
5935              case float_round_up:
5936                 return
5937                       aSign ? packFloat128( 1, 0, 0, 0 )
5938                     : packFloat128( 0, 0x3FFF, 0, 0 );
5939             }
5940             return packFloat128( aSign, 0, 0, 0 );
5941         }
5942         lastBitMask = 1;
5943         lastBitMask <<= 0x402F - aExp;
5944         roundBitsMask = lastBitMask - 1;
5945         z.low = 0;
5946         z.high = a.high;
5947         switch (status->float_rounding_mode) {
5948         case float_round_nearest_even:
5949             z.high += lastBitMask>>1;
5950             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5951                 z.high &= ~ lastBitMask;
5952             }
5953             break;
5954         case float_round_ties_away:
5955             z.high += lastBitMask>>1;
5956             break;
5957         case float_round_to_zero:
5958             break;
5959         case float_round_up:
5960             if (!extractFloat128Sign(z)) {
5961                 z.high |= ( a.low != 0 );
5962                 z.high += roundBitsMask;
5963             }
5964             break;
5965         case float_round_down:
5966             if (extractFloat128Sign(z)) {
5967                 z.high |= (a.low != 0);
5968                 z.high += roundBitsMask;
5969             }
5970             break;
5971         default:
5972             abort();
5973         }
5974         z.high &= ~ roundBitsMask;
5975     }
5976     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5977         status->float_exception_flags |= float_flag_inexact;
5978     }
5979     return z;
5980 
5981 }
5982 
5983 /*----------------------------------------------------------------------------
5984 | Returns the result of adding the absolute values of the quadruple-precision
5985 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5986 | before being returned.  `zSign' is ignored if the result is a NaN.
5987 | The addition is performed according to the IEC/IEEE Standard for Binary
5988 | Floating-Point Arithmetic.
5989 *----------------------------------------------------------------------------*/
5990 
5991 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
5992                                 float_status *status)
5993 {
5994     int32_t aExp, bExp, zExp;
5995     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5996     int32_t expDiff;
5997 
5998     aSig1 = extractFloat128Frac1( a );
5999     aSig0 = extractFloat128Frac0( a );
6000     aExp = extractFloat128Exp( a );
6001     bSig1 = extractFloat128Frac1( b );
6002     bSig0 = extractFloat128Frac0( b );
6003     bExp = extractFloat128Exp( b );
6004     expDiff = aExp - bExp;
6005     if ( 0 < expDiff ) {
6006         if ( aExp == 0x7FFF ) {
6007             if (aSig0 | aSig1) {
6008                 return propagateFloat128NaN(a, b, status);
6009             }
6010             return a;
6011         }
6012         if ( bExp == 0 ) {
6013             --expDiff;
6014         }
6015         else {
6016             bSig0 |= LIT64( 0x0001000000000000 );
6017         }
6018         shift128ExtraRightJamming(
6019             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6020         zExp = aExp;
6021     }
6022     else if ( expDiff < 0 ) {
6023         if ( bExp == 0x7FFF ) {
6024             if (bSig0 | bSig1) {
6025                 return propagateFloat128NaN(a, b, status);
6026             }
6027             return packFloat128( zSign, 0x7FFF, 0, 0 );
6028         }
6029         if ( aExp == 0 ) {
6030             ++expDiff;
6031         }
6032         else {
6033             aSig0 |= LIT64( 0x0001000000000000 );
6034         }
6035         shift128ExtraRightJamming(
6036             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6037         zExp = bExp;
6038     }
6039     else {
6040         if ( aExp == 0x7FFF ) {
6041             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6042                 return propagateFloat128NaN(a, b, status);
6043             }
6044             return a;
6045         }
6046         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6047         if ( aExp == 0 ) {
6048             if (status->flush_to_zero) {
6049                 if (zSig0 | zSig1) {
6050                     float_raise(float_flag_output_denormal, status);
6051                 }
6052                 return packFloat128(zSign, 0, 0, 0);
6053             }
6054             return packFloat128( zSign, 0, zSig0, zSig1 );
6055         }
6056         zSig2 = 0;
6057         zSig0 |= LIT64( 0x0002000000000000 );
6058         zExp = aExp;
6059         goto shiftRight1;
6060     }
6061     aSig0 |= LIT64( 0x0001000000000000 );
6062     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6063     --zExp;
6064     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6065     ++zExp;
6066  shiftRight1:
6067     shift128ExtraRightJamming(
6068         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6069  roundAndPack:
6070     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6071 
6072 }
6073 
6074 /*----------------------------------------------------------------------------
6075 | Returns the result of subtracting the absolute values of the quadruple-
6076 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6077 | difference is negated before being returned.  `zSign' is ignored if the
6078 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6079 | Standard for Binary Floating-Point Arithmetic.
6080 *----------------------------------------------------------------------------*/
6081 
6082 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6083                                 float_status *status)
6084 {
6085     int32_t aExp, bExp, zExp;
6086     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6087     int32_t expDiff;
6088 
6089     aSig1 = extractFloat128Frac1( a );
6090     aSig0 = extractFloat128Frac0( a );
6091     aExp = extractFloat128Exp( a );
6092     bSig1 = extractFloat128Frac1( b );
6093     bSig0 = extractFloat128Frac0( b );
6094     bExp = extractFloat128Exp( b );
6095     expDiff = aExp - bExp;
6096     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6097     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6098     if ( 0 < expDiff ) goto aExpBigger;
6099     if ( expDiff < 0 ) goto bExpBigger;
6100     if ( aExp == 0x7FFF ) {
6101         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6102             return propagateFloat128NaN(a, b, status);
6103         }
6104         float_raise(float_flag_invalid, status);
6105         return float128_default_nan(status);
6106     }
6107     if ( aExp == 0 ) {
6108         aExp = 1;
6109         bExp = 1;
6110     }
6111     if ( bSig0 < aSig0 ) goto aBigger;
6112     if ( aSig0 < bSig0 ) goto bBigger;
6113     if ( bSig1 < aSig1 ) goto aBigger;
6114     if ( aSig1 < bSig1 ) goto bBigger;
6115     return packFloat128(status->float_rounding_mode == float_round_down,
6116                         0, 0, 0);
6117  bExpBigger:
6118     if ( bExp == 0x7FFF ) {
6119         if (bSig0 | bSig1) {
6120             return propagateFloat128NaN(a, b, status);
6121         }
6122         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6123     }
6124     if ( aExp == 0 ) {
6125         ++expDiff;
6126     }
6127     else {
6128         aSig0 |= LIT64( 0x4000000000000000 );
6129     }
6130     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6131     bSig0 |= LIT64( 0x4000000000000000 );
6132  bBigger:
6133     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6134     zExp = bExp;
6135     zSign ^= 1;
6136     goto normalizeRoundAndPack;
6137  aExpBigger:
6138     if ( aExp == 0x7FFF ) {
6139         if (aSig0 | aSig1) {
6140             return propagateFloat128NaN(a, b, status);
6141         }
6142         return a;
6143     }
6144     if ( bExp == 0 ) {
6145         --expDiff;
6146     }
6147     else {
6148         bSig0 |= LIT64( 0x4000000000000000 );
6149     }
6150     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6151     aSig0 |= LIT64( 0x4000000000000000 );
6152  aBigger:
6153     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6154     zExp = aExp;
6155  normalizeRoundAndPack:
6156     --zExp;
6157     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6158                                          status);
6159 
6160 }
6161 
6162 /*----------------------------------------------------------------------------
6163 | Returns the result of adding the quadruple-precision floating-point values
6164 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6165 | for Binary Floating-Point Arithmetic.
6166 *----------------------------------------------------------------------------*/
6167 
6168 float128 float128_add(float128 a, float128 b, float_status *status)
6169 {
6170     flag aSign, bSign;
6171 
6172     aSign = extractFloat128Sign( a );
6173     bSign = extractFloat128Sign( b );
6174     if ( aSign == bSign ) {
6175         return addFloat128Sigs(a, b, aSign, status);
6176     }
6177     else {
6178         return subFloat128Sigs(a, b, aSign, status);
6179     }
6180 
6181 }
6182 
6183 /*----------------------------------------------------------------------------
6184 | Returns the result of subtracting the quadruple-precision floating-point
6185 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6186 | Standard for Binary Floating-Point Arithmetic.
6187 *----------------------------------------------------------------------------*/
6188 
6189 float128 float128_sub(float128 a, float128 b, float_status *status)
6190 {
6191     flag aSign, bSign;
6192 
6193     aSign = extractFloat128Sign( a );
6194     bSign = extractFloat128Sign( b );
6195     if ( aSign == bSign ) {
6196         return subFloat128Sigs(a, b, aSign, status);
6197     }
6198     else {
6199         return addFloat128Sigs(a, b, aSign, status);
6200     }
6201 
6202 }
6203 
6204 /*----------------------------------------------------------------------------
6205 | Returns the result of multiplying the quadruple-precision floating-point
6206 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6207 | Standard for Binary Floating-Point Arithmetic.
6208 *----------------------------------------------------------------------------*/
6209 
6210 float128 float128_mul(float128 a, float128 b, float_status *status)
6211 {
6212     flag aSign, bSign, zSign;
6213     int32_t aExp, bExp, zExp;
6214     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6215 
6216     aSig1 = extractFloat128Frac1( a );
6217     aSig0 = extractFloat128Frac0( a );
6218     aExp = extractFloat128Exp( a );
6219     aSign = extractFloat128Sign( a );
6220     bSig1 = extractFloat128Frac1( b );
6221     bSig0 = extractFloat128Frac0( b );
6222     bExp = extractFloat128Exp( b );
6223     bSign = extractFloat128Sign( b );
6224     zSign = aSign ^ bSign;
6225     if ( aExp == 0x7FFF ) {
6226         if (    ( aSig0 | aSig1 )
6227              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6228             return propagateFloat128NaN(a, b, status);
6229         }
6230         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6231         return packFloat128( zSign, 0x7FFF, 0, 0 );
6232     }
6233     if ( bExp == 0x7FFF ) {
6234         if (bSig0 | bSig1) {
6235             return propagateFloat128NaN(a, b, status);
6236         }
6237         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6238  invalid:
6239             float_raise(float_flag_invalid, status);
6240             return float128_default_nan(status);
6241         }
6242         return packFloat128( zSign, 0x7FFF, 0, 0 );
6243     }
6244     if ( aExp == 0 ) {
6245         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6246         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6247     }
6248     if ( bExp == 0 ) {
6249         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6250         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6251     }
6252     zExp = aExp + bExp - 0x4000;
6253     aSig0 |= LIT64( 0x0001000000000000 );
6254     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6255     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6256     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6257     zSig2 |= ( zSig3 != 0 );
6258     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6259         shift128ExtraRightJamming(
6260             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6261         ++zExp;
6262     }
6263     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6264 
6265 }
6266 
6267 /*----------------------------------------------------------------------------
6268 | Returns the result of dividing the quadruple-precision floating-point value
6269 | `a' by the corresponding value `b'.  The operation is performed according to
6270 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6271 *----------------------------------------------------------------------------*/
6272 
6273 float128 float128_div(float128 a, float128 b, float_status *status)
6274 {
6275     flag aSign, bSign, zSign;
6276     int32_t aExp, bExp, zExp;
6277     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6278     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6279 
6280     aSig1 = extractFloat128Frac1( a );
6281     aSig0 = extractFloat128Frac0( a );
6282     aExp = extractFloat128Exp( a );
6283     aSign = extractFloat128Sign( a );
6284     bSig1 = extractFloat128Frac1( b );
6285     bSig0 = extractFloat128Frac0( b );
6286     bExp = extractFloat128Exp( b );
6287     bSign = extractFloat128Sign( b );
6288     zSign = aSign ^ bSign;
6289     if ( aExp == 0x7FFF ) {
6290         if (aSig0 | aSig1) {
6291             return propagateFloat128NaN(a, b, status);
6292         }
6293         if ( bExp == 0x7FFF ) {
6294             if (bSig0 | bSig1) {
6295                 return propagateFloat128NaN(a, b, status);
6296             }
6297             goto invalid;
6298         }
6299         return packFloat128( zSign, 0x7FFF, 0, 0 );
6300     }
6301     if ( bExp == 0x7FFF ) {
6302         if (bSig0 | bSig1) {
6303             return propagateFloat128NaN(a, b, status);
6304         }
6305         return packFloat128( zSign, 0, 0, 0 );
6306     }
6307     if ( bExp == 0 ) {
6308         if ( ( bSig0 | bSig1 ) == 0 ) {
6309             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6310  invalid:
6311                 float_raise(float_flag_invalid, status);
6312                 return float128_default_nan(status);
6313             }
6314             float_raise(float_flag_divbyzero, status);
6315             return packFloat128( zSign, 0x7FFF, 0, 0 );
6316         }
6317         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6318     }
6319     if ( aExp == 0 ) {
6320         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6321         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6322     }
6323     zExp = aExp - bExp + 0x3FFD;
6324     shortShift128Left(
6325         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6326     shortShift128Left(
6327         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6328     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6329         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6330         ++zExp;
6331     }
6332     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6333     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6334     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6335     while ( (int64_t) rem0 < 0 ) {
6336         --zSig0;
6337         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6338     }
6339     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6340     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6341         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6342         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6343         while ( (int64_t) rem1 < 0 ) {
6344             --zSig1;
6345             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6346         }
6347         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6348     }
6349     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6350     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6351 
6352 }
6353 
6354 /*----------------------------------------------------------------------------
6355 | Returns the remainder of the quadruple-precision floating-point value `a'
6356 | with respect to the corresponding value `b'.  The operation is performed
6357 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6358 *----------------------------------------------------------------------------*/
6359 
6360 float128 float128_rem(float128 a, float128 b, float_status *status)
6361 {
6362     flag aSign, zSign;
6363     int32_t aExp, bExp, expDiff;
6364     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6365     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6366     int64_t sigMean0;
6367 
6368     aSig1 = extractFloat128Frac1( a );
6369     aSig0 = extractFloat128Frac0( a );
6370     aExp = extractFloat128Exp( a );
6371     aSign = extractFloat128Sign( a );
6372     bSig1 = extractFloat128Frac1( b );
6373     bSig0 = extractFloat128Frac0( b );
6374     bExp = extractFloat128Exp( b );
6375     if ( aExp == 0x7FFF ) {
6376         if (    ( aSig0 | aSig1 )
6377              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6378             return propagateFloat128NaN(a, b, status);
6379         }
6380         goto invalid;
6381     }
6382     if ( bExp == 0x7FFF ) {
6383         if (bSig0 | bSig1) {
6384             return propagateFloat128NaN(a, b, status);
6385         }
6386         return a;
6387     }
6388     if ( bExp == 0 ) {
6389         if ( ( bSig0 | bSig1 ) == 0 ) {
6390  invalid:
6391             float_raise(float_flag_invalid, status);
6392             return float128_default_nan(status);
6393         }
6394         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6395     }
6396     if ( aExp == 0 ) {
6397         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6398         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6399     }
6400     expDiff = aExp - bExp;
6401     if ( expDiff < -1 ) return a;
6402     shortShift128Left(
6403         aSig0 | LIT64( 0x0001000000000000 ),
6404         aSig1,
6405         15 - ( expDiff < 0 ),
6406         &aSig0,
6407         &aSig1
6408     );
6409     shortShift128Left(
6410         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6411     q = le128( bSig0, bSig1, aSig0, aSig1 );
6412     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6413     expDiff -= 64;
6414     while ( 0 < expDiff ) {
6415         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6416         q = ( 4 < q ) ? q - 4 : 0;
6417         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6418         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6419         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6420         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6421         expDiff -= 61;
6422     }
6423     if ( -64 < expDiff ) {
6424         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6425         q = ( 4 < q ) ? q - 4 : 0;
6426         q >>= - expDiff;
6427         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6428         expDiff += 52;
6429         if ( expDiff < 0 ) {
6430             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6431         }
6432         else {
6433             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6434         }
6435         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6436         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6437     }
6438     else {
6439         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6440         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6441     }
6442     do {
6443         alternateASig0 = aSig0;
6444         alternateASig1 = aSig1;
6445         ++q;
6446         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6447     } while ( 0 <= (int64_t) aSig0 );
6448     add128(
6449         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6450     if (    ( sigMean0 < 0 )
6451          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6452         aSig0 = alternateASig0;
6453         aSig1 = alternateASig1;
6454     }
6455     zSign = ( (int64_t) aSig0 < 0 );
6456     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6457     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6458                                          status);
6459 }
6460 
6461 /*----------------------------------------------------------------------------
6462 | Returns the square root of the quadruple-precision floating-point value `a'.
6463 | The operation is performed according to the IEC/IEEE Standard for Binary
6464 | Floating-Point Arithmetic.
6465 *----------------------------------------------------------------------------*/
6466 
6467 float128 float128_sqrt(float128 a, float_status *status)
6468 {
6469     flag aSign;
6470     int32_t aExp, zExp;
6471     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6472     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6473 
6474     aSig1 = extractFloat128Frac1( a );
6475     aSig0 = extractFloat128Frac0( a );
6476     aExp = extractFloat128Exp( a );
6477     aSign = extractFloat128Sign( a );
6478     if ( aExp == 0x7FFF ) {
6479         if (aSig0 | aSig1) {
6480             return propagateFloat128NaN(a, a, status);
6481         }
6482         if ( ! aSign ) return a;
6483         goto invalid;
6484     }
6485     if ( aSign ) {
6486         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6487  invalid:
6488         float_raise(float_flag_invalid, status);
6489         return float128_default_nan(status);
6490     }
6491     if ( aExp == 0 ) {
6492         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6493         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6494     }
6495     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6496     aSig0 |= LIT64( 0x0001000000000000 );
6497     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6498     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6499     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6500     doubleZSig0 = zSig0<<1;
6501     mul64To128( zSig0, zSig0, &term0, &term1 );
6502     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6503     while ( (int64_t) rem0 < 0 ) {
6504         --zSig0;
6505         doubleZSig0 -= 2;
6506         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6507     }
6508     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6509     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6510         if ( zSig1 == 0 ) zSig1 = 1;
6511         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6512         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6513         mul64To128( zSig1, zSig1, &term2, &term3 );
6514         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6515         while ( (int64_t) rem1 < 0 ) {
6516             --zSig1;
6517             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6518             term3 |= 1;
6519             term2 |= doubleZSig0;
6520             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6521         }
6522         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6523     }
6524     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6525     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6526 
6527 }
6528 
6529 /*----------------------------------------------------------------------------
6530 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6531 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6532 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6533 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6534 *----------------------------------------------------------------------------*/
6535 
6536 int float128_eq(float128 a, float128 b, float_status *status)
6537 {
6538 
6539     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6540               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6541          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6542               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6543        ) {
6544         float_raise(float_flag_invalid, status);
6545         return 0;
6546     }
6547     return
6548            ( a.low == b.low )
6549         && (    ( a.high == b.high )
6550              || (    ( a.low == 0 )
6551                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6552            );
6553 
6554 }
6555 
6556 /*----------------------------------------------------------------------------
6557 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6558 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6559 | exception is raised if either operand is a NaN.  The comparison is performed
6560 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6561 *----------------------------------------------------------------------------*/
6562 
6563 int float128_le(float128 a, float128 b, float_status *status)
6564 {
6565     flag aSign, bSign;
6566 
6567     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6568               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6569          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6570               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6571        ) {
6572         float_raise(float_flag_invalid, status);
6573         return 0;
6574     }
6575     aSign = extractFloat128Sign( a );
6576     bSign = extractFloat128Sign( b );
6577     if ( aSign != bSign ) {
6578         return
6579                aSign
6580             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6581                  == 0 );
6582     }
6583     return
6584           aSign ? le128( b.high, b.low, a.high, a.low )
6585         : le128( a.high, a.low, b.high, b.low );
6586 
6587 }
6588 
6589 /*----------------------------------------------------------------------------
6590 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6591 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6592 | raised if either operand is a NaN.  The comparison is performed according
6593 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6594 *----------------------------------------------------------------------------*/
6595 
6596 int float128_lt(float128 a, float128 b, float_status *status)
6597 {
6598     flag aSign, bSign;
6599 
6600     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6601               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6602          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6603               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6604        ) {
6605         float_raise(float_flag_invalid, status);
6606         return 0;
6607     }
6608     aSign = extractFloat128Sign( a );
6609     bSign = extractFloat128Sign( b );
6610     if ( aSign != bSign ) {
6611         return
6612                aSign
6613             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6614                  != 0 );
6615     }
6616     return
6617           aSign ? lt128( b.high, b.low, a.high, a.low )
6618         : lt128( a.high, a.low, b.high, b.low );
6619 
6620 }
6621 
6622 /*----------------------------------------------------------------------------
6623 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6624 | be compared, and 0 otherwise.  The invalid exception is raised if either
6625 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6626 | Standard for Binary Floating-Point Arithmetic.
6627 *----------------------------------------------------------------------------*/
6628 
6629 int float128_unordered(float128 a, float128 b, float_status *status)
6630 {
6631     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6632               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6633          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6634               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6635        ) {
6636         float_raise(float_flag_invalid, status);
6637         return 1;
6638     }
6639     return 0;
6640 }
6641 
6642 /*----------------------------------------------------------------------------
6643 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6644 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6645 | exception.  The comparison is performed according to the IEC/IEEE Standard
6646 | for Binary Floating-Point Arithmetic.
6647 *----------------------------------------------------------------------------*/
6648 
6649 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6650 {
6651 
6652     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6653               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6654          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6655               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6656        ) {
6657         if (float128_is_signaling_nan(a, status)
6658          || float128_is_signaling_nan(b, status)) {
6659             float_raise(float_flag_invalid, status);
6660         }
6661         return 0;
6662     }
6663     return
6664            ( a.low == b.low )
6665         && (    ( a.high == b.high )
6666              || (    ( a.low == 0 )
6667                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6668            );
6669 
6670 }
6671 
6672 /*----------------------------------------------------------------------------
6673 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6674 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6675 | cause an exception.  Otherwise, the comparison is performed according to the
6676 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6677 *----------------------------------------------------------------------------*/
6678 
6679 int float128_le_quiet(float128 a, float128 b, float_status *status)
6680 {
6681     flag aSign, bSign;
6682 
6683     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6684               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6685          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6686               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6687        ) {
6688         if (float128_is_signaling_nan(a, status)
6689          || float128_is_signaling_nan(b, status)) {
6690             float_raise(float_flag_invalid, status);
6691         }
6692         return 0;
6693     }
6694     aSign = extractFloat128Sign( a );
6695     bSign = extractFloat128Sign( b );
6696     if ( aSign != bSign ) {
6697         return
6698                aSign
6699             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6700                  == 0 );
6701     }
6702     return
6703           aSign ? le128( b.high, b.low, a.high, a.low )
6704         : le128( a.high, a.low, b.high, b.low );
6705 
6706 }
6707 
6708 /*----------------------------------------------------------------------------
6709 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6710 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6711 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6712 | Standard for Binary Floating-Point Arithmetic.
6713 *----------------------------------------------------------------------------*/
6714 
6715 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6716 {
6717     flag aSign, bSign;
6718 
6719     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6720               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6721          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6722               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6723        ) {
6724         if (float128_is_signaling_nan(a, status)
6725          || float128_is_signaling_nan(b, status)) {
6726             float_raise(float_flag_invalid, status);
6727         }
6728         return 0;
6729     }
6730     aSign = extractFloat128Sign( a );
6731     bSign = extractFloat128Sign( b );
6732     if ( aSign != bSign ) {
6733         return
6734                aSign
6735             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6736                  != 0 );
6737     }
6738     return
6739           aSign ? lt128( b.high, b.low, a.high, a.low )
6740         : lt128( a.high, a.low, b.high, b.low );
6741 
6742 }
6743 
6744 /*----------------------------------------------------------------------------
6745 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6746 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6747 | comparison is performed according to the IEC/IEEE Standard for Binary
6748 | Floating-Point Arithmetic.
6749 *----------------------------------------------------------------------------*/
6750 
6751 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6752 {
6753     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6754               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6755          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6756               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6757        ) {
6758         if (float128_is_signaling_nan(a, status)
6759          || float128_is_signaling_nan(b, status)) {
6760             float_raise(float_flag_invalid, status);
6761         }
6762         return 1;
6763     }
6764     return 0;
6765 }
6766 
6767 #define COMPARE(s, nan_exp)                                                  \
6768 static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
6769                                       int is_quiet, float_status *status)    \
6770 {                                                                            \
6771     flag aSign, bSign;                                                       \
6772     uint ## s ## _t av, bv;                                                  \
6773     a = float ## s ## _squash_input_denormal(a, status);                     \
6774     b = float ## s ## _squash_input_denormal(b, status);                     \
6775                                                                              \
6776     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
6777          extractFloat ## s ## Frac( a ) ) ||                                 \
6778         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
6779           extractFloat ## s ## Frac( b ) )) {                                \
6780         if (!is_quiet ||                                                     \
6781             float ## s ## _is_signaling_nan(a, status) ||                  \
6782             float ## s ## _is_signaling_nan(b, status)) {                 \
6783             float_raise(float_flag_invalid, status);                         \
6784         }                                                                    \
6785         return float_relation_unordered;                                     \
6786     }                                                                        \
6787     aSign = extractFloat ## s ## Sign( a );                                  \
6788     bSign = extractFloat ## s ## Sign( b );                                  \
6789     av = float ## s ## _val(a);                                              \
6790     bv = float ## s ## _val(b);                                              \
6791     if ( aSign != bSign ) {                                                  \
6792         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
6793             /* zero case */                                                  \
6794             return float_relation_equal;                                     \
6795         } else {                                                             \
6796             return 1 - (2 * aSign);                                          \
6797         }                                                                    \
6798     } else {                                                                 \
6799         if (av == bv) {                                                      \
6800             return float_relation_equal;                                     \
6801         } else {                                                             \
6802             return 1 - 2 * (aSign ^ ( av < bv ));                            \
6803         }                                                                    \
6804     }                                                                        \
6805 }                                                                            \
6806                                                                              \
6807 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
6808 {                                                                            \
6809     return float ## s ## _compare_internal(a, b, 0, status);                 \
6810 }                                                                            \
6811                                                                              \
6812 int float ## s ## _compare_quiet(float ## s a, float ## s b,                 \
6813                                  float_status *status)                       \
6814 {                                                                            \
6815     return float ## s ## _compare_internal(a, b, 1, status);                 \
6816 }
6817 
6818 COMPARE(32, 0xff)
6819 COMPARE(64, 0x7ff)
6820 
6821 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6822                                             int is_quiet, float_status *status)
6823 {
6824     flag aSign, bSign;
6825 
6826     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6827         float_raise(float_flag_invalid, status);
6828         return float_relation_unordered;
6829     }
6830     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6831           ( extractFloatx80Frac( a )<<1 ) ) ||
6832         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6833           ( extractFloatx80Frac( b )<<1 ) )) {
6834         if (!is_quiet ||
6835             floatx80_is_signaling_nan(a, status) ||
6836             floatx80_is_signaling_nan(b, status)) {
6837             float_raise(float_flag_invalid, status);
6838         }
6839         return float_relation_unordered;
6840     }
6841     aSign = extractFloatx80Sign( a );
6842     bSign = extractFloatx80Sign( b );
6843     if ( aSign != bSign ) {
6844 
6845         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6846              ( ( a.low | b.low ) == 0 ) ) {
6847             /* zero case */
6848             return float_relation_equal;
6849         } else {
6850             return 1 - (2 * aSign);
6851         }
6852     } else {
6853         if (a.low == b.low && a.high == b.high) {
6854             return float_relation_equal;
6855         } else {
6856             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6857         }
6858     }
6859 }
6860 
6861 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6862 {
6863     return floatx80_compare_internal(a, b, 0, status);
6864 }
6865 
6866 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6867 {
6868     return floatx80_compare_internal(a, b, 1, status);
6869 }
6870 
6871 static inline int float128_compare_internal(float128 a, float128 b,
6872                                             int is_quiet, float_status *status)
6873 {
6874     flag aSign, bSign;
6875 
6876     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6877           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6878         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6879           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6880         if (!is_quiet ||
6881             float128_is_signaling_nan(a, status) ||
6882             float128_is_signaling_nan(b, status)) {
6883             float_raise(float_flag_invalid, status);
6884         }
6885         return float_relation_unordered;
6886     }
6887     aSign = extractFloat128Sign( a );
6888     bSign = extractFloat128Sign( b );
6889     if ( aSign != bSign ) {
6890         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6891             /* zero case */
6892             return float_relation_equal;
6893         } else {
6894             return 1 - (2 * aSign);
6895         }
6896     } else {
6897         if (a.low == b.low && a.high == b.high) {
6898             return float_relation_equal;
6899         } else {
6900             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6901         }
6902     }
6903 }
6904 
6905 int float128_compare(float128 a, float128 b, float_status *status)
6906 {
6907     return float128_compare_internal(a, b, 0, status);
6908 }
6909 
6910 int float128_compare_quiet(float128 a, float128 b, float_status *status)
6911 {
6912     return float128_compare_internal(a, b, 1, status);
6913 }
6914 
6915 /* min() and max() functions. These can't be implemented as
6916  * 'compare and pick one input' because that would mishandle
6917  * NaNs and +0 vs -0.
6918  *
6919  * minnum() and maxnum() functions. These are similar to the min()
6920  * and max() functions but if one of the arguments is a QNaN and
6921  * the other is numerical then the numerical argument is returned.
6922  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
6923  * and maxNum() operations. min() and max() are the typical min/max
6924  * semantics provided by many CPUs which predate that specification.
6925  *
6926  * minnummag() and maxnummag() functions correspond to minNumMag()
6927  * and minNumMag() from the IEEE-754 2008.
6928  */
6929 #define MINMAX(s)                                                       \
6930 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
6931                                                int ismin, int isieee,   \
6932                                                int ismag,               \
6933                                                float_status *status)    \
6934 {                                                                       \
6935     flag aSign, bSign;                                                  \
6936     uint ## s ## _t av, bv, aav, abv;                                   \
6937     a = float ## s ## _squash_input_denormal(a, status);                \
6938     b = float ## s ## _squash_input_denormal(b, status);                \
6939     if (float ## s ## _is_any_nan(a) ||                                 \
6940         float ## s ## _is_any_nan(b)) {                                 \
6941         if (isieee) {                                                   \
6942             if (float ## s ## _is_quiet_nan(a, status) &&               \
6943                 !float ## s ##_is_any_nan(b)) {                         \
6944                 return b;                                               \
6945             } else if (float ## s ## _is_quiet_nan(b, status) &&        \
6946                        !float ## s ## _is_any_nan(a)) {                \
6947                 return a;                                               \
6948             }                                                           \
6949         }                                                               \
6950         return propagateFloat ## s ## NaN(a, b, status);                \
6951     }                                                                   \
6952     aSign = extractFloat ## s ## Sign(a);                               \
6953     bSign = extractFloat ## s ## Sign(b);                               \
6954     av = float ## s ## _val(a);                                         \
6955     bv = float ## s ## _val(b);                                         \
6956     if (ismag) {                                                        \
6957         aav = float ## s ## _abs(av);                                   \
6958         abv = float ## s ## _abs(bv);                                   \
6959         if (aav != abv) {                                               \
6960             if (ismin) {                                                \
6961                 return (aav < abv) ? a : b;                             \
6962             } else {                                                    \
6963                 return (aav < abv) ? b : a;                             \
6964             }                                                           \
6965         }                                                               \
6966     }                                                                   \
6967     if (aSign != bSign) {                                               \
6968         if (ismin) {                                                    \
6969             return aSign ? a : b;                                       \
6970         } else {                                                        \
6971             return aSign ? b : a;                                       \
6972         }                                                               \
6973     } else {                                                            \
6974         if (ismin) {                                                    \
6975             return (aSign ^ (av < bv)) ? a : b;                         \
6976         } else {                                                        \
6977             return (aSign ^ (av < bv)) ? b : a;                         \
6978         }                                                               \
6979     }                                                                   \
6980 }                                                                       \
6981                                                                         \
6982 float ## s float ## s ## _min(float ## s a, float ## s b,               \
6983                               float_status *status)                     \
6984 {                                                                       \
6985     return float ## s ## _minmax(a, b, 1, 0, 0, status);                \
6986 }                                                                       \
6987                                                                         \
6988 float ## s float ## s ## _max(float ## s a, float ## s b,               \
6989                               float_status *status)                     \
6990 {                                                                       \
6991     return float ## s ## _minmax(a, b, 0, 0, 0, status);                \
6992 }                                                                       \
6993                                                                         \
6994 float ## s float ## s ## _minnum(float ## s a, float ## s b,            \
6995                                  float_status *status)                  \
6996 {                                                                       \
6997     return float ## s ## _minmax(a, b, 1, 1, 0, status);                \
6998 }                                                                       \
6999                                                                         \
7000 float ## s float ## s ## _maxnum(float ## s a, float ## s b,            \
7001                                  float_status *status)                  \
7002 {                                                                       \
7003     return float ## s ## _minmax(a, b, 0, 1, 0, status);                \
7004 }                                                                       \
7005                                                                         \
7006 float ## s float ## s ## _minnummag(float ## s a, float ## s b,         \
7007                                     float_status *status)               \
7008 {                                                                       \
7009     return float ## s ## _minmax(a, b, 1, 1, 1, status);                \
7010 }                                                                       \
7011                                                                         \
7012 float ## s float ## s ## _maxnummag(float ## s a, float ## s b,         \
7013                                     float_status *status)               \
7014 {                                                                       \
7015     return float ## s ## _minmax(a, b, 0, 1, 1, status);                \
7016 }
7017 
7018 MINMAX(32)
7019 MINMAX(64)
7020 
7021 
7022 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7023 {
7024     flag aSign;
7025     int32_t aExp;
7026     uint64_t aSig;
7027 
7028     if (floatx80_invalid_encoding(a)) {
7029         float_raise(float_flag_invalid, status);
7030         return floatx80_default_nan(status);
7031     }
7032     aSig = extractFloatx80Frac( a );
7033     aExp = extractFloatx80Exp( a );
7034     aSign = extractFloatx80Sign( a );
7035 
7036     if ( aExp == 0x7FFF ) {
7037         if ( aSig<<1 ) {
7038             return propagateFloatx80NaN(a, a, status);
7039         }
7040         return a;
7041     }
7042 
7043     if (aExp == 0) {
7044         if (aSig == 0) {
7045             return a;
7046         }
7047         aExp++;
7048     }
7049 
7050     if (n > 0x10000) {
7051         n = 0x10000;
7052     } else if (n < -0x10000) {
7053         n = -0x10000;
7054     }
7055 
7056     aExp += n;
7057     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7058                                          aSign, aExp, aSig, 0, status);
7059 }
7060 
7061 float128 float128_scalbn(float128 a, int n, float_status *status)
7062 {
7063     flag aSign;
7064     int32_t aExp;
7065     uint64_t aSig0, aSig1;
7066 
7067     aSig1 = extractFloat128Frac1( a );
7068     aSig0 = extractFloat128Frac0( a );
7069     aExp = extractFloat128Exp( a );
7070     aSign = extractFloat128Sign( a );
7071     if ( aExp == 0x7FFF ) {
7072         if ( aSig0 | aSig1 ) {
7073             return propagateFloat128NaN(a, a, status);
7074         }
7075         return a;
7076     }
7077     if (aExp != 0) {
7078         aSig0 |= LIT64( 0x0001000000000000 );
7079     } else if (aSig0 == 0 && aSig1 == 0) {
7080         return a;
7081     } else {
7082         aExp++;
7083     }
7084 
7085     if (n > 0x10000) {
7086         n = 0x10000;
7087     } else if (n < -0x10000) {
7088         n = -0x10000;
7089     }
7090 
7091     aExp += n - 1;
7092     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7093                                          , status);
7094 
7095 }
7096