xref: /qemu/fpu/softfloat.c (revision dbe4d53a590f5689772b683984588b3cf6df163e)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
88 
89 /* We only need stdlib for abort() */
90 
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations.  (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "softfloat-macros.h"
97 
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine:  (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output.  These details are target-
104 | specific.
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
107 
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
111 
112 static inline uint32_t extractFloat16Frac(float16 a)
113 {
114     return float16_val(a) & 0x3ff;
115 }
116 
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
120 
121 static inline int extractFloat16Exp(float16 a)
122 {
123     return (float16_val(a) >> 10) & 0x1f;
124 }
125 
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
129 
130 static inline flag extractFloat16Sign(float16 a)
131 {
132     return float16_val(a)>>15;
133 }
134 
135 /*----------------------------------------------------------------------------
136 | Returns the fraction bits of the single-precision floating-point value `a'.
137 *----------------------------------------------------------------------------*/
138 
139 static inline uint32_t extractFloat32Frac(float32 a)
140 {
141     return float32_val(a) & 0x007FFFFF;
142 }
143 
144 /*----------------------------------------------------------------------------
145 | Returns the exponent bits of the single-precision floating-point value `a'.
146 *----------------------------------------------------------------------------*/
147 
148 static inline int extractFloat32Exp(float32 a)
149 {
150     return (float32_val(a) >> 23) & 0xFF;
151 }
152 
153 /*----------------------------------------------------------------------------
154 | Returns the sign bit of the single-precision floating-point value `a'.
155 *----------------------------------------------------------------------------*/
156 
157 static inline flag extractFloat32Sign(float32 a)
158 {
159     return float32_val(a) >> 31;
160 }
161 
162 /*----------------------------------------------------------------------------
163 | Returns the fraction bits of the double-precision floating-point value `a'.
164 *----------------------------------------------------------------------------*/
165 
166 static inline uint64_t extractFloat64Frac(float64 a)
167 {
168     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
169 }
170 
171 /*----------------------------------------------------------------------------
172 | Returns the exponent bits of the double-precision floating-point value `a'.
173 *----------------------------------------------------------------------------*/
174 
175 static inline int extractFloat64Exp(float64 a)
176 {
177     return (float64_val(a) >> 52) & 0x7FF;
178 }
179 
180 /*----------------------------------------------------------------------------
181 | Returns the sign bit of the double-precision floating-point value `a'.
182 *----------------------------------------------------------------------------*/
183 
184 static inline flag extractFloat64Sign(float64 a)
185 {
186     return float64_val(a) >> 63;
187 }
188 
/*
 * Classification of a decomposed floating point number.  The
 * enumerators are ordered so that float_class_qnan and everything
 * after it is some kind of NaN: "cls >= float_class_qnan" therefore
 * tests for any NaN.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified = 0, /* freshly unpacked, not yet canonical */
    float_class_zero         = 1,
    float_class_normal       = 2,
    float_class_inf          = 3,
    float_class_qnan         = 4, /* first of the NaN classes */
    float_class_snan         = 5,
    float_class_dnan         = 6, /* packed as the default NaN */
    float_class_msnan        = 7, /* maybe silenced when packed */
} FloatClass;
204 
205 /*
206  * Structure holding all of the decomposed parts of a float. The
207  * exponent is unbiased and the fraction is normalized. All
208  * calculations are done with a 64 bit fraction and then rounded as
209  * appropriate for the final format.
210  *
211  * Thanks to the packed FloatClass a decent compiler should be able to
212  * fit the whole structure into registers and avoid using the stack
213  * for parameter passing.
214  */
215 
216 typedef struct {
217     uint64_t frac;
218     int32_t  exp;
219     FloatClass cls;
220     bool sign;
221 } FloatParts;
222 
/* Bit position of the binary point inside FloatParts.frac. */
#define DECOMPOSED_BINARY_POINT    62
/* The integer ("implicit") bit sits exactly at the binary point... */
#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
/* ...and a carry out of it lands in the bit directly above. */
#define DECOMPOSED_OVERFLOW_BIT    (2ull << DECOMPOSED_BINARY_POINT)
226 
/* Parameters describing one floating-point format.
 *   exp_size: width of the exponent field in bits
 *   exp_bias: offset applied to the stored exponent
 *   exp_max: the all-ones exponent field value
 *   frac_size: width of the fraction field in bits
 *   frac_shift: shift that aligns the fraction with DECOMPOSED_BINARY_POINT
 * The remaining members are derived from the size of the fraction:
 *   frac_lsb: least significant bit of the fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
} FloatFmt;
249 
/* Expand to the designated initializers of a FloatFmt for a format with
 * an E-bit exponent field and an F-bit fraction field.
 *
 * Both arguments are parenthesized at every use so that an expression
 * argument (e.g. "12 - 2") expands with the intended precedence.
 */
#define FLOAT_PARAMS(E, F)                                             \
    .exp_size       = (E),                                             \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                           \
    .exp_max        = (1 << (E)) - 1,                                  \
    .frac_size      = (F),                                             \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                   \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),         \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1),   \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1,   \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1
261 
262 static const FloatFmt float16_params = {
263     FLOAT_PARAMS(5, 10)
264 };
265 
266 static const FloatFmt float32_params = {
267     FLOAT_PARAMS(8, 23)
268 };
269 
270 static const FloatFmt float64_params = {
271     FLOAT_PARAMS(11, 52)
272 };
273 
274 /* Unpack a float to parts, but do not canonicalize.  */
275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
276 {
277     const int sign_pos = fmt.frac_size + fmt.exp_size;
278 
279     return (FloatParts) {
280         .cls = float_class_unclassified,
281         .sign = extract64(raw, sign_pos, 1),
282         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
283         .frac = extract64(raw, 0, fmt.frac_size),
284     };
285 }
286 
287 static inline FloatParts float16_unpack_raw(float16 f)
288 {
289     return unpack_raw(float16_params, f);
290 }
291 
292 static inline FloatParts float32_unpack_raw(float32 f)
293 {
294     return unpack_raw(float32_params, f);
295 }
296 
297 static inline FloatParts float64_unpack_raw(float64 f)
298 {
299     return unpack_raw(float64_params, f);
300 }
301 
302 /* Pack a float from parts, but do not canonicalize.  */
303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
304 {
305     const int sign_pos = fmt.frac_size + fmt.exp_size;
306     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
307     return deposit64(ret, sign_pos, 1, p.sign);
308 }
309 
310 static inline float16 float16_pack_raw(FloatParts p)
311 {
312     return make_float16(pack_raw(float16_params, p));
313 }
314 
315 static inline float32 float32_pack_raw(FloatParts p)
316 {
317     return make_float32(pack_raw(float32_params, p));
318 }
319 
320 static inline float64 float64_pack_raw(FloatParts p)
321 {
322     return make_float64(pack_raw(float64_params, p));
323 }
324 
325 /* Canonicalize EXP and FRAC, setting CLS.  */
326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
327                                float_status *status)
328 {
329     if (part.exp == parm->exp_max) {
330         if (part.frac == 0) {
331             part.cls = float_class_inf;
332         } else {
333 #ifdef NO_SIGNALING_NANS
334             part.cls = float_class_qnan;
335 #else
336             int64_t msb = part.frac << (parm->frac_shift + 2);
337             if ((msb < 0) == status->snan_bit_is_one) {
338                 part.cls = float_class_snan;
339             } else {
340                 part.cls = float_class_qnan;
341             }
342 #endif
343         }
344     } else if (part.exp == 0) {
345         if (likely(part.frac == 0)) {
346             part.cls = float_class_zero;
347         } else if (status->flush_inputs_to_zero) {
348             float_raise(float_flag_input_denormal, status);
349             part.cls = float_class_zero;
350             part.frac = 0;
351         } else {
352             int shift = clz64(part.frac) - 1;
353             part.cls = float_class_normal;
354             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
355             part.frac <<= shift;
356         }
357     } else {
358         part.cls = float_class_normal;
359         part.exp -= parm->exp_bias;
360         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
361     }
362     return part;
363 }
364 
365 /* Round and uncanonicalize a floating-point number by parts. There
366  * are FRAC_SHIFT bits that may require rounding at the bottom of the
367  * fraction; these bits will be removed. The exponent will be biased
368  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
369  */
370 
371 static FloatParts round_canonical(FloatParts p, float_status *s,
372                                   const FloatFmt *parm)
373 {
374     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
375     const uint64_t round_mask = parm->round_mask;
376     const uint64_t roundeven_mask = parm->roundeven_mask;
377     const int exp_max = parm->exp_max;
378     const int frac_shift = parm->frac_shift;
379     uint64_t frac, inc;
380     int exp, flags = 0;
381     bool overflow_norm;
382 
383     frac = p.frac;
384     exp = p.exp;
385 
386     switch (p.cls) {
387     case float_class_normal:
388         switch (s->float_rounding_mode) {
389         case float_round_nearest_even:
390             overflow_norm = false;
391             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
392             break;
393         case float_round_ties_away:
394             overflow_norm = false;
395             inc = frac_lsbm1;
396             break;
397         case float_round_to_zero:
398             overflow_norm = true;
399             inc = 0;
400             break;
401         case float_round_up:
402             inc = p.sign ? 0 : round_mask;
403             overflow_norm = p.sign;
404             break;
405         case float_round_down:
406             inc = p.sign ? round_mask : 0;
407             overflow_norm = !p.sign;
408             break;
409         default:
410             g_assert_not_reached();
411         }
412 
413         exp += parm->exp_bias;
414         if (likely(exp > 0)) {
415             if (frac & round_mask) {
416                 flags |= float_flag_inexact;
417                 frac += inc;
418                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
419                     frac >>= 1;
420                     exp++;
421                 }
422             }
423             frac >>= frac_shift;
424 
425             if (unlikely(exp >= exp_max)) {
426                 flags |= float_flag_overflow | float_flag_inexact;
427                 if (overflow_norm) {
428                     exp = exp_max - 1;
429                     frac = -1;
430                 } else {
431                     p.cls = float_class_inf;
432                     goto do_inf;
433                 }
434             }
435         } else if (s->flush_to_zero) {
436             flags |= float_flag_output_denormal;
437             p.cls = float_class_zero;
438             goto do_zero;
439         } else {
440             bool is_tiny = (s->float_detect_tininess
441                             == float_tininess_before_rounding)
442                         || (exp < 0)
443                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
444 
445             shift64RightJamming(frac, 1 - exp, &frac);
446             if (frac & round_mask) {
447                 /* Need to recompute round-to-even.  */
448                 if (s->float_rounding_mode == float_round_nearest_even) {
449                     inc = ((frac & roundeven_mask) != frac_lsbm1
450                            ? frac_lsbm1 : 0);
451                 }
452                 flags |= float_flag_inexact;
453                 frac += inc;
454             }
455 
456             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
457             frac >>= frac_shift;
458 
459             if (is_tiny && (flags & float_flag_inexact)) {
460                 flags |= float_flag_underflow;
461             }
462             if (exp == 0 && frac == 0) {
463                 p.cls = float_class_zero;
464             }
465         }
466         break;
467 
468     case float_class_zero:
469     do_zero:
470         exp = 0;
471         frac = 0;
472         break;
473 
474     case float_class_inf:
475     do_inf:
476         exp = exp_max;
477         frac = 0;
478         break;
479 
480     case float_class_qnan:
481     case float_class_snan:
482         exp = exp_max;
483         break;
484 
485     default:
486         g_assert_not_reached();
487     }
488 
489     float_raise(flags, s);
490     p.exp = exp;
491     p.frac = frac;
492     return p;
493 }
494 
495 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
496 {
497     return canonicalize(float16_unpack_raw(f), &float16_params, s);
498 }
499 
500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
501 {
502     switch (p.cls) {
503     case float_class_dnan:
504         return float16_default_nan(s);
505     case float_class_msnan:
506         return float16_maybe_silence_nan(float16_pack_raw(p), s);
507     default:
508         p = round_canonical(p, s, &float16_params);
509         return float16_pack_raw(p);
510     }
511 }
512 
513 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
514 {
515     return canonicalize(float32_unpack_raw(f), &float32_params, s);
516 }
517 
518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
519 {
520     switch (p.cls) {
521     case float_class_dnan:
522         return float32_default_nan(s);
523     case float_class_msnan:
524         return float32_maybe_silence_nan(float32_pack_raw(p), s);
525     default:
526         p = round_canonical(p, s, &float32_params);
527         return float32_pack_raw(p);
528     }
529 }
530 
531 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
532 {
533     return canonicalize(float64_unpack_raw(f), &float64_params, s);
534 }
535 
536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
537 {
538     switch (p.cls) {
539     case float_class_dnan:
540         return float64_default_nan(s);
541     case float_class_msnan:
542         return float64_maybe_silence_nan(float64_pack_raw(p), s);
543     default:
544         p = round_canonical(p, s, &float64_params);
545         return float64_pack_raw(p);
546     }
547 }
548 
/* Simple helpers for checking what kind of NaN (if any) we have */
550 static bool is_nan(FloatClass c)
551 {
552     return unlikely(c >= float_class_qnan);
553 }
554 static bool is_snan(FloatClass c)
555 {
556     return c == float_class_snan;
557 }
558 static bool is_qnan(FloatClass c)
559 {
560     return c == float_class_qnan;
561 }
562 
563 static FloatParts return_nan(FloatParts a, float_status *s)
564 {
565     switch (a.cls) {
566     case float_class_snan:
567         s->float_exception_flags |= float_flag_invalid;
568         a.cls = float_class_msnan;
569         /* fall through */
570     case float_class_qnan:
571         if (s->default_nan_mode) {
572             a.cls = float_class_dnan;
573         }
574         break;
575 
576     default:
577         g_assert_not_reached();
578     }
579     return a;
580 }
581 
582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
583 {
584     if (is_snan(a.cls) || is_snan(b.cls)) {
585         s->float_exception_flags |= float_flag_invalid;
586     }
587 
588     if (s->default_nan_mode) {
589         a.cls = float_class_dnan;
590     } else {
591         if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
592                     is_qnan(b.cls), is_snan(b.cls),
593                     a.frac > b.frac ||
594                     (a.frac == b.frac && a.sign < b.sign))) {
595             a = b;
596         }
597         a.cls = float_class_msnan;
598     }
599     return a;
600 }
601 
602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
603                                   bool inf_zero, float_status *s)
604 {
605     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
606         s->float_exception_flags |= float_flag_invalid;
607     }
608 
609     if (s->default_nan_mode) {
610         a.cls = float_class_dnan;
611     } else {
612         switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
613                               is_qnan(b.cls), is_snan(b.cls),
614                               is_qnan(c.cls), is_snan(c.cls),
615                               inf_zero, s)) {
616         case 0:
617             break;
618         case 1:
619             a = b;
620             break;
621         case 2:
622             a = c;
623             break;
624         case 3:
625             a.cls = float_class_dnan;
626             return a;
627         default:
628             g_assert_not_reached();
629         }
630 
631         a.cls = float_class_msnan;
632     }
633     return a;
634 }
635 
636 /*
637  * Returns the result of adding or subtracting the values of the
638  * floating-point values `a' and `b'. The operation is performed
639  * according to the IEC/IEEE Standard for Binary Floating-Point
640  * Arithmetic.
641  */
642 
643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
644                                 float_status *s)
645 {
646     bool a_sign = a.sign;
647     bool b_sign = b.sign ^ subtract;
648 
649     if (a_sign != b_sign) {
650         /* Subtraction */
651 
652         if (a.cls == float_class_normal && b.cls == float_class_normal) {
653             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
654                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
655                 a.frac = a.frac - b.frac;
656             } else {
657                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
658                 a.frac = b.frac - a.frac;
659                 a.exp = b.exp;
660                 a_sign ^= 1;
661             }
662 
663             if (a.frac == 0) {
664                 a.cls = float_class_zero;
665                 a.sign = s->float_rounding_mode == float_round_down;
666             } else {
667                 int shift = clz64(a.frac) - 1;
668                 a.frac = a.frac << shift;
669                 a.exp = a.exp - shift;
670                 a.sign = a_sign;
671             }
672             return a;
673         }
674         if (is_nan(a.cls) || is_nan(b.cls)) {
675             return pick_nan(a, b, s);
676         }
677         if (a.cls == float_class_inf) {
678             if (b.cls == float_class_inf) {
679                 float_raise(float_flag_invalid, s);
680                 a.cls = float_class_dnan;
681             }
682             return a;
683         }
684         if (a.cls == float_class_zero && b.cls == float_class_zero) {
685             a.sign = s->float_rounding_mode == float_round_down;
686             return a;
687         }
688         if (a.cls == float_class_zero || b.cls == float_class_inf) {
689             b.sign = a_sign ^ 1;
690             return b;
691         }
692         if (b.cls == float_class_zero) {
693             return a;
694         }
695     } else {
696         /* Addition */
697         if (a.cls == float_class_normal && b.cls == float_class_normal) {
698             if (a.exp > b.exp) {
699                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
700             } else if (a.exp < b.exp) {
701                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
702                 a.exp = b.exp;
703             }
704             a.frac += b.frac;
705             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
706                 a.frac >>= 1;
707                 a.exp += 1;
708             }
709             return a;
710         }
711         if (is_nan(a.cls) || is_nan(b.cls)) {
712             return pick_nan(a, b, s);
713         }
714         if (a.cls == float_class_inf || b.cls == float_class_zero) {
715             return a;
716         }
717         if (b.cls == float_class_inf || a.cls == float_class_zero) {
718             b.sign = b_sign;
719             return b;
720         }
721     }
722     g_assert_not_reached();
723 }
724 
725 /*
726  * Returns the result of adding or subtracting the floating-point
727  * values `a' and `b'. The operation is performed according to the
728  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
729  */
730 
731 float16  __attribute__((flatten)) float16_add(float16 a, float16 b,
732                                               float_status *status)
733 {
734     FloatParts pa = float16_unpack_canonical(a, status);
735     FloatParts pb = float16_unpack_canonical(b, status);
736     FloatParts pr = addsub_floats(pa, pb, false, status);
737 
738     return float16_round_pack_canonical(pr, status);
739 }
740 
741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
742                                              float_status *status)
743 {
744     FloatParts pa = float32_unpack_canonical(a, status);
745     FloatParts pb = float32_unpack_canonical(b, status);
746     FloatParts pr = addsub_floats(pa, pb, false, status);
747 
748     return float32_round_pack_canonical(pr, status);
749 }
750 
751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
752                                              float_status *status)
753 {
754     FloatParts pa = float64_unpack_canonical(a, status);
755     FloatParts pb = float64_unpack_canonical(b, status);
756     FloatParts pr = addsub_floats(pa, pb, false, status);
757 
758     return float64_round_pack_canonical(pr, status);
759 }
760 
761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
762                                              float_status *status)
763 {
764     FloatParts pa = float16_unpack_canonical(a, status);
765     FloatParts pb = float16_unpack_canonical(b, status);
766     FloatParts pr = addsub_floats(pa, pb, true, status);
767 
768     return float16_round_pack_canonical(pr, status);
769 }
770 
771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
772                                              float_status *status)
773 {
774     FloatParts pa = float32_unpack_canonical(a, status);
775     FloatParts pb = float32_unpack_canonical(b, status);
776     FloatParts pr = addsub_floats(pa, pb, true, status);
777 
778     return float32_round_pack_canonical(pr, status);
779 }
780 
781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
782                                              float_status *status)
783 {
784     FloatParts pa = float64_unpack_canonical(a, status);
785     FloatParts pb = float64_unpack_canonical(b, status);
786     FloatParts pr = addsub_floats(pa, pb, true, status);
787 
788     return float64_round_pack_canonical(pr, status);
789 }
790 
791 /*
792  * Returns the result of multiplying the floating-point values `a' and
793  * `b'. The operation is performed according to the IEC/IEEE Standard
794  * for Binary Floating-Point Arithmetic.
795  */
796 
797 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
798 {
799     bool sign = a.sign ^ b.sign;
800 
801     if (a.cls == float_class_normal && b.cls == float_class_normal) {
802         uint64_t hi, lo;
803         int exp = a.exp + b.exp;
804 
805         mul64To128(a.frac, b.frac, &hi, &lo);
806         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
807         if (lo & DECOMPOSED_OVERFLOW_BIT) {
808             shift64RightJamming(lo, 1, &lo);
809             exp += 1;
810         }
811 
812         /* Re-use a */
813         a.exp = exp;
814         a.sign = sign;
815         a.frac = lo;
816         return a;
817     }
818     /* handle all the NaN cases */
819     if (is_nan(a.cls) || is_nan(b.cls)) {
820         return pick_nan(a, b, s);
821     }
822     /* Inf * Zero == NaN */
823     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
824         (a.cls == float_class_zero && b.cls == float_class_inf)) {
825         s->float_exception_flags |= float_flag_invalid;
826         a.cls = float_class_dnan;
827         a.sign = sign;
828         return a;
829     }
830     /* Multiply by 0 or Inf */
831     if (a.cls == float_class_inf || a.cls == float_class_zero) {
832         a.sign = sign;
833         return a;
834     }
835     if (b.cls == float_class_inf || b.cls == float_class_zero) {
836         b.sign = sign;
837         return b;
838     }
839     g_assert_not_reached();
840 }
841 
842 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
843                                              float_status *status)
844 {
845     FloatParts pa = float16_unpack_canonical(a, status);
846     FloatParts pb = float16_unpack_canonical(b, status);
847     FloatParts pr = mul_floats(pa, pb, status);
848 
849     return float16_round_pack_canonical(pr, status);
850 }
851 
852 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
853                                              float_status *status)
854 {
855     FloatParts pa = float32_unpack_canonical(a, status);
856     FloatParts pb = float32_unpack_canonical(b, status);
857     FloatParts pr = mul_floats(pa, pb, status);
858 
859     return float32_round_pack_canonical(pr, status);
860 }
861 
862 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
863                                              float_status *status)
864 {
865     FloatParts pa = float64_unpack_canonical(a, status);
866     FloatParts pb = float64_unpack_canonical(b, status);
867     FloatParts pr = mul_floats(pa, pb, status);
868 
869     return float64_round_pack_canonical(pr, status);
870 }
871 
872 /*
873  * Returns the result of multiplying the floating-point values `a' and
874  * `b' then adding 'c', with no intermediate rounding step after the
875  * multiplication. The operation is performed according to the
876  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
877  * The flags argument allows the caller to select negation of the
878  * addend, the intermediate product, or the final result. (The
879  * difference between this and having the caller do a separate
880  * negation is that negating externally will flip the sign bit on
881  * NaNs.)
882  */
883 
884 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
885                                 int flags, float_status *s)
886 {
887     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
888                     ((1 << float_class_inf) | (1 << float_class_zero));
889     bool p_sign;
890     bool sign_flip = flags & float_muladd_negate_result;
891     FloatClass p_class;
892     uint64_t hi, lo;
893     int p_exp;
894 
895     /* It is implementation-defined whether the cases of (0,inf,qnan)
896      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
897      * they return if they do), so we have to hand this information
898      * off to the target-specific pick-a-NaN routine.
899      */
900     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
901         return pick_nan_muladd(a, b, c, inf_zero, s);
902     }
903 
904     if (inf_zero) {
905         s->float_exception_flags |= float_flag_invalid;
906         a.cls = float_class_dnan;
907         return a;
908     }
909 
910     if (flags & float_muladd_negate_c) {
911         c.sign ^= 1;
912     }
913 
914     p_sign = a.sign ^ b.sign;
915 
916     if (flags & float_muladd_negate_product) {
917         p_sign ^= 1;
918     }
919 
920     if (a.cls == float_class_inf || b.cls == float_class_inf) {
921         p_class = float_class_inf;
922     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
923         p_class = float_class_zero;
924     } else {
925         p_class = float_class_normal;
926     }
927 
928     if (c.cls == float_class_inf) {
929         if (p_class == float_class_inf && p_sign != c.sign) {
930             s->float_exception_flags |= float_flag_invalid;
931             a.cls = float_class_dnan;
932         } else {
933             a.cls = float_class_inf;
934             a.sign = c.sign ^ sign_flip;
935         }
936         return a;
937     }
938 
939     if (p_class == float_class_inf) {
940         a.cls = float_class_inf;
941         a.sign = p_sign ^ sign_flip;
942         return a;
943     }
944 
945     if (p_class == float_class_zero) {
946         if (c.cls == float_class_zero) {
947             if (p_sign != c.sign) {
948                 p_sign = s->float_rounding_mode == float_round_down;
949             }
950             c.sign = p_sign;
951         } else if (flags & float_muladd_halve_result) {
952             c.exp -= 1;
953         }
954         c.sign ^= sign_flip;
955         return c;
956     }
957 
958     /* a & b should be normals now... */
959     assert(a.cls == float_class_normal &&
960            b.cls == float_class_normal);
961 
962     p_exp = a.exp + b.exp;
963 
964     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
965      * result.
966      */
967     mul64To128(a.frac, b.frac, &hi, &lo);
968     /* binary point now at bit 124 */
969 
970     /* check for overflow */
971     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
972         shift128RightJamming(hi, lo, 1, &hi, &lo);
973         p_exp += 1;
974     }
975 
976     /* + add/sub */
977     if (c.cls == float_class_zero) {
978         /* move binary point back to 62 */
979         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
980     } else {
981         int exp_diff = p_exp - c.exp;
982         if (p_sign == c.sign) {
983             /* Addition */
984             if (exp_diff <= 0) {
985                 shift128RightJamming(hi, lo,
986                                      DECOMPOSED_BINARY_POINT - exp_diff,
987                                      &hi, &lo);
988                 lo += c.frac;
989                 p_exp = c.exp;
990             } else {
991                 uint64_t c_hi, c_lo;
992                 /* shift c to the same binary point as the product (124) */
993                 c_hi = c.frac >> 2;
994                 c_lo = 0;
995                 shift128RightJamming(c_hi, c_lo,
996                                      exp_diff,
997                                      &c_hi, &c_lo);
998                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
999                 /* move binary point back to 62 */
1000                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1001             }
1002 
1003             if (lo & DECOMPOSED_OVERFLOW_BIT) {
1004                 shift64RightJamming(lo, 1, &lo);
1005                 p_exp += 1;
1006             }
1007 
1008         } else {
1009             /* Subtraction */
1010             uint64_t c_hi, c_lo;
1011             /* make C binary point match product at bit 124 */
1012             c_hi = c.frac >> 2;
1013             c_lo = 0;
1014 
1015             if (exp_diff <= 0) {
1016                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1017                 if (exp_diff == 0
1018                     &&
1019                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1020                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1021                 } else {
1022                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1023                     p_sign ^= 1;
1024                     p_exp = c.exp;
1025                 }
1026             } else {
1027                 shift128RightJamming(c_hi, c_lo,
1028                                      exp_diff,
1029                                      &c_hi, &c_lo);
1030                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1031             }
1032 
1033             if (hi == 0 && lo == 0) {
1034                 a.cls = float_class_zero;
1035                 a.sign = s->float_rounding_mode == float_round_down;
1036                 a.sign ^= sign_flip;
1037                 return a;
1038             } else {
1039                 int shift;
1040                 if (hi != 0) {
1041                     shift = clz64(hi);
1042                 } else {
1043                     shift = clz64(lo) + 64;
1044                 }
1045                 /* Normalizing to a binary point of 124 is the
1046                    correct adjust for the exponent.  However since we're
1047                    shifting, we might as well put the binary point back
1048                    at 62 where we really want it.  Therefore shift as
1049                    if we're leaving 1 bit at the top of the word, but
1050                    adjust the exponent as if we're leaving 3 bits.  */
1051                 shift -= 1;
1052                 if (shift >= 64) {
1053                     lo = lo << (shift - 64);
1054                 } else {
1055                     hi = (hi << shift) | (lo >> (64 - shift));
1056                     lo = hi | ((lo << shift) != 0);
1057                 }
1058                 p_exp -= shift - 2;
1059             }
1060         }
1061     }
1062 
1063     if (flags & float_muladd_halve_result) {
1064         p_exp -= 1;
1065     }
1066 
1067     /* finally prepare our result */
1068     a.cls = float_class_normal;
1069     a.sign = p_sign ^ sign_flip;
1070     a.exp = p_exp;
1071     a.frac = lo;
1072 
1073     return a;
1074 }
1075 
1076 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1077                                                 int flags, float_status *status)
1078 {
1079     FloatParts pa = float16_unpack_canonical(a, status);
1080     FloatParts pb = float16_unpack_canonical(b, status);
1081     FloatParts pc = float16_unpack_canonical(c, status);
1082     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1083 
1084     return float16_round_pack_canonical(pr, status);
1085 }
1086 
1087 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1088                                                 int flags, float_status *status)
1089 {
1090     FloatParts pa = float32_unpack_canonical(a, status);
1091     FloatParts pb = float32_unpack_canonical(b, status);
1092     FloatParts pc = float32_unpack_canonical(c, status);
1093     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1094 
1095     return float32_round_pack_canonical(pr, status);
1096 }
1097 
1098 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1099                                                 int flags, float_status *status)
1100 {
1101     FloatParts pa = float64_unpack_canonical(a, status);
1102     FloatParts pb = float64_unpack_canonical(b, status);
1103     FloatParts pc = float64_unpack_canonical(c, status);
1104     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1105 
1106     return float64_round_pack_canonical(pr, status);
1107 }
1108 
1109 /*
1110  * Returns the result of dividing the floating-point value `a' by the
1111  * corresponding value `b'. The operation is performed according to
1112  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1113  */
1114 
1115 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1116 {
1117     bool sign = a.sign ^ b.sign;
1118 
1119     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1120         uint64_t temp_lo, temp_hi;
1121         int exp = a.exp - b.exp;
1122         if (a.frac < b.frac) {
1123             exp -= 1;
1124             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1125                               &temp_hi, &temp_lo);
1126         } else {
1127             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1128                               &temp_hi, &temp_lo);
1129         }
1130         /* LSB of quot is set if inexact which roundandpack will use
1131          * to set flags. Yet again we re-use a for the result */
1132         a.frac = div128To64(temp_lo, temp_hi, b.frac);
1133         a.sign = sign;
1134         a.exp = exp;
1135         return a;
1136     }
1137     /* handle all the NaN cases */
1138     if (is_nan(a.cls) || is_nan(b.cls)) {
1139         return pick_nan(a, b, s);
1140     }
1141     /* 0/0 or Inf/Inf */
1142     if (a.cls == b.cls
1143         &&
1144         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1145         s->float_exception_flags |= float_flag_invalid;
1146         a.cls = float_class_dnan;
1147         return a;
1148     }
1149     /* Div 0 => Inf */
1150     if (b.cls == float_class_zero) {
1151         s->float_exception_flags |= float_flag_divbyzero;
1152         a.cls = float_class_inf;
1153         a.sign = sign;
1154         return a;
1155     }
1156     /* Inf / x or 0 / x */
1157     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1158         a.sign = sign;
1159         return a;
1160     }
1161     /* Div by Inf */
1162     if (b.cls == float_class_inf) {
1163         a.cls = float_class_zero;
1164         a.sign = sign;
1165         return a;
1166     }
1167     g_assert_not_reached();
1168 }
1169 
1170 float16 float16_div(float16 a, float16 b, float_status *status)
1171 {
1172     FloatParts pa = float16_unpack_canonical(a, status);
1173     FloatParts pb = float16_unpack_canonical(b, status);
1174     FloatParts pr = div_floats(pa, pb, status);
1175 
1176     return float16_round_pack_canonical(pr, status);
1177 }
1178 
1179 float32 float32_div(float32 a, float32 b, float_status *status)
1180 {
1181     FloatParts pa = float32_unpack_canonical(a, status);
1182     FloatParts pb = float32_unpack_canonical(b, status);
1183     FloatParts pr = div_floats(pa, pb, status);
1184 
1185     return float32_round_pack_canonical(pr, status);
1186 }
1187 
1188 float64 float64_div(float64 a, float64 b, float_status *status)
1189 {
1190     FloatParts pa = float64_unpack_canonical(a, status);
1191     FloatParts pb = float64_unpack_canonical(b, status);
1192     FloatParts pr = div_floats(pa, pb, status);
1193 
1194     return float64_round_pack_canonical(pr, status);
1195 }
1196 
1197 /*
1198  * Rounds the floating-point value `a' to an integer, and returns the
1199  * result as a floating-point value. The operation is performed
1200  * according to the IEC/IEEE Standard for Binary Floating-Point
1201  * Arithmetic.
1202  */
1203 
1204 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1205 {
1206     if (is_nan(a.cls)) {
1207         return return_nan(a, s);
1208     }
1209 
1210     switch (a.cls) {
1211     case float_class_zero:
1212     case float_class_inf:
1213     case float_class_qnan:
1214         /* already "integral" */
1215         break;
1216     case float_class_normal:
1217         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1218             /* already integral */
1219             break;
1220         }
1221         if (a.exp < 0) {
1222             bool one;
1223             /* all fractional */
1224             s->float_exception_flags |= float_flag_inexact;
1225             switch (rounding_mode) {
1226             case float_round_nearest_even:
1227                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1228                 break;
1229             case float_round_ties_away:
1230                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1231                 break;
1232             case float_round_to_zero:
1233                 one = false;
1234                 break;
1235             case float_round_up:
1236                 one = !a.sign;
1237                 break;
1238             case float_round_down:
1239                 one = a.sign;
1240                 break;
1241             default:
1242                 g_assert_not_reached();
1243             }
1244 
1245             if (one) {
1246                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1247                 a.exp = 0;
1248             } else {
1249                 a.cls = float_class_zero;
1250             }
1251         } else {
1252             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1253             uint64_t frac_lsbm1 = frac_lsb >> 1;
1254             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1255             uint64_t rnd_mask = rnd_even_mask >> 1;
1256             uint64_t inc;
1257 
1258             switch (rounding_mode) {
1259             case float_round_nearest_even:
1260                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1261                 break;
1262             case float_round_ties_away:
1263                 inc = frac_lsbm1;
1264                 break;
1265             case float_round_to_zero:
1266                 inc = 0;
1267                 break;
1268             case float_round_up:
1269                 inc = a.sign ? 0 : rnd_mask;
1270                 break;
1271             case float_round_down:
1272                 inc = a.sign ? rnd_mask : 0;
1273                 break;
1274             default:
1275                 g_assert_not_reached();
1276             }
1277 
1278             if (a.frac & rnd_mask) {
1279                 s->float_exception_flags |= float_flag_inexact;
1280                 a.frac += inc;
1281                 a.frac &= ~rnd_mask;
1282                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1283                     a.frac >>= 1;
1284                     a.exp++;
1285                 }
1286             }
1287         }
1288         break;
1289     default:
1290         g_assert_not_reached();
1291     }
1292     return a;
1293 }
1294 
1295 float16 float16_round_to_int(float16 a, float_status *s)
1296 {
1297     FloatParts pa = float16_unpack_canonical(a, s);
1298     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1299     return float16_round_pack_canonical(pr, s);
1300 }
1301 
1302 float32 float32_round_to_int(float32 a, float_status *s)
1303 {
1304     FloatParts pa = float32_unpack_canonical(a, s);
1305     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1306     return float32_round_pack_canonical(pr, s);
1307 }
1308 
1309 float64 float64_round_to_int(float64 a, float_status *s)
1310 {
1311     FloatParts pa = float64_unpack_canonical(a, s);
1312     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1313     return float64_round_pack_canonical(pr, s);
1314 }
1315 
1316 float64 float64_trunc_to_int(float64 a, float_status *s)
1317 {
1318     FloatParts pa = float64_unpack_canonical(a, s);
1319     FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1320     return float64_round_pack_canonical(pr, s);
1321 }
1322 
1323 /*----------------------------------------------------------------------------
1324 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
1325 | and 7, and returns the properly rounded 32-bit integer corresponding to the
1326 | input.  If `zSign' is 1, the input is negated before being converted to an
1327 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
1328 | is simply rounded to an integer, with the inexact exception raised if the
1329 | input cannot be represented exactly as an integer.  However, if the fixed-
1330 | point input is too large, the invalid exception is raised and the largest
1331 | positive or negative integer is returned.
1332 *----------------------------------------------------------------------------*/
1333 
1334 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
1335 {
1336     int8_t roundingMode;
1337     flag roundNearestEven;
1338     int8_t roundIncrement, roundBits;
1339     int32_t z;
1340 
1341     roundingMode = status->float_rounding_mode;
1342     roundNearestEven = ( roundingMode == float_round_nearest_even );
1343     switch (roundingMode) {
1344     case float_round_nearest_even:
1345     case float_round_ties_away:
1346         roundIncrement = 0x40;
1347         break;
1348     case float_round_to_zero:
1349         roundIncrement = 0;
1350         break;
1351     case float_round_up:
1352         roundIncrement = zSign ? 0 : 0x7f;
1353         break;
1354     case float_round_down:
1355         roundIncrement = zSign ? 0x7f : 0;
1356         break;
1357     default:
1358         abort();
1359     }
1360     roundBits = absZ & 0x7F;
1361     absZ = ( absZ + roundIncrement )>>7;
1362     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
1363     z = absZ;
1364     if ( zSign ) z = - z;
1365     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
1366         float_raise(float_flag_invalid, status);
1367         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
1368     }
1369     if (roundBits) {
1370         status->float_exception_flags |= float_flag_inexact;
1371     }
1372     return z;
1373 
1374 }
1375 
1376 /*----------------------------------------------------------------------------
1377 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
1378 | `absZ1', with binary point between bits 63 and 64 (between the input words),
1379 | and returns the properly rounded 64-bit integer corresponding to the input.
1380 | If `zSign' is 1, the input is negated before being converted to an integer.
1381 | Ordinarily, the fixed-point input is simply rounded to an integer, with
1382 | the inexact exception raised if the input cannot be represented exactly as
1383 | an integer.  However, if the fixed-point input is too large, the invalid
1384 | exception is raised and the largest positive or negative integer is
1385 | returned.
1386 *----------------------------------------------------------------------------*/
1387 
1388 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
1389                                float_status *status)
1390 {
1391     int8_t roundingMode;
1392     flag roundNearestEven, increment;
1393     int64_t z;
1394 
1395     roundingMode = status->float_rounding_mode;
1396     roundNearestEven = ( roundingMode == float_round_nearest_even );
1397     switch (roundingMode) {
1398     case float_round_nearest_even:
1399     case float_round_ties_away:
1400         increment = ((int64_t) absZ1 < 0);
1401         break;
1402     case float_round_to_zero:
1403         increment = 0;
1404         break;
1405     case float_round_up:
1406         increment = !zSign && absZ1;
1407         break;
1408     case float_round_down:
1409         increment = zSign && absZ1;
1410         break;
1411     default:
1412         abort();
1413     }
1414     if ( increment ) {
1415         ++absZ0;
1416         if ( absZ0 == 0 ) goto overflow;
1417         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
1418     }
1419     z = absZ0;
1420     if ( zSign ) z = - z;
1421     if ( z && ( ( z < 0 ) ^ zSign ) ) {
1422  overflow:
1423         float_raise(float_flag_invalid, status);
1424         return
1425               zSign ? (int64_t) LIT64( 0x8000000000000000 )
1426             : LIT64( 0x7FFFFFFFFFFFFFFF );
1427     }
1428     if (absZ1) {
1429         status->float_exception_flags |= float_flag_inexact;
1430     }
1431     return z;
1432 
1433 }
1434 
1435 /*----------------------------------------------------------------------------
1436 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
1437 | `absZ1', with binary point between bits 63 and 64 (between the input words),
1438 | and returns the properly rounded 64-bit unsigned integer corresponding to the
1439 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
1440 | with the inexact exception raised if the input cannot be represented exactly
1441 | as an integer.  However, if the fixed-point input is too large, the invalid
1442 | exception is raised and the largest unsigned integer is returned.
1443 *----------------------------------------------------------------------------*/
1444 
1445 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
1446                                 uint64_t absZ1, float_status *status)
1447 {
1448     int8_t roundingMode;
1449     flag roundNearestEven, increment;
1450 
1451     roundingMode = status->float_rounding_mode;
1452     roundNearestEven = (roundingMode == float_round_nearest_even);
1453     switch (roundingMode) {
1454     case float_round_nearest_even:
1455     case float_round_ties_away:
1456         increment = ((int64_t)absZ1 < 0);
1457         break;
1458     case float_round_to_zero:
1459         increment = 0;
1460         break;
1461     case float_round_up:
1462         increment = !zSign && absZ1;
1463         break;
1464     case float_round_down:
1465         increment = zSign && absZ1;
1466         break;
1467     default:
1468         abort();
1469     }
1470     if (increment) {
1471         ++absZ0;
1472         if (absZ0 == 0) {
1473             float_raise(float_flag_invalid, status);
1474             return LIT64(0xFFFFFFFFFFFFFFFF);
1475         }
1476         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
1477     }
1478 
1479     if (zSign && absZ0) {
1480         float_raise(float_flag_invalid, status);
1481         return 0;
1482     }
1483 
1484     if (absZ1) {
1485         status->float_exception_flags |= float_flag_inexact;
1486     }
1487     return absZ0;
1488 }
1489 
1490 /*----------------------------------------------------------------------------
1491 | If `a' is denormal and we are in flush-to-zero mode then set the
1492 | input-denormal exception and return zero. Otherwise just return the value.
1493 *----------------------------------------------------------------------------*/
1494 float32 float32_squash_input_denormal(float32 a, float_status *status)
1495 {
1496     if (status->flush_inputs_to_zero) {
1497         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
1498             float_raise(float_flag_input_denormal, status);
1499             return make_float32(float32_val(a) & 0x80000000);
1500         }
1501     }
1502     return a;
1503 }
1504 
/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision value whose denormalized
| significand is `aSig'.  The significand is shifted left until its leading
| one reaches bit 23; the shifted significand is stored through `zSigPtr'
| and the matching exponent (1 minus the shift distance) through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /* 8 leading zeros correspond to an already-normalized significand */
    const int8_t leftShift = countLeadingZeros32(aSig) - 8;

    *zSigPtr = aSig << leftShift;
    *zExpPtr = 1 - leftShift;
}
1522 
1523 /*----------------------------------------------------------------------------
1524 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
1525 | single-precision floating-point value, returning the result.  After being
1526 | shifted into the proper positions, the three fields are simply added
1527 | together to form the result.  This means that any integer portion of `zSig'
1528 | will be added into the exponent.  Since a properly normalized significand
1529 | will have an integer portion equal to 1, the `zExp' input should be 1 less
1530 | than the desired result exponent whenever `zSig' is a complete, normalized
1531 | significand.
1532 *----------------------------------------------------------------------------*/
1533 
1534 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
1535 {
1536 
1537     return make_float32(
1538           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
1539 
1540 }
1541 
1542 /*----------------------------------------------------------------------------
1543 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1544 | and significand `zSig', and returns the proper single-precision floating-
1545 | point value corresponding to the abstract input.  Ordinarily, the abstract
1546 | value is simply rounded and packed into the single-precision format, with
1547 | the inexact exception raised if the abstract input cannot be represented
1548 | exactly.  However, if the abstract value is too large, the overflow and
1549 | inexact exceptions are raised and an infinity or maximal finite value is
1550 | returned.  If the abstract value is too small, the input value is rounded to
1551 | a subnormal number, and the underflow and inexact exceptions are raised if
1552 | the abstract input cannot be represented exactly as a subnormal single-
1553 | precision floating-point number.
1554 |     The input significand `zSig' has its binary point between bits 30
1555 | and 29, which is 7 bits to the left of the usual location.  This shifted
1556 | significand must be normalized or smaller.  If `zSig' is not normalized,
1557 | `zExp' must be 0; in that case, the result returned is a subnormal number,
1558 | and it must not require rounding.  In the usual case that `zSig' is
1559 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
1560 | The handling of underflow and overflow follows the IEC/IEEE Standard for
1561 | Binary Floating-Point Arithmetic.
1562 *----------------------------------------------------------------------------*/
1563 
1564 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
1565                                    float_status *status)
1566 {
1567     int8_t roundingMode;
1568     flag roundNearestEven;
1569     int8_t roundIncrement, roundBits;
1570     flag isTiny;
1571 
1572     roundingMode = status->float_rounding_mode;
1573     roundNearestEven = ( roundingMode == float_round_nearest_even );
1574     switch (roundingMode) {
1575     case float_round_nearest_even:
1576     case float_round_ties_away:
1577         roundIncrement = 0x40;
1578         break;
1579     case float_round_to_zero:
1580         roundIncrement = 0;
1581         break;
1582     case float_round_up:
1583         roundIncrement = zSign ? 0 : 0x7f;
1584         break;
1585     case float_round_down:
1586         roundIncrement = zSign ? 0x7f : 0;
1587         break;
1588     default:
1589         abort();
1590         break;
1591     }
1592     roundBits = zSig & 0x7F;
1593     if ( 0xFD <= (uint16_t) zExp ) {
1594         if (    ( 0xFD < zExp )
1595              || (    ( zExp == 0xFD )
1596                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
1597            ) {
1598             float_raise(float_flag_overflow | float_flag_inexact, status);
1599             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
1600         }
1601         if ( zExp < 0 ) {
1602             if (status->flush_to_zero) {
1603                 float_raise(float_flag_output_denormal, status);
1604                 return packFloat32(zSign, 0, 0);
1605             }
1606             isTiny =
1607                 (status->float_detect_tininess
1608                  == float_tininess_before_rounding)
1609                 || ( zExp < -1 )
1610                 || ( zSig + roundIncrement < 0x80000000 );
1611             shift32RightJamming( zSig, - zExp, &zSig );
1612             zExp = 0;
1613             roundBits = zSig & 0x7F;
1614             if (isTiny && roundBits) {
1615                 float_raise(float_flag_underflow, status);
1616             }
1617         }
1618     }
1619     if (roundBits) {
1620         status->float_exception_flags |= float_flag_inexact;
1621     }
1622     zSig = ( zSig + roundIncrement )>>7;
1623     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
1624     if ( zSig == 0 ) zExp = 0;
1625     return packFloat32( zSign, zExp, zSig );
1626 
1627 }
1628 
1629 /*----------------------------------------------------------------------------
1630 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1631 | and significand `zSig', and returns the proper single-precision floating-
1632 | point value corresponding to the abstract input.  This routine is just like
1633 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
1634 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
1635 | floating-point exponent.
1636 *----------------------------------------------------------------------------*/
1637 
1638 static float32
1639  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
1640                               float_status *status)
1641 {
1642     int8_t shiftCount;
1643 
1644     shiftCount = countLeadingZeros32( zSig ) - 1;
1645     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
1646                                status);
1647 
1648 }
1649 
1650 /*----------------------------------------------------------------------------
1651 | If `a' is denormal and we are in flush-to-zero mode then set the
1652 | input-denormal exception and return zero. Otherwise just return the value.
1653 *----------------------------------------------------------------------------*/
1654 float64 float64_squash_input_denormal(float64 a, float_status *status)
1655 {
1656     if (status->flush_inputs_to_zero) {
1657         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
1658             float_raise(float_flag_input_denormal, status);
1659             return make_float64(float64_val(a) & (1ULL << 63));
1660         }
1661     }
1662     return a;
1663 }
1664 
/*----------------------------------------------------------------------------
| Normalizes a subnormal double-precision value whose denormalized
| significand is `aSig'.  The significand is shifted left until its leading 1
| sits in bit 52 and is stored through `zSigPtr'; the exponent that
| compensates for the shift is stored through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    /* Bits 63..53 (11 of them) stay clear above the leading 1. */
    int8_t lshift = countLeadingZeros64(aSig) - 11;

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
1682 
1683 /*----------------------------------------------------------------------------
1684 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
1685 | double-precision floating-point value, returning the result.  After being
1686 | shifted into the proper positions, the three fields are simply added
1687 | together to form the result.  This means that any integer portion of `zSig'
1688 | will be added into the exponent.  Since a properly normalized significand
1689 | will have an integer portion equal to 1, the `zExp' input should be 1 less
1690 | than the desired result exponent whenever `zSig' is a complete, normalized
1691 | significand.
1692 *----------------------------------------------------------------------------*/
1693 
1694 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
1695 {
1696 
1697     return make_float64(
1698         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
1699 
1700 }
1701 
1702 /*----------------------------------------------------------------------------
1703 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1704 | and significand `zSig', and returns the proper double-precision floating-
1705 | point value corresponding to the abstract input.  Ordinarily, the abstract
1706 | value is simply rounded and packed into the double-precision format, with
1707 | the inexact exception raised if the abstract input cannot be represented
1708 | exactly.  However, if the abstract value is too large, the overflow and
1709 | inexact exceptions are raised and an infinity or maximal finite value is
1710 | returned.  If the abstract value is too small, the input value is rounded to
1711 | a subnormal number, and the underflow and inexact exceptions are raised if
1712 | the abstract input cannot be represented exactly as a subnormal double-
1713 | precision floating-point number.
1714 |     The input significand `zSig' has its binary point between bits 62
1715 | and 61, which is 10 bits to the left of the usual location.  This shifted
1716 | significand must be normalized or smaller.  If `zSig' is not normalized,
1717 | `zExp' must be 0; in that case, the result returned is a subnormal number,
1718 | and it must not require rounding.  In the usual case that `zSig' is
1719 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
1720 | The handling of underflow and overflow follows the IEC/IEEE Standard for
1721 | Binary Floating-Point Arithmetic.
1722 *----------------------------------------------------------------------------*/
1723 
1724 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
1725                                    float_status *status)
1726 {
1727     int8_t roundingMode;
1728     flag roundNearestEven;
1729     int roundIncrement, roundBits;
1730     flag isTiny;
1731 
1732     roundingMode = status->float_rounding_mode;
1733     roundNearestEven = ( roundingMode == float_round_nearest_even );
1734     switch (roundingMode) {
1735     case float_round_nearest_even:
1736     case float_round_ties_away:
1737         roundIncrement = 0x200;
1738         break;
1739     case float_round_to_zero:
1740         roundIncrement = 0;
1741         break;
1742     case float_round_up:
1743         roundIncrement = zSign ? 0 : 0x3ff;
1744         break;
1745     case float_round_down:
1746         roundIncrement = zSign ? 0x3ff : 0;
1747         break;
1748     case float_round_to_odd:
1749         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
1750         break;
1751     default:
1752         abort();
1753     }
1754     roundBits = zSig & 0x3FF;
1755     if ( 0x7FD <= (uint16_t) zExp ) {
1756         if (    ( 0x7FD < zExp )
1757              || (    ( zExp == 0x7FD )
1758                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
1759            ) {
1760             bool overflow_to_inf = roundingMode != float_round_to_odd &&
1761                                    roundIncrement != 0;
1762             float_raise(float_flag_overflow | float_flag_inexact, status);
1763             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
1764         }
1765         if ( zExp < 0 ) {
1766             if (status->flush_to_zero) {
1767                 float_raise(float_flag_output_denormal, status);
1768                 return packFloat64(zSign, 0, 0);
1769             }
1770             isTiny =
1771                    (status->float_detect_tininess
1772                     == float_tininess_before_rounding)
1773                 || ( zExp < -1 )
1774                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
1775             shift64RightJamming( zSig, - zExp, &zSig );
1776             zExp = 0;
1777             roundBits = zSig & 0x3FF;
1778             if (isTiny && roundBits) {
1779                 float_raise(float_flag_underflow, status);
1780             }
1781             if (roundingMode == float_round_to_odd) {
1782                 /*
1783                  * For round-to-odd case, the roundIncrement depends on
1784                  * zSig which just changed.
1785                  */
1786                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
1787             }
1788         }
1789     }
1790     if (roundBits) {
1791         status->float_exception_flags |= float_flag_inexact;
1792     }
1793     zSig = ( zSig + roundIncrement )>>10;
1794     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
1795     if ( zSig == 0 ) zExp = 0;
1796     return packFloat64( zSign, zExp, zSig );
1797 
1798 }
1799 
1800 /*----------------------------------------------------------------------------
1801 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1802 | and significand `zSig', and returns the proper double-precision floating-
1803 | point value corresponding to the abstract input.  This routine is just like
1804 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
1805 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
1806 | floating-point exponent.
1807 *----------------------------------------------------------------------------*/
1808 
1809 static float64
1810  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
1811                               float_status *status)
1812 {
1813     int8_t shiftCount;
1814 
1815     shiftCount = countLeadingZeros64( zSig ) - 1;
1816     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
1817                                status);
1818 
1819 }
1820 
1821 /*----------------------------------------------------------------------------
1822 | Returns the fraction bits of the extended double-precision floating-point
1823 | value `a'.
1824 *----------------------------------------------------------------------------*/
1825 
1826 static inline uint64_t extractFloatx80Frac( floatx80 a )
1827 {
1828 
1829     return a.low;
1830 
1831 }
1832 
1833 /*----------------------------------------------------------------------------
1834 | Returns the exponent bits of the extended double-precision floating-point
1835 | value `a'.
1836 *----------------------------------------------------------------------------*/
1837 
1838 static inline int32_t extractFloatx80Exp( floatx80 a )
1839 {
1840 
1841     return a.high & 0x7FFF;
1842 
1843 }
1844 
1845 /*----------------------------------------------------------------------------
1846 | Returns the sign bit of the extended double-precision floating-point value
1847 | `a'.
1848 *----------------------------------------------------------------------------*/
1849 
1850 static inline flag extractFloatx80Sign( floatx80 a )
1851 {
1852 
1853     return a.high>>15;
1854 
1855 }
1856 
/*----------------------------------------------------------------------------
| Normalizes a subnormal extended double-precision value whose denormalized
| significand is `aSig'.  The significand is shifted left until its leading 1
| reaches bit 63 and is stored through `zSigPtr'; the exponent that
| compensates for the shift is stored through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr )
{
    /* No bit is reserved above the leading 1, so shift by every zero. */
    int8_t lshift = countLeadingZeros64(aSig);

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
1874 
1875 /*----------------------------------------------------------------------------
1876 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
1877 | extended double-precision floating-point value, returning the result.
1878 *----------------------------------------------------------------------------*/
1879 
1880 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
1881 {
1882     floatx80 z;
1883 
1884     z.low = zSig;
1885     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
1886     return z;
1887 
1888 }
1889 
1890 /*----------------------------------------------------------------------------
1891 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1892 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
1893 | and returns the proper extended double-precision floating-point value
1894 | corresponding to the abstract input.  Ordinarily, the abstract value is
1895 | rounded and packed into the extended double-precision format, with the
1896 | inexact exception raised if the abstract input cannot be represented
1897 | exactly.  However, if the abstract value is too large, the overflow and
1898 | inexact exceptions are raised and an infinity or maximal finite value is
1899 | returned.  If the abstract value is too small, the input value is rounded to
1900 | a subnormal number, and the underflow and inexact exceptions are raised if
1901 | the abstract input cannot be represented exactly as a subnormal extended
1902 | double-precision floating-point number.
1903 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
1904 | number of bits as single or double precision, respectively.  Otherwise, the
1905 | result is rounded to the full precision of the extended double-precision
1906 | format.
1907 |     The input significand must be normalized or smaller.  If the input
1908 | significand is not normalized, `zExp' must be 0; in that case, the result
1909 | returned is a subnormal number, and it must not require rounding.  The
1910 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
1911 | Floating-Point Arithmetic.
1912 *----------------------------------------------------------------------------*/
1913 
1914 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
1915                                      int32_t zExp, uint64_t zSig0, uint64_t zSig1,
1916                                      float_status *status)
1917 {
1918     int8_t roundingMode;
1919     flag roundNearestEven, increment, isTiny;
1920     int64_t roundIncrement, roundMask, roundBits;
1921 
1922     roundingMode = status->float_rounding_mode;
1923     roundNearestEven = ( roundingMode == float_round_nearest_even );
1924     if ( roundingPrecision == 80 ) goto precision80;
1925     if ( roundingPrecision == 64 ) {
1926         roundIncrement = LIT64( 0x0000000000000400 );
1927         roundMask = LIT64( 0x00000000000007FF );
1928     }
1929     else if ( roundingPrecision == 32 ) {
1930         roundIncrement = LIT64( 0x0000008000000000 );
1931         roundMask = LIT64( 0x000000FFFFFFFFFF );
1932     }
1933     else {
1934         goto precision80;
1935     }
1936     zSig0 |= ( zSig1 != 0 );
1937     switch (roundingMode) {
1938     case float_round_nearest_even:
1939     case float_round_ties_away:
1940         break;
1941     case float_round_to_zero:
1942         roundIncrement = 0;
1943         break;
1944     case float_round_up:
1945         roundIncrement = zSign ? 0 : roundMask;
1946         break;
1947     case float_round_down:
1948         roundIncrement = zSign ? roundMask : 0;
1949         break;
1950     default:
1951         abort();
1952     }
1953     roundBits = zSig0 & roundMask;
1954     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
1955         if (    ( 0x7FFE < zExp )
1956              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
1957            ) {
1958             goto overflow;
1959         }
1960         if ( zExp <= 0 ) {
1961             if (status->flush_to_zero) {
1962                 float_raise(float_flag_output_denormal, status);
1963                 return packFloatx80(zSign, 0, 0);
1964             }
1965             isTiny =
1966                    (status->float_detect_tininess
1967                     == float_tininess_before_rounding)
1968                 || ( zExp < 0 )
1969                 || ( zSig0 <= zSig0 + roundIncrement );
1970             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
1971             zExp = 0;
1972             roundBits = zSig0 & roundMask;
1973             if (isTiny && roundBits) {
1974                 float_raise(float_flag_underflow, status);
1975             }
1976             if (roundBits) {
1977                 status->float_exception_flags |= float_flag_inexact;
1978             }
1979             zSig0 += roundIncrement;
1980             if ( (int64_t) zSig0 < 0 ) zExp = 1;
1981             roundIncrement = roundMask + 1;
1982             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
1983                 roundMask |= roundIncrement;
1984             }
1985             zSig0 &= ~ roundMask;
1986             return packFloatx80( zSign, zExp, zSig0 );
1987         }
1988     }
1989     if (roundBits) {
1990         status->float_exception_flags |= float_flag_inexact;
1991     }
1992     zSig0 += roundIncrement;
1993     if ( zSig0 < roundIncrement ) {
1994         ++zExp;
1995         zSig0 = LIT64( 0x8000000000000000 );
1996     }
1997     roundIncrement = roundMask + 1;
1998     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
1999         roundMask |= roundIncrement;
2000     }
2001     zSig0 &= ~ roundMask;
2002     if ( zSig0 == 0 ) zExp = 0;
2003     return packFloatx80( zSign, zExp, zSig0 );
2004  precision80:
2005     switch (roundingMode) {
2006     case float_round_nearest_even:
2007     case float_round_ties_away:
2008         increment = ((int64_t)zSig1 < 0);
2009         break;
2010     case float_round_to_zero:
2011         increment = 0;
2012         break;
2013     case float_round_up:
2014         increment = !zSign && zSig1;
2015         break;
2016     case float_round_down:
2017         increment = zSign && zSig1;
2018         break;
2019     default:
2020         abort();
2021     }
2022     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2023         if (    ( 0x7FFE < zExp )
2024              || (    ( zExp == 0x7FFE )
2025                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2026                   && increment
2027                 )
2028            ) {
2029             roundMask = 0;
2030  overflow:
2031             float_raise(float_flag_overflow | float_flag_inexact, status);
2032             if (    ( roundingMode == float_round_to_zero )
2033                  || ( zSign && ( roundingMode == float_round_up ) )
2034                  || ( ! zSign && ( roundingMode == float_round_down ) )
2035                ) {
2036                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2037             }
2038             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2039         }
2040         if ( zExp <= 0 ) {
2041             isTiny =
2042                    (status->float_detect_tininess
2043                     == float_tininess_before_rounding)
2044                 || ( zExp < 0 )
2045                 || ! increment
2046                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2047             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2048             zExp = 0;
2049             if (isTiny && zSig1) {
2050                 float_raise(float_flag_underflow, status);
2051             }
2052             if (zSig1) {
2053                 status->float_exception_flags |= float_flag_inexact;
2054             }
2055             switch (roundingMode) {
2056             case float_round_nearest_even:
2057             case float_round_ties_away:
2058                 increment = ((int64_t)zSig1 < 0);
2059                 break;
2060             case float_round_to_zero:
2061                 increment = 0;
2062                 break;
2063             case float_round_up:
2064                 increment = !zSign && zSig1;
2065                 break;
2066             case float_round_down:
2067                 increment = zSign && zSig1;
2068                 break;
2069             default:
2070                 abort();
2071             }
2072             if ( increment ) {
2073                 ++zSig0;
2074                 zSig0 &=
2075                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2076                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2077             }
2078             return packFloatx80( zSign, zExp, zSig0 );
2079         }
2080     }
2081     if (zSig1) {
2082         status->float_exception_flags |= float_flag_inexact;
2083     }
2084     if ( increment ) {
2085         ++zSig0;
2086         if ( zSig0 == 0 ) {
2087             ++zExp;
2088             zSig0 = LIT64( 0x8000000000000000 );
2089         }
2090         else {
2091             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2092         }
2093     }
2094     else {
2095         if ( zSig0 == 0 ) zExp = 0;
2096     }
2097     return packFloatx80( zSign, zExp, zSig0 );
2098 
2099 }
2100 
2101 /*----------------------------------------------------------------------------
2102 | Takes an abstract floating-point value having sign `zSign', exponent
2103 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2104 | and returns the proper extended double-precision floating-point value
2105 | corresponding to the abstract input.  This routine is just like
2106 | `roundAndPackFloatx80' except that the input significand does not have to be
2107 | normalized.
2108 *----------------------------------------------------------------------------*/
2109 
2110 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2111                                               flag zSign, int32_t zExp,
2112                                               uint64_t zSig0, uint64_t zSig1,
2113                                               float_status *status)
2114 {
2115     int8_t shiftCount;
2116 
2117     if ( zSig0 == 0 ) {
2118         zSig0 = zSig1;
2119         zSig1 = 0;
2120         zExp -= 64;
2121     }
2122     shiftCount = countLeadingZeros64( zSig0 );
2123     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2124     zExp -= shiftCount;
2125     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2126                                 zSig0, zSig1, status);
2127 
2128 }
2129 
2130 /*----------------------------------------------------------------------------
2131 | Returns the least-significant 64 fraction bits of the quadruple-precision
2132 | floating-point value `a'.
2133 *----------------------------------------------------------------------------*/
2134 
2135 static inline uint64_t extractFloat128Frac1( float128 a )
2136 {
2137 
2138     return a.low;
2139 
2140 }
2141 
2142 /*----------------------------------------------------------------------------
2143 | Returns the most-significant 48 fraction bits of the quadruple-precision
2144 | floating-point value `a'.
2145 *----------------------------------------------------------------------------*/
2146 
2147 static inline uint64_t extractFloat128Frac0( float128 a )
2148 {
2149 
2150     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2151 
2152 }
2153 
2154 /*----------------------------------------------------------------------------
2155 | Returns the exponent bits of the quadruple-precision floating-point value
2156 | `a'.
2157 *----------------------------------------------------------------------------*/
2158 
2159 static inline int32_t extractFloat128Exp( float128 a )
2160 {
2161 
2162     return ( a.high>>48 ) & 0x7FFF;
2163 
2164 }
2165 
2166 /*----------------------------------------------------------------------------
2167 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2168 *----------------------------------------------------------------------------*/
2169 
2170 static inline flag extractFloat128Sign( float128 a )
2171 {
2172 
2173     return a.high>>63;
2174 
2175 }
2176 
/*----------------------------------------------------------------------------
| Normalizes a subnormal quadruple-precision value whose denormalized
| significand is the concatenation of `aSig0' and `aSig1'.  The normalized
| exponent is stored through `zExpPtr'.  The most significant 49 bits of the
| normalized significand are stored through `zSig0Ptr', and the least
| significant 64 bits through `zSig1Ptr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t lshift;

    if (aSig0 != 0) {
        /* Leading 1 is in the high word: one short 128-bit left shift. */
        lshift = countLeadingZeros64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, lshift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - lshift;
        return;
    }

    /*
     * Leading 1 is in the low word.  `lshift' is its distance from bit 48
     * of that word; the extra 64 bits up into the high word are folded into
     * the exponent adjustment below.
     */
    lshift = countLeadingZeros64(aSig1) - 15;
    if (lshift < 0) {
        /* The low word straddles both output words. */
        *zSig0Ptr = aSig1 >> (-lshift);
        *zSig1Ptr = aSig1 << (lshift & 63);
    } else {
        *zSig0Ptr = aSig1 << lshift;
        *zSig1Ptr = 0;
    }
    *zExpPtr = -lshift - 63;
}
2217 
2218 /*----------------------------------------------------------------------------
2219 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2220 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2221 | floating-point value, returning the result.  After being shifted into the
2222 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2223 | added together to form the most significant 32 bits of the result.  This
2224 | means that any integer portion of `zSig0' will be added into the exponent.
2225 | Since a properly normalized significand will have an integer portion equal
2226 | to 1, the `zExp' input should be 1 less than the desired result exponent
2227 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2228 | significand.
2229 *----------------------------------------------------------------------------*/
2230 
2231 static inline float128
2232  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
2233 {
2234     float128 z;
2235 
2236     z.low = zSig1;
2237     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
2238     return z;
2239 
2240 }
2241 
2242 /*----------------------------------------------------------------------------
2243 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2244 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2245 | and `zSig2', and returns the proper quadruple-precision floating-point value
2246 | corresponding to the abstract input.  Ordinarily, the abstract value is
2247 | simply rounded and packed into the quadruple-precision format, with the
2248 | inexact exception raised if the abstract input cannot be represented
2249 | exactly.  However, if the abstract value is too large, the overflow and
2250 | inexact exceptions are raised and an infinity or maximal finite value is
2251 | returned.  If the abstract value is too small, the input value is rounded to
2252 | a subnormal number, and the underflow and inexact exceptions are raised if
2253 | the abstract input cannot be represented exactly as a subnormal quadruple-
2254 | precision floating-point number.
2255 |     The input significand must be normalized or smaller.  If the input
2256 | significand is not normalized, `zExp' must be 0; in that case, the result
2257 | returned is a subnormal number, and it must not require rounding.  In the
2258 | usual case that the input significand is normalized, `zExp' must be 1 less
2259 | than the ``true'' floating-point exponent.  The handling of underflow and
2260 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2261 *----------------------------------------------------------------------------*/
2262 
2263 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
2264                                      uint64_t zSig0, uint64_t zSig1,
2265                                      uint64_t zSig2, float_status *status)
2266 {
2267     int8_t roundingMode;
2268     flag roundNearestEven, increment, isTiny;
2269 
2270     roundingMode = status->float_rounding_mode;
2271     roundNearestEven = ( roundingMode == float_round_nearest_even );
2272     switch (roundingMode) {
2273     case float_round_nearest_even:
2274     case float_round_ties_away:
2275         increment = ((int64_t)zSig2 < 0);
2276         break;
2277     case float_round_to_zero:
2278         increment = 0;
2279         break;
2280     case float_round_up:
2281         increment = !zSign && zSig2;
2282         break;
2283     case float_round_down:
2284         increment = zSign && zSig2;
2285         break;
2286     case float_round_to_odd:
2287         increment = !(zSig1 & 0x1) && zSig2;
2288         break;
2289     default:
2290         abort();
2291     }
2292     if ( 0x7FFD <= (uint32_t) zExp ) {
2293         if (    ( 0x7FFD < zExp )
2294              || (    ( zExp == 0x7FFD )
2295                   && eq128(
2296                          LIT64( 0x0001FFFFFFFFFFFF ),
2297                          LIT64( 0xFFFFFFFFFFFFFFFF ),
2298                          zSig0,
2299                          zSig1
2300                      )
2301                   && increment
2302                 )
2303            ) {
2304             float_raise(float_flag_overflow | float_flag_inexact, status);
2305             if (    ( roundingMode == float_round_to_zero )
2306                  || ( zSign && ( roundingMode == float_round_up ) )
2307                  || ( ! zSign && ( roundingMode == float_round_down ) )
2308                  || (roundingMode == float_round_to_odd)
2309                ) {
2310                 return
2311                     packFloat128(
2312                         zSign,
2313                         0x7FFE,
2314                         LIT64( 0x0000FFFFFFFFFFFF ),
2315                         LIT64( 0xFFFFFFFFFFFFFFFF )
2316                     );
2317             }
2318             return packFloat128( zSign, 0x7FFF, 0, 0 );
2319         }
2320         if ( zExp < 0 ) {
2321             if (status->flush_to_zero) {
2322                 float_raise(float_flag_output_denormal, status);
2323                 return packFloat128(zSign, 0, 0, 0);
2324             }
2325             isTiny =
2326                    (status->float_detect_tininess
2327                     == float_tininess_before_rounding)
2328                 || ( zExp < -1 )
2329                 || ! increment
2330                 || lt128(
2331                        zSig0,
2332                        zSig1,
2333                        LIT64( 0x0001FFFFFFFFFFFF ),
2334                        LIT64( 0xFFFFFFFFFFFFFFFF )
2335                    );
2336             shift128ExtraRightJamming(
2337                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
2338             zExp = 0;
2339             if (isTiny && zSig2) {
2340                 float_raise(float_flag_underflow, status);
2341             }
2342             switch (roundingMode) {
2343             case float_round_nearest_even:
2344             case float_round_ties_away:
2345                 increment = ((int64_t)zSig2 < 0);
2346                 break;
2347             case float_round_to_zero:
2348                 increment = 0;
2349                 break;
2350             case float_round_up:
2351                 increment = !zSign && zSig2;
2352                 break;
2353             case float_round_down:
2354                 increment = zSign && zSig2;
2355                 break;
2356             case float_round_to_odd:
2357                 increment = !(zSig1 & 0x1) && zSig2;
2358                 break;
2359             default:
2360                 abort();
2361             }
2362         }
2363     }
2364     if (zSig2) {
2365         status->float_exception_flags |= float_flag_inexact;
2366     }
2367     if ( increment ) {
2368         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
2369         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
2370     }
2371     else {
2372         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
2373     }
2374     return packFloat128( zSign, zExp, zSig0, zSig1 );
2375 
2376 }
2377 
2378 /*----------------------------------------------------------------------------
2379 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2380 | and significand formed by the concatenation of `zSig0' and `zSig1', and
2381 | returns the proper quadruple-precision floating-point value corresponding
2382 | to the abstract input.  This routine is just like `roundAndPackFloat128'
2383 | except that the input significand has fewer bits and does not have to be
2384 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
2385 | point exponent.
2386 *----------------------------------------------------------------------------*/
2387 
2388 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
2389                                               uint64_t zSig0, uint64_t zSig1,
2390                                               float_status *status)
2391 {
2392     int8_t shiftCount;
2393     uint64_t zSig2;
2394 
2395     if ( zSig0 == 0 ) {
2396         zSig0 = zSig1;
2397         zSig1 = 0;
2398         zExp -= 64;
2399     }
2400     shiftCount = countLeadingZeros64( zSig0 ) - 15;
2401     if ( 0 <= shiftCount ) {
2402         zSig2 = 0;
2403         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2404     }
2405     else {
2406         shift128ExtraRightJamming(
2407             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
2408     }
2409     zExp -= shiftCount;
2410     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
2411 
2412 }
2413 
2414 /*----------------------------------------------------------------------------
2415 | Returns the result of converting the 32-bit two's complement integer `a'
2416 | to the single-precision floating-point format.  The conversion is performed
2417 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2418 *----------------------------------------------------------------------------*/
2419 
2420 float32 int32_to_float32(int32_t a, float_status *status)
2421 {
2422     flag zSign;
2423 
2424     if ( a == 0 ) return float32_zero;
2425     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
2426     zSign = ( a < 0 );
2427     return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
2428 }
2429 
2430 /*----------------------------------------------------------------------------
2431 | Returns the result of converting the 32-bit two's complement integer `a'
2432 | to the double-precision floating-point format.  The conversion is performed
2433 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2434 *----------------------------------------------------------------------------*/
2435 
2436 float64 int32_to_float64(int32_t a, float_status *status)
2437 {
2438     flag zSign;
2439     uint32_t absA;
2440     int8_t shiftCount;
2441     uint64_t zSig;
2442 
2443     if ( a == 0 ) return float64_zero;
2444     zSign = ( a < 0 );
2445     absA = zSign ? - a : a;
2446     shiftCount = countLeadingZeros32( absA ) + 21;
2447     zSig = absA;
2448     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
2449 
2450 }
2451 
2452 /*----------------------------------------------------------------------------
2453 | Returns the result of converting the 32-bit two's complement integer `a'
2454 | to the extended double-precision floating-point format.  The conversion
2455 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2456 | Arithmetic.
2457 *----------------------------------------------------------------------------*/
2458 
2459 floatx80 int32_to_floatx80(int32_t a, float_status *status)
2460 {
2461     flag zSign;
2462     uint32_t absA;
2463     int8_t shiftCount;
2464     uint64_t zSig;
2465 
2466     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
2467     zSign = ( a < 0 );
2468     absA = zSign ? - a : a;
2469     shiftCount = countLeadingZeros32( absA ) + 32;
2470     zSig = absA;
2471     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
2472 
2473 }
2474 
2475 /*----------------------------------------------------------------------------
2476 | Returns the result of converting the 32-bit two's complement integer `a' to
2477 | the quadruple-precision floating-point format.  The conversion is performed
2478 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2479 *----------------------------------------------------------------------------*/
2480 
2481 float128 int32_to_float128(int32_t a, float_status *status)
2482 {
2483     flag zSign;
2484     uint32_t absA;
2485     int8_t shiftCount;
2486     uint64_t zSig0;
2487 
2488     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
2489     zSign = ( a < 0 );
2490     absA = zSign ? - a : a;
2491     shiftCount = countLeadingZeros32( absA ) + 17;
2492     zSig0 = absA;
2493     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
2494 
2495 }
2496 
2497 /*----------------------------------------------------------------------------
2498 | Returns the result of converting the 64-bit two's complement integer `a'
2499 | to the single-precision floating-point format.  The conversion is performed
2500 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2501 *----------------------------------------------------------------------------*/
2502 
2503 float32 int64_to_float32(int64_t a, float_status *status)
2504 {
2505     flag zSign;
2506     uint64_t absA;
2507     int8_t shiftCount;
2508 
2509     if ( a == 0 ) return float32_zero;
2510     zSign = ( a < 0 );
2511     absA = zSign ? - a : a;
2512     shiftCount = countLeadingZeros64( absA ) - 40;
2513     if ( 0 <= shiftCount ) {
2514         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
2515     }
2516     else {
2517         shiftCount += 7;
2518         if ( shiftCount < 0 ) {
2519             shift64RightJamming( absA, - shiftCount, &absA );
2520         }
2521         else {
2522             absA <<= shiftCount;
2523         }
2524         return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
2525     }
2526 
2527 }
2528 
2529 /*----------------------------------------------------------------------------
2530 | Returns the result of converting the 64-bit two's complement integer `a'
2531 | to the double-precision floating-point format.  The conversion is performed
2532 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2533 *----------------------------------------------------------------------------*/
2534 
2535 float64 int64_to_float64(int64_t a, float_status *status)
2536 {
2537     flag zSign;
2538 
2539     if ( a == 0 ) return float64_zero;
2540     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
2541         return packFloat64( 1, 0x43E, 0 );
2542     }
2543     zSign = ( a < 0 );
2544     return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
2545 }
2546 
2547 /*----------------------------------------------------------------------------
2548 | Returns the result of converting the 64-bit two's complement integer `a'
2549 | to the extended double-precision floating-point format.  The conversion
2550 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2551 | Arithmetic.
2552 *----------------------------------------------------------------------------*/
2553 
2554 floatx80 int64_to_floatx80(int64_t a, float_status *status)
2555 {
2556     flag zSign;
2557     uint64_t absA;
2558     int8_t shiftCount;
2559 
2560     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
2561     zSign = ( a < 0 );
2562     absA = zSign ? - a : a;
2563     shiftCount = countLeadingZeros64( absA );
2564     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
2565 
2566 }
2567 
2568 /*----------------------------------------------------------------------------
2569 | Returns the result of converting the 64-bit two's complement integer `a' to
2570 | the quadruple-precision floating-point format.  The conversion is performed
2571 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2572 *----------------------------------------------------------------------------*/
2573 
2574 float128 int64_to_float128(int64_t a, float_status *status)
2575 {
2576     flag zSign;
2577     uint64_t absA;
2578     int8_t shiftCount;
2579     int32_t zExp;
2580     uint64_t zSig0, zSig1;
2581 
2582     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
2583     zSign = ( a < 0 );
2584     absA = zSign ? - a : a;
2585     shiftCount = countLeadingZeros64( absA ) + 49;
2586     zExp = 0x406E - shiftCount;
2587     if ( 64 <= shiftCount ) {
2588         zSig1 = 0;
2589         zSig0 = absA;
2590         shiftCount -= 64;
2591     }
2592     else {
2593         zSig1 = absA;
2594         zSig0 = 0;
2595     }
2596     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2597     return packFloat128( zSign, zExp, zSig0, zSig1 );
2598 
2599 }
2600 
2601 /*----------------------------------------------------------------------------
2602 | Returns the result of converting the 64-bit unsigned integer `a'
2603 | to the single-precision floating-point format.  The conversion is performed
2604 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2605 *----------------------------------------------------------------------------*/
2606 
2607 float32 uint64_to_float32(uint64_t a, float_status *status)
2608 {
2609     int shiftcount;
2610 
2611     if (a == 0) {
2612         return float32_zero;
2613     }
2614 
2615     /* Determine (left) shift needed to put first set bit into bit posn 23
2616      * (since packFloat32() expects the binary point between bits 23 and 22);
2617      * this is the fast case for smallish numbers.
2618      */
2619     shiftcount = countLeadingZeros64(a) - 40;
2620     if (shiftcount >= 0) {
2621         return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
2622     }
2623     /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
2624      * expects the binary point between bits 30 and 29, hence the + 7.
2625      */
2626     shiftcount += 7;
2627     if (shiftcount < 0) {
2628         shift64RightJamming(a, -shiftcount, &a);
2629     } else {
2630         a <<= shiftcount;
2631     }
2632 
2633     return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
2634 }
2635 
2636 /*----------------------------------------------------------------------------
2637 | Returns the result of converting the 64-bit unsigned integer `a'
2638 | to the double-precision floating-point format.  The conversion is performed
2639 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2640 *----------------------------------------------------------------------------*/
2641 
2642 float64 uint64_to_float64(uint64_t a, float_status *status)
2643 {
2644     int exp = 0x43C;
2645     int shiftcount;
2646 
2647     if (a == 0) {
2648         return float64_zero;
2649     }
2650 
2651     shiftcount = countLeadingZeros64(a) - 1;
2652     if (shiftcount < 0) {
2653         shift64RightJamming(a, -shiftcount, &a);
2654     } else {
2655         a <<= shiftcount;
2656     }
2657     return roundAndPackFloat64(0, exp - shiftcount, a, status);
2658 }
2659 
2660 /*----------------------------------------------------------------------------
2661 | Returns the result of converting the 64-bit unsigned integer `a'
2662 | to the quadruple-precision floating-point format.  The conversion is performed
2663 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2664 *----------------------------------------------------------------------------*/
2665 
2666 float128 uint64_to_float128(uint64_t a, float_status *status)
2667 {
2668     if (a == 0) {
2669         return float128_zero;
2670     }
2671     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
2672 }
2673 
2674 /*----------------------------------------------------------------------------
2675 | Returns the result of converting the single-precision floating-point value
2676 | `a' to the 32-bit two's complement integer format.  The conversion is
2677 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2678 | Arithmetic---which means in particular that the conversion is rounded
2679 | according to the current rounding mode.  If `a' is a NaN, the largest
2680 | positive integer is returned.  Otherwise, if the conversion overflows, the
2681 | largest integer with the same sign as `a' is returned.
2682 *----------------------------------------------------------------------------*/
2683 
2684 int32_t float32_to_int32(float32 a, float_status *status)
2685 {
2686     flag aSign;
2687     int aExp;
2688     int shiftCount;
2689     uint32_t aSig;
2690     uint64_t aSig64;
2691 
2692     a = float32_squash_input_denormal(a, status);
2693     aSig = extractFloat32Frac( a );
2694     aExp = extractFloat32Exp( a );
2695     aSign = extractFloat32Sign( a );
2696     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
2697     if ( aExp ) aSig |= 0x00800000;
2698     shiftCount = 0xAF - aExp;
2699     aSig64 = aSig;
2700     aSig64 <<= 32;
2701     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
2702     return roundAndPackInt32(aSign, aSig64, status);
2703 
2704 }
2705 
2706 /*----------------------------------------------------------------------------
2707 | Returns the result of converting the single-precision floating-point value
2708 | `a' to the 32-bit two's complement integer format.  The conversion is
2709 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2710 | Arithmetic, except that the conversion is always rounded toward zero.
2711 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2712 | the conversion overflows, the largest integer with the same sign as `a' is
2713 | returned.
2714 *----------------------------------------------------------------------------*/
2715 
2716 int32_t float32_to_int32_round_to_zero(float32 a, float_status *status)
2717 {
2718     flag aSign;
2719     int aExp;
2720     int shiftCount;
2721     uint32_t aSig;
2722     int32_t z;
2723     a = float32_squash_input_denormal(a, status);
2724 
2725     aSig = extractFloat32Frac( a );
2726     aExp = extractFloat32Exp( a );
2727     aSign = extractFloat32Sign( a );
2728     shiftCount = aExp - 0x9E;
2729     if ( 0 <= shiftCount ) {
2730         if ( float32_val(a) != 0xCF000000 ) {
2731             float_raise(float_flag_invalid, status);
2732             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
2733         }
2734         return (int32_t) 0x80000000;
2735     }
2736     else if ( aExp <= 0x7E ) {
2737         if (aExp | aSig) {
2738             status->float_exception_flags |= float_flag_inexact;
2739         }
2740         return 0;
2741     }
2742     aSig = ( aSig | 0x00800000 )<<8;
2743     z = aSig>>( - shiftCount );
2744     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
2745         status->float_exception_flags |= float_flag_inexact;
2746     }
2747     if ( aSign ) z = - z;
2748     return z;
2749 
2750 }
2751 
2752 /*----------------------------------------------------------------------------
2753 | Returns the result of converting the single-precision floating-point value
2754 | `a' to the 16-bit two's complement integer format.  The conversion is
2755 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2756 | Arithmetic, except that the conversion is always rounded toward zero.
2757 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2758 | the conversion overflows, the largest integer with the same sign as `a' is
2759 | returned.
2760 *----------------------------------------------------------------------------*/
2761 
2762 int16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
2763 {
2764     flag aSign;
2765     int aExp;
2766     int shiftCount;
2767     uint32_t aSig;
2768     int32_t z;
2769 
2770     aSig = extractFloat32Frac( a );
2771     aExp = extractFloat32Exp( a );
2772     aSign = extractFloat32Sign( a );
2773     shiftCount = aExp - 0x8E;
2774     if ( 0 <= shiftCount ) {
2775         if ( float32_val(a) != 0xC7000000 ) {
2776             float_raise(float_flag_invalid, status);
2777             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
2778                 return 0x7FFF;
2779             }
2780         }
2781         return (int32_t) 0xffff8000;
2782     }
2783     else if ( aExp <= 0x7E ) {
2784         if ( aExp | aSig ) {
2785             status->float_exception_flags |= float_flag_inexact;
2786         }
2787         return 0;
2788     }
2789     shiftCount -= 0x10;
2790     aSig = ( aSig | 0x00800000 )<<8;
2791     z = aSig>>( - shiftCount );
2792     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
2793         status->float_exception_flags |= float_flag_inexact;
2794     }
2795     if ( aSign ) {
2796         z = - z;
2797     }
2798     return z;
2799 
2800 }
2801 
2802 /*----------------------------------------------------------------------------
2803 | Returns the result of converting the single-precision floating-point value
2804 | `a' to the 64-bit two's complement integer format.  The conversion is
2805 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2806 | Arithmetic---which means in particular that the conversion is rounded
2807 | according to the current rounding mode.  If `a' is a NaN, the largest
2808 | positive integer is returned.  Otherwise, if the conversion overflows, the
2809 | largest integer with the same sign as `a' is returned.
2810 *----------------------------------------------------------------------------*/
2811 
2812 int64_t float32_to_int64(float32 a, float_status *status)
2813 {
2814     flag aSign;
2815     int aExp;
2816     int shiftCount;
2817     uint32_t aSig;
2818     uint64_t aSig64, aSigExtra;
2819     a = float32_squash_input_denormal(a, status);
2820 
2821     aSig = extractFloat32Frac( a );
2822     aExp = extractFloat32Exp( a );
2823     aSign = extractFloat32Sign( a );
2824     shiftCount = 0xBE - aExp;
2825     if ( shiftCount < 0 ) {
2826         float_raise(float_flag_invalid, status);
2827         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
2828             return LIT64( 0x7FFFFFFFFFFFFFFF );
2829         }
2830         return (int64_t) LIT64( 0x8000000000000000 );
2831     }
2832     if ( aExp ) aSig |= 0x00800000;
2833     aSig64 = aSig;
2834     aSig64 <<= 40;
2835     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
2836     return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
2837 
2838 }
2839 
2840 /*----------------------------------------------------------------------------
2841 | Returns the result of converting the single-precision floating-point value
2842 | `a' to the 64-bit unsigned integer format.  The conversion is
2843 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2844 | Arithmetic---which means in particular that the conversion is rounded
2845 | according to the current rounding mode.  If `a' is a NaN, the largest
2846 | unsigned integer is returned.  Otherwise, if the conversion overflows, the
2847 | largest unsigned integer is returned.  If the 'a' is negative, the result
2848 | is rounded and zero is returned; values that do not round to zero will
2849 | raise the inexact exception flag.
2850 *----------------------------------------------------------------------------*/
2851 
2852 uint64_t float32_to_uint64(float32 a, float_status *status)
2853 {
2854     flag aSign;
2855     int aExp;
2856     int shiftCount;
2857     uint32_t aSig;
2858     uint64_t aSig64, aSigExtra;
2859     a = float32_squash_input_denormal(a, status);
2860 
2861     aSig = extractFloat32Frac(a);
2862     aExp = extractFloat32Exp(a);
2863     aSign = extractFloat32Sign(a);
2864     if ((aSign) && (aExp > 126)) {
2865         float_raise(float_flag_invalid, status);
2866         if (float32_is_any_nan(a)) {
2867             return LIT64(0xFFFFFFFFFFFFFFFF);
2868         } else {
2869             return 0;
2870         }
2871     }
2872     shiftCount = 0xBE - aExp;
2873     if (aExp) {
2874         aSig |= 0x00800000;
2875     }
2876     if (shiftCount < 0) {
2877         float_raise(float_flag_invalid, status);
2878         return LIT64(0xFFFFFFFFFFFFFFFF);
2879     }
2880 
2881     aSig64 = aSig;
2882     aSig64 <<= 40;
2883     shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
2884     return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
2885 }
2886 
2887 /*----------------------------------------------------------------------------
2888 | Returns the result of converting the single-precision floating-point value
2889 | `a' to the 64-bit unsigned integer format.  The conversion is
2890 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2891 | Arithmetic, except that the conversion is always rounded toward zero.  If
2892 | `a' is a NaN, the largest unsigned integer is returned.  Otherwise, if the
2893 | conversion overflows, the largest unsigned integer is returned.  If the
2894 | 'a' is negative, the result is rounded and zero is returned; values that do
2895 | not round to zero will raise the inexact flag.
2896 *----------------------------------------------------------------------------*/
2897 
2898 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status)
2899 {
2900     signed char current_rounding_mode = status->float_rounding_mode;
2901     set_float_rounding_mode(float_round_to_zero, status);
2902     int64_t v = float32_to_uint64(a, status);
2903     set_float_rounding_mode(current_rounding_mode, status);
2904     return v;
2905 }
2906 
2907 /*----------------------------------------------------------------------------
2908 | Returns the result of converting the single-precision floating-point value
2909 | `a' to the 64-bit two's complement integer format.  The conversion is
2910 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2911 | Arithmetic, except that the conversion is always rounded toward zero.  If
2912 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
2913 | conversion overflows, the largest integer with the same sign as `a' is
2914 | returned.
2915 *----------------------------------------------------------------------------*/
2916 
2917 int64_t float32_to_int64_round_to_zero(float32 a, float_status *status)
2918 {
2919     flag aSign;
2920     int aExp;
2921     int shiftCount;
2922     uint32_t aSig;
2923     uint64_t aSig64;
2924     int64_t z;
2925     a = float32_squash_input_denormal(a, status);
2926 
2927     aSig = extractFloat32Frac( a );
2928     aExp = extractFloat32Exp( a );
2929     aSign = extractFloat32Sign( a );
2930     shiftCount = aExp - 0xBE;
2931     if ( 0 <= shiftCount ) {
2932         if ( float32_val(a) != 0xDF000000 ) {
2933             float_raise(float_flag_invalid, status);
2934             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
2935                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2936             }
2937         }
2938         return (int64_t) LIT64( 0x8000000000000000 );
2939     }
2940     else if ( aExp <= 0x7E ) {
2941         if (aExp | aSig) {
2942             status->float_exception_flags |= float_flag_inexact;
2943         }
2944         return 0;
2945     }
2946     aSig64 = aSig | 0x00800000;
2947     aSig64 <<= 40;
2948     z = aSig64>>( - shiftCount );
2949     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
2950         status->float_exception_flags |= float_flag_inexact;
2951     }
2952     if ( aSign ) z = - z;
2953     return z;
2954 
2955 }
2956 
2957 /*----------------------------------------------------------------------------
2958 | Returns the result of converting the single-precision floating-point value
2959 | `a' to the double-precision floating-point format.  The conversion is
2960 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2961 | Arithmetic.
2962 *----------------------------------------------------------------------------*/
2963 
2964 float64 float32_to_float64(float32 a, float_status *status)
2965 {
2966     flag aSign;
2967     int aExp;
2968     uint32_t aSig;
2969     a = float32_squash_input_denormal(a, status);
2970 
2971     aSig = extractFloat32Frac( a );
2972     aExp = extractFloat32Exp( a );
2973     aSign = extractFloat32Sign( a );
2974     if ( aExp == 0xFF ) {
2975         if (aSig) {
2976             return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
2977         }
2978         return packFloat64( aSign, 0x7FF, 0 );
2979     }
2980     if ( aExp == 0 ) {
2981         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
2982         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2983         --aExp;
2984     }
2985     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
2986 
2987 }
2988 
2989 /*----------------------------------------------------------------------------
2990 | Returns the result of converting the single-precision floating-point value
2991 | `a' to the extended double-precision floating-point format.  The conversion
2992 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2993 | Arithmetic.
2994 *----------------------------------------------------------------------------*/
2995 
2996 floatx80 float32_to_floatx80(float32 a, float_status *status)
2997 {
2998     flag aSign;
2999     int aExp;
3000     uint32_t aSig;
3001 
3002     a = float32_squash_input_denormal(a, status);
3003     aSig = extractFloat32Frac( a );
3004     aExp = extractFloat32Exp( a );
3005     aSign = extractFloat32Sign( a );
3006     if ( aExp == 0xFF ) {
3007         if (aSig) {
3008             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3009         }
3010         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3011     }
3012     if ( aExp == 0 ) {
3013         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3014         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3015     }
3016     aSig |= 0x00800000;
3017     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3018 
3019 }
3020 
3021 /*----------------------------------------------------------------------------
3022 | Returns the result of converting the single-precision floating-point value
3023 | `a' to the double-precision floating-point format.  The conversion is
3024 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3025 | Arithmetic.
3026 *----------------------------------------------------------------------------*/
3027 
3028 float128 float32_to_float128(float32 a, float_status *status)
3029 {
3030     flag aSign;
3031     int aExp;
3032     uint32_t aSig;
3033 
3034     a = float32_squash_input_denormal(a, status);
3035     aSig = extractFloat32Frac( a );
3036     aExp = extractFloat32Exp( a );
3037     aSign = extractFloat32Sign( a );
3038     if ( aExp == 0xFF ) {
3039         if (aSig) {
3040             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3041         }
3042         return packFloat128( aSign, 0x7FFF, 0, 0 );
3043     }
3044     if ( aExp == 0 ) {
3045         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3046         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3047         --aExp;
3048     }
3049     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3050 
3051 }
3052 
3053 /*----------------------------------------------------------------------------
3054 | Returns the remainder of the single-precision floating-point value `a'
3055 | with respect to the corresponding value `b'.  The operation is performed
3056 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3057 *----------------------------------------------------------------------------*/
3058 
3059 float32 float32_rem(float32 a, float32 b, float_status *status)
3060 {
3061     flag aSign, zSign;
3062     int aExp, bExp, expDiff;
3063     uint32_t aSig, bSig;
3064     uint32_t q;
3065     uint64_t aSig64, bSig64, q64;
3066     uint32_t alternateASig;
3067     int32_t sigMean;
3068     a = float32_squash_input_denormal(a, status);
3069     b = float32_squash_input_denormal(b, status);
3070 
3071     aSig = extractFloat32Frac( a );
3072     aExp = extractFloat32Exp( a );
3073     aSign = extractFloat32Sign( a );
3074     bSig = extractFloat32Frac( b );
3075     bExp = extractFloat32Exp( b );
3076     if ( aExp == 0xFF ) {
3077         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3078             return propagateFloat32NaN(a, b, status);
3079         }
3080         float_raise(float_flag_invalid, status);
3081         return float32_default_nan(status);
3082     }
3083     if ( bExp == 0xFF ) {
3084         if (bSig) {
3085             return propagateFloat32NaN(a, b, status);
3086         }
3087         return a;
3088     }
3089     if ( bExp == 0 ) {
3090         if ( bSig == 0 ) {
3091             float_raise(float_flag_invalid, status);
3092             return float32_default_nan(status);
3093         }
3094         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3095     }
3096     if ( aExp == 0 ) {
3097         if ( aSig == 0 ) return a;
3098         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3099     }
3100     expDiff = aExp - bExp;
3101     aSig |= 0x00800000;
3102     bSig |= 0x00800000;
3103     if ( expDiff < 32 ) {
3104         aSig <<= 8;
3105         bSig <<= 8;
3106         if ( expDiff < 0 ) {
3107             if ( expDiff < -1 ) return a;
3108             aSig >>= 1;
3109         }
3110         q = ( bSig <= aSig );
3111         if ( q ) aSig -= bSig;
3112         if ( 0 < expDiff ) {
3113             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3114             q >>= 32 - expDiff;
3115             bSig >>= 2;
3116             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3117         }
3118         else {
3119             aSig >>= 2;
3120             bSig >>= 2;
3121         }
3122     }
3123     else {
3124         if ( bSig <= aSig ) aSig -= bSig;
3125         aSig64 = ( (uint64_t) aSig )<<40;
3126         bSig64 = ( (uint64_t) bSig )<<40;
3127         expDiff -= 64;
3128         while ( 0 < expDiff ) {
3129             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3130             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3131             aSig64 = - ( ( bSig * q64 )<<38 );
3132             expDiff -= 62;
3133         }
3134         expDiff += 64;
3135         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3136         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3137         q = q64>>( 64 - expDiff );
3138         bSig <<= 6;
3139         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3140     }
3141     do {
3142         alternateASig = aSig;
3143         ++q;
3144         aSig -= bSig;
3145     } while ( 0 <= (int32_t) aSig );
3146     sigMean = aSig + alternateASig;
3147     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3148         aSig = alternateASig;
3149     }
3150     zSign = ( (int32_t) aSig < 0 );
3151     if ( zSign ) aSig = - aSig;
3152     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3153 }
3154 
3155 
3156 /*----------------------------------------------------------------------------
3157 | Returns the square root of the single-precision floating-point value `a'.
3158 | The operation is performed according to the IEC/IEEE Standard for Binary
3159 | Floating-Point Arithmetic.
3160 *----------------------------------------------------------------------------*/
3161 
3162 float32 float32_sqrt(float32 a, float_status *status)
3163 {
3164     flag aSign;
3165     int aExp, zExp;
3166     uint32_t aSig, zSig;
3167     uint64_t rem, term;
3168     a = float32_squash_input_denormal(a, status);
3169 
3170     aSig = extractFloat32Frac( a );
3171     aExp = extractFloat32Exp( a );
3172     aSign = extractFloat32Sign( a );
3173     if ( aExp == 0xFF ) {
3174         if (aSig) {
3175             return propagateFloat32NaN(a, float32_zero, status);
3176         }
3177         if ( ! aSign ) return a;
3178         float_raise(float_flag_invalid, status);
3179         return float32_default_nan(status);
3180     }
3181     if ( aSign ) {
3182         if ( ( aExp | aSig ) == 0 ) return a;
3183         float_raise(float_flag_invalid, status);
3184         return float32_default_nan(status);
3185     }
3186     if ( aExp == 0 ) {
3187         if ( aSig == 0 ) return float32_zero;
3188         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3189     }
3190     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
3191     aSig = ( aSig | 0x00800000 )<<8;
3192     zSig = estimateSqrt32( aExp, aSig ) + 2;
3193     if ( ( zSig & 0x7F ) <= 5 ) {
3194         if ( zSig < 2 ) {
3195             zSig = 0x7FFFFFFF;
3196             goto roundAndPack;
3197         }
3198         aSig >>= aExp & 1;
3199         term = ( (uint64_t) zSig ) * zSig;
3200         rem = ( ( (uint64_t) aSig )<<32 ) - term;
3201         while ( (int64_t) rem < 0 ) {
3202             --zSig;
3203             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
3204         }
3205         zSig |= ( rem != 0 );
3206     }
3207     shift32RightJamming( zSig, 1, &zSig );
3208  roundAndPack:
3209     return roundAndPackFloat32(0, zExp, zSig, status);
3210 
3211 }
3212 
3213 /*----------------------------------------------------------------------------
3214 | Returns the binary exponential of the single-precision floating-point value
3215 | `a'. The operation is performed according to the IEC/IEEE Standard for
3216 | Binary Floating-Point Arithmetic.
3217 |
3218 | Uses the following identities:
3219 |
3220 | 1. -------------------------------------------------------------------------
3221 |      x    x*ln(2)
3222 |     2  = e
3223 |
3224 | 2. -------------------------------------------------------------------------
3225 |                      2     3     4     5           n
3226 |      x        x     x     x     x     x           x
3227 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3228 |               1!    2!    3!    4!    5!          n!
3229 *----------------------------------------------------------------------------*/
3230 
3231 static const float64 float32_exp2_coefficients[15] =
3232 {
3233     const_float64( 0x3ff0000000000000ll ), /*  1 */
3234     const_float64( 0x3fe0000000000000ll ), /*  2 */
3235     const_float64( 0x3fc5555555555555ll ), /*  3 */
3236     const_float64( 0x3fa5555555555555ll ), /*  4 */
3237     const_float64( 0x3f81111111111111ll ), /*  5 */
3238     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
3239     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
3240     const_float64( 0x3efa01a01a01a01all ), /*  8 */
3241     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
3242     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3243     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3244     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3245     const_float64( 0x3de6124613a86d09ll ), /* 13 */
3246     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3247     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3248 };
3249 
3250 float32 float32_exp2(float32 a, float_status *status)
3251 {
3252     flag aSign;
3253     int aExp;
3254     uint32_t aSig;
3255     float64 r, x, xn;
3256     int i;
3257     a = float32_squash_input_denormal(a, status);
3258 
3259     aSig = extractFloat32Frac( a );
3260     aExp = extractFloat32Exp( a );
3261     aSign = extractFloat32Sign( a );
3262 
3263     if ( aExp == 0xFF) {
3264         if (aSig) {
3265             return propagateFloat32NaN(a, float32_zero, status);
3266         }
3267         return (aSign) ? float32_zero : a;
3268     }
3269     if (aExp == 0) {
3270         if (aSig == 0) return float32_one;
3271     }
3272 
3273     float_raise(float_flag_inexact, status);
3274 
3275     /* ******************************* */
3276     /* using float64 for approximation */
3277     /* ******************************* */
3278     x = float32_to_float64(a, status);
3279     x = float64_mul(x, float64_ln2, status);
3280 
3281     xn = x;
3282     r = float64_one;
3283     for (i = 0 ; i < 15 ; i++) {
3284         float64 f;
3285 
3286         f = float64_mul(xn, float32_exp2_coefficients[i], status);
3287         r = float64_add(r, f, status);
3288 
3289         xn = float64_mul(xn, x, status);
3290     }
3291 
3292     return float64_to_float32(r, status);
3293 }
3294 
3295 /*----------------------------------------------------------------------------
3296 | Returns the binary log of the single-precision floating-point value `a'.
3297 | The operation is performed according to the IEC/IEEE Standard for Binary
3298 | Floating-Point Arithmetic.
3299 *----------------------------------------------------------------------------*/
3300 float32 float32_log2(float32 a, float_status *status)
3301 {
3302     flag aSign, zSign;
3303     int aExp;
3304     uint32_t aSig, zSig, i;
3305 
3306     a = float32_squash_input_denormal(a, status);
3307     aSig = extractFloat32Frac( a );
3308     aExp = extractFloat32Exp( a );
3309     aSign = extractFloat32Sign( a );
3310 
3311     if ( aExp == 0 ) {
3312         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3313         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3314     }
3315     if ( aSign ) {
3316         float_raise(float_flag_invalid, status);
3317         return float32_default_nan(status);
3318     }
3319     if ( aExp == 0xFF ) {
3320         if (aSig) {
3321             return propagateFloat32NaN(a, float32_zero, status);
3322         }
3323         return a;
3324     }
3325 
3326     aExp -= 0x7F;
3327     aSig |= 0x00800000;
3328     zSign = aExp < 0;
3329     zSig = aExp << 23;
3330 
3331     for (i = 1 << 22; i > 0; i >>= 1) {
3332         aSig = ( (uint64_t)aSig * aSig ) >> 23;
3333         if ( aSig & 0x01000000 ) {
3334             aSig >>= 1;
3335             zSig |= i;
3336         }
3337     }
3338 
3339     if ( zSign )
3340         zSig = -zSig;
3341 
3342     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3343 }
3344 
3345 /*----------------------------------------------------------------------------
3346 | Returns 1 if the single-precision floating-point value `a' is equal to
3347 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3348 | raised if either operand is a NaN.  Otherwise, the comparison is performed
3349 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3350 *----------------------------------------------------------------------------*/
3351 
3352 int float32_eq(float32 a, float32 b, float_status *status)
3353 {
3354     uint32_t av, bv;
3355     a = float32_squash_input_denormal(a, status);
3356     b = float32_squash_input_denormal(b, status);
3357 
3358     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3359          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3360        ) {
3361         float_raise(float_flag_invalid, status);
3362         return 0;
3363     }
3364     av = float32_val(a);
3365     bv = float32_val(b);
3366     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3367 }
3368 
3369 /*----------------------------------------------------------------------------
3370 | Returns 1 if the single-precision floating-point value `a' is less than
3371 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
3372 | exception is raised if either operand is a NaN.  The comparison is performed
3373 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3374 *----------------------------------------------------------------------------*/
3375 
3376 int float32_le(float32 a, float32 b, float_status *status)
3377 {
3378     flag aSign, bSign;
3379     uint32_t av, bv;
3380     a = float32_squash_input_denormal(a, status);
3381     b = float32_squash_input_denormal(b, status);
3382 
3383     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3384          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3385        ) {
3386         float_raise(float_flag_invalid, status);
3387         return 0;
3388     }
3389     aSign = extractFloat32Sign( a );
3390     bSign = extractFloat32Sign( b );
3391     av = float32_val(a);
3392     bv = float32_val(b);
3393     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3394     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3395 
3396 }
3397 
3398 /*----------------------------------------------------------------------------
3399 | Returns 1 if the single-precision floating-point value `a' is less than
3400 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3401 | raised if either operand is a NaN.  The comparison is performed according
3402 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3403 *----------------------------------------------------------------------------*/
3404 
3405 int float32_lt(float32 a, float32 b, float_status *status)
3406 {
3407     flag aSign, bSign;
3408     uint32_t av, bv;
3409     a = float32_squash_input_denormal(a, status);
3410     b = float32_squash_input_denormal(b, status);
3411 
3412     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3413          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3414        ) {
3415         float_raise(float_flag_invalid, status);
3416         return 0;
3417     }
3418     aSign = extractFloat32Sign( a );
3419     bSign = extractFloat32Sign( b );
3420     av = float32_val(a);
3421     bv = float32_val(b);
3422     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3423     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3424 
3425 }
3426 
3427 /*----------------------------------------------------------------------------
3428 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3429 | be compared, and 0 otherwise.  The invalid exception is raised if either
3430 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
3431 | Standard for Binary Floating-Point Arithmetic.
3432 *----------------------------------------------------------------------------*/
3433 
3434 int float32_unordered(float32 a, float32 b, float_status *status)
3435 {
3436     a = float32_squash_input_denormal(a, status);
3437     b = float32_squash_input_denormal(b, status);
3438 
3439     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3440          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3441        ) {
3442         float_raise(float_flag_invalid, status);
3443         return 1;
3444     }
3445     return 0;
3446 }
3447 
3448 /*----------------------------------------------------------------------------
3449 | Returns 1 if the single-precision floating-point value `a' is equal to
3450 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3451 | exception.  The comparison is performed according to the IEC/IEEE Standard
3452 | for Binary Floating-Point Arithmetic.
3453 *----------------------------------------------------------------------------*/
3454 
3455 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3456 {
3457     a = float32_squash_input_denormal(a, status);
3458     b = float32_squash_input_denormal(b, status);
3459 
3460     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3461          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3462        ) {
3463         if (float32_is_signaling_nan(a, status)
3464          || float32_is_signaling_nan(b, status)) {
3465             float_raise(float_flag_invalid, status);
3466         }
3467         return 0;
3468     }
3469     return ( float32_val(a) == float32_val(b) ) ||
3470             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3471 }
3472 
3473 /*----------------------------------------------------------------------------
3474 | Returns 1 if the single-precision floating-point value `a' is less than or
3475 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3476 | cause an exception.  Otherwise, the comparison is performed according to the
3477 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3478 *----------------------------------------------------------------------------*/
3479 
3480 int float32_le_quiet(float32 a, float32 b, float_status *status)
3481 {
3482     flag aSign, bSign;
3483     uint32_t av, bv;
3484     a = float32_squash_input_denormal(a, status);
3485     b = float32_squash_input_denormal(b, status);
3486 
3487     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3488          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3489        ) {
3490         if (float32_is_signaling_nan(a, status)
3491          || float32_is_signaling_nan(b, status)) {
3492             float_raise(float_flag_invalid, status);
3493         }
3494         return 0;
3495     }
3496     aSign = extractFloat32Sign( a );
3497     bSign = extractFloat32Sign( b );
3498     av = float32_val(a);
3499     bv = float32_val(b);
3500     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3501     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3502 
3503 }
3504 
3505 /*----------------------------------------------------------------------------
3506 | Returns 1 if the single-precision floating-point value `a' is less than
3507 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3508 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3509 | Standard for Binary Floating-Point Arithmetic.
3510 *----------------------------------------------------------------------------*/
3511 
3512 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3513 {
3514     flag aSign, bSign;
3515     uint32_t av, bv;
3516     a = float32_squash_input_denormal(a, status);
3517     b = float32_squash_input_denormal(b, status);
3518 
3519     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3520          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3521        ) {
3522         if (float32_is_signaling_nan(a, status)
3523          || float32_is_signaling_nan(b, status)) {
3524             float_raise(float_flag_invalid, status);
3525         }
3526         return 0;
3527     }
3528     aSign = extractFloat32Sign( a );
3529     bSign = extractFloat32Sign( b );
3530     av = float32_val(a);
3531     bv = float32_val(b);
3532     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3533     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3534 
3535 }
3536 
3537 /*----------------------------------------------------------------------------
3538 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3539 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3540 | comparison is performed according to the IEC/IEEE Standard for Binary
3541 | Floating-Point Arithmetic.
3542 *----------------------------------------------------------------------------*/
3543 
3544 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3545 {
3546     a = float32_squash_input_denormal(a, status);
3547     b = float32_squash_input_denormal(b, status);
3548 
3549     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3550          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3551        ) {
3552         if (float32_is_signaling_nan(a, status)
3553          || float32_is_signaling_nan(b, status)) {
3554             float_raise(float_flag_invalid, status);
3555         }
3556         return 1;
3557     }
3558     return 0;
3559 }
3560 
3561 /*----------------------------------------------------------------------------
3562 | Returns the result of converting the double-precision floating-point value
3563 | `a' to the 32-bit two's complement integer format.  The conversion is
3564 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3565 | Arithmetic---which means in particular that the conversion is rounded
3566 | according to the current rounding mode.  If `a' is a NaN, the largest
3567 | positive integer is returned.  Otherwise, if the conversion overflows, the
3568 | largest integer with the same sign as `a' is returned.
3569 *----------------------------------------------------------------------------*/
3570 
3571 int32_t float64_to_int32(float64 a, float_status *status)
3572 {
3573     flag aSign;
3574     int aExp;
3575     int shiftCount;
3576     uint64_t aSig;
3577     a = float64_squash_input_denormal(a, status);
3578 
3579     aSig = extractFloat64Frac( a );
3580     aExp = extractFloat64Exp( a );
3581     aSign = extractFloat64Sign( a );
3582     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3583     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3584     shiftCount = 0x42C - aExp;
3585     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3586     return roundAndPackInt32(aSign, aSig, status);
3587 
3588 }
3589 
3590 /*----------------------------------------------------------------------------
3591 | Returns the result of converting the double-precision floating-point value
3592 | `a' to the 32-bit two's complement integer format.  The conversion is
3593 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3594 | Arithmetic, except that the conversion is always rounded toward zero.
3595 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3596 | the conversion overflows, the largest integer with the same sign as `a' is
3597 | returned.
3598 *----------------------------------------------------------------------------*/
3599 
3600 int32_t float64_to_int32_round_to_zero(float64 a, float_status *status)
3601 {
3602     flag aSign;
3603     int aExp;
3604     int shiftCount;
3605     uint64_t aSig, savedASig;
3606     int32_t z;
3607     a = float64_squash_input_denormal(a, status);
3608 
3609     aSig = extractFloat64Frac( a );
3610     aExp = extractFloat64Exp( a );
3611     aSign = extractFloat64Sign( a );
3612     if ( 0x41E < aExp ) {
3613         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3614         goto invalid;
3615     }
3616     else if ( aExp < 0x3FF ) {
3617         if (aExp || aSig) {
3618             status->float_exception_flags |= float_flag_inexact;
3619         }
3620         return 0;
3621     }
3622     aSig |= LIT64( 0x0010000000000000 );
3623     shiftCount = 0x433 - aExp;
3624     savedASig = aSig;
3625     aSig >>= shiftCount;
3626     z = aSig;
3627     if ( aSign ) z = - z;
3628     if ( ( z < 0 ) ^ aSign ) {
3629  invalid:
3630         float_raise(float_flag_invalid, status);
3631         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3632     }
3633     if ( ( aSig<<shiftCount ) != savedASig ) {
3634         status->float_exception_flags |= float_flag_inexact;
3635     }
3636     return z;
3637 
3638 }
3639 
3640 /*----------------------------------------------------------------------------
3641 | Returns the result of converting the double-precision floating-point value
3642 | `a' to the 16-bit two's complement integer format.  The conversion is
3643 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3644 | Arithmetic, except that the conversion is always rounded toward zero.
3645 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3646 | the conversion overflows, the largest integer with the same sign as `a' is
3647 | returned.
3648 *----------------------------------------------------------------------------*/
3649 
3650 int16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
3651 {
3652     flag aSign;
3653     int aExp;
3654     int shiftCount;
3655     uint64_t aSig, savedASig;
3656     int32_t z;
3657 
3658     aSig = extractFloat64Frac( a );
3659     aExp = extractFloat64Exp( a );
3660     aSign = extractFloat64Sign( a );
3661     if ( 0x40E < aExp ) {
3662         if ( ( aExp == 0x7FF ) && aSig ) {
3663             aSign = 0;
3664         }
3665         goto invalid;
3666     }
3667     else if ( aExp < 0x3FF ) {
3668         if ( aExp || aSig ) {
3669             status->float_exception_flags |= float_flag_inexact;
3670         }
3671         return 0;
3672     }
3673     aSig |= LIT64( 0x0010000000000000 );
3674     shiftCount = 0x433 - aExp;
3675     savedASig = aSig;
3676     aSig >>= shiftCount;
3677     z = aSig;
3678     if ( aSign ) {
3679         z = - z;
3680     }
3681     if ( ( (int16_t)z < 0 ) ^ aSign ) {
3682  invalid:
3683         float_raise(float_flag_invalid, status);
3684         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3685     }
3686     if ( ( aSig<<shiftCount ) != savedASig ) {
3687         status->float_exception_flags |= float_flag_inexact;
3688     }
3689     return z;
3690 }
3691 
3692 /*----------------------------------------------------------------------------
3693 | Returns the result of converting the double-precision floating-point value
3694 | `a' to the 64-bit two's complement integer format.  The conversion is
3695 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3696 | Arithmetic---which means in particular that the conversion is rounded
3697 | according to the current rounding mode.  If `a' is a NaN, the largest
3698 | positive integer is returned.  Otherwise, if the conversion overflows, the
3699 | largest integer with the same sign as `a' is returned.
3700 *----------------------------------------------------------------------------*/
3701 
3702 int64_t float64_to_int64(float64 a, float_status *status)
3703 {
3704     flag aSign;
3705     int aExp;
3706     int shiftCount;
3707     uint64_t aSig, aSigExtra;
3708     a = float64_squash_input_denormal(a, status);
3709 
3710     aSig = extractFloat64Frac( a );
3711     aExp = extractFloat64Exp( a );
3712     aSign = extractFloat64Sign( a );
3713     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3714     shiftCount = 0x433 - aExp;
3715     if ( shiftCount <= 0 ) {
3716         if ( 0x43E < aExp ) {
3717             float_raise(float_flag_invalid, status);
3718             if (    ! aSign
3719                  || (    ( aExp == 0x7FF )
3720                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
3721                ) {
3722                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3723             }
3724             return (int64_t) LIT64( 0x8000000000000000 );
3725         }
3726         aSigExtra = 0;
3727         aSig <<= - shiftCount;
3728     }
3729     else {
3730         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3731     }
3732     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
3733 
3734 }
3735 
3736 /*----------------------------------------------------------------------------
3737 | Returns the result of converting the double-precision floating-point value
3738 | `a' to the 64-bit two's complement integer format.  The conversion is
3739 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3740 | Arithmetic, except that the conversion is always rounded toward zero.
3741 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3742 | the conversion overflows, the largest integer with the same sign as `a' is
3743 | returned.
3744 *----------------------------------------------------------------------------*/
3745 
3746 int64_t float64_to_int64_round_to_zero(float64 a, float_status *status)
3747 {
3748     flag aSign;
3749     int aExp;
3750     int shiftCount;
3751     uint64_t aSig;
3752     int64_t z;
3753     a = float64_squash_input_denormal(a, status);
3754 
3755     aSig = extractFloat64Frac( a );
3756     aExp = extractFloat64Exp( a );
3757     aSign = extractFloat64Sign( a );
3758     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3759     shiftCount = aExp - 0x433;
3760     if ( 0 <= shiftCount ) {
3761         if ( 0x43E <= aExp ) {
3762             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3763                 float_raise(float_flag_invalid, status);
3764                 if (    ! aSign
3765                      || (    ( aExp == 0x7FF )
3766                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
3767                    ) {
3768                     return LIT64( 0x7FFFFFFFFFFFFFFF );
3769                 }
3770             }
3771             return (int64_t) LIT64( 0x8000000000000000 );
3772         }
3773         z = aSig<<shiftCount;
3774     }
3775     else {
3776         if ( aExp < 0x3FE ) {
3777             if (aExp | aSig) {
3778                 status->float_exception_flags |= float_flag_inexact;
3779             }
3780             return 0;
3781         }
3782         z = aSig>>( - shiftCount );
3783         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3784             status->float_exception_flags |= float_flag_inexact;
3785         }
3786     }
3787     if ( aSign ) z = - z;
3788     return z;
3789 
3790 }
3791 
3792 /*----------------------------------------------------------------------------
3793 | Returns the result of converting the double-precision floating-point value
3794 | `a' to the single-precision floating-point format.  The conversion is
3795 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3796 | Arithmetic.
3797 *----------------------------------------------------------------------------*/
3798 
3799 float32 float64_to_float32(float64 a, float_status *status)
3800 {
3801     flag aSign;
3802     int aExp;
3803     uint64_t aSig;
3804     uint32_t zSig;
3805     a = float64_squash_input_denormal(a, status);
3806 
3807     aSig = extractFloat64Frac( a );
3808     aExp = extractFloat64Exp( a );
3809     aSign = extractFloat64Sign( a );
3810     if ( aExp == 0x7FF ) {
3811         if (aSig) {
3812             return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3813         }
3814         return packFloat32( aSign, 0xFF, 0 );
3815     }
3816     shift64RightJamming( aSig, 22, &aSig );
3817     zSig = aSig;
3818     if ( aExp || zSig ) {
3819         zSig |= 0x40000000;
3820         aExp -= 0x381;
3821     }
3822     return roundAndPackFloat32(aSign, aExp, zSig, status);
3823 
3824 }
3825 
3826 
3827 /*----------------------------------------------------------------------------
3828 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3829 | half-precision floating-point value, returning the result.  After being
3830 | shifted into the proper positions, the three fields are simply added
3831 | together to form the result.  This means that any integer portion of `zSig'
3832 | will be added into the exponent.  Since a properly normalized significand
3833 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3834 | than the desired result exponent whenever `zSig' is a complete, normalized
3835 | significand.
3836 *----------------------------------------------------------------------------*/
3837 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3838 {
3839     return make_float16(
3840         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3841 }
3842 
3843 /*----------------------------------------------------------------------------
3844 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3845 | and significand `zSig', and returns the proper half-precision floating-
3846 | point value corresponding to the abstract input.  Ordinarily, the abstract
3847 | value is simply rounded and packed into the half-precision format, with
3848 | the inexact exception raised if the abstract input cannot be represented
3849 | exactly.  However, if the abstract value is too large, the overflow and
3850 | inexact exceptions are raised and an infinity or maximal finite value is
3851 | returned.  If the abstract value is too small, the input value is rounded to
3852 | a subnormal number, and the underflow and inexact exceptions are raised if
3853 | the abstract input cannot be represented exactly as a subnormal half-
3854 | precision floating-point number.
3855 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3856 | ARM-style "alternative representation", which omits the NaN and Inf
3857 | encodings in order to raise the maximum representable exponent by one.
3858 |     The input significand `zSig' has its binary point between bits 22
3859 | and 23, which is 13 bits to the left of the usual location.  This shifted
3860 | significand must be normalized or smaller.  If `zSig' is not normalized,
3861 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3862 | and it must not require rounding.  In the usual case that `zSig' is
3863 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3864 | Note the slightly odd position of the binary point in zSig compared with the
3865 | other roundAndPackFloat functions. This should probably be fixed if we
3866 | need to implement more float16 routines than just conversion.
3867 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3868 | Binary Floating-Point Arithmetic.
3869 *----------------------------------------------------------------------------*/
3870 
3871 static float16 roundAndPackFloat16(flag zSign, int zExp,
3872                                    uint32_t zSig, flag ieee,
3873                                    float_status *status)
3874 {
3875     int maxexp = ieee ? 29 : 30;
3876     uint32_t mask;
3877     uint32_t increment;
3878     bool rounding_bumps_exp;
3879     bool is_tiny = false;
3880 
3881     /* Calculate the mask of bits of the mantissa which are not
3882      * representable in half-precision and will be lost.
3883      */
3884     if (zExp < 1) {
3885         /* Will be denormal in halfprec */
3886         mask = 0x00ffffff;
3887         if (zExp >= -11) {
3888             mask >>= 11 + zExp;
3889         }
3890     } else {
3891         /* Normal number in halfprec */
3892         mask = 0x00001fff;
3893     }
3894 
3895     switch (status->float_rounding_mode) {
3896     case float_round_nearest_even:
3897         increment = (mask + 1) >> 1;
3898         if ((zSig & mask) == increment) {
3899             increment = zSig & (increment << 1);
3900         }
3901         break;
3902     case float_round_ties_away:
3903         increment = (mask + 1) >> 1;
3904         break;
3905     case float_round_up:
3906         increment = zSign ? 0 : mask;
3907         break;
3908     case float_round_down:
3909         increment = zSign ? mask : 0;
3910         break;
3911     default: /* round_to_zero */
3912         increment = 0;
3913         break;
3914     }
3915 
3916     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3917 
3918     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3919         if (ieee) {
3920             float_raise(float_flag_overflow | float_flag_inexact, status);
3921             return packFloat16(zSign, 0x1f, 0);
3922         } else {
3923             float_raise(float_flag_invalid, status);
3924             return packFloat16(zSign, 0x1f, 0x3ff);
3925         }
3926     }
3927 
3928     if (zExp < 0) {
3929         /* Note that flush-to-zero does not affect half-precision results */
3930         is_tiny =
3931             (status->float_detect_tininess == float_tininess_before_rounding)
3932             || (zExp < -1)
3933             || (!rounding_bumps_exp);
3934     }
3935     if (zSig & mask) {
3936         float_raise(float_flag_inexact, status);
3937         if (is_tiny) {
3938             float_raise(float_flag_underflow, status);
3939         }
3940     }
3941 
3942     zSig += increment;
3943     if (rounding_bumps_exp) {
3944         zSig >>= 1;
3945         zExp++;
3946     }
3947 
3948     if (zExp < -10) {
3949         return packFloat16(zSign, 0, 0);
3950     }
3951     if (zExp < 0) {
3952         zSig >>= -zExp;
3953         zExp = 0;
3954     }
3955     return packFloat16(zSign, zExp, zSig >> 13);
3956 }
3957 
3958 /*----------------------------------------------------------------------------
3959 | If `a' is denormal and we are in flush-to-zero mode then set the
3960 | input-denormal exception and return zero. Otherwise just return the value.
3961 *----------------------------------------------------------------------------*/
3962 float16 float16_squash_input_denormal(float16 a, float_status *status)
3963 {
3964     if (status->flush_inputs_to_zero) {
3965         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3966             float_raise(float_flag_input_denormal, status);
3967             return make_float16(float16_val(a) & 0x8000);
3968         }
3969     }
3970     return a;
3971 }
3972 
static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
                                      uint32_t *zSigPtr)
{
    /* Left shift that moves the most significant set bit of aSig up to
     * bit 10; the exponent is lowered by the same amount.
     */
    int8_t lshift = countLeadingZeros32(aSig) - 21;

    *zExpPtr = 1 - lshift;
    *zSigPtr = aSig << lshift;
}
3980 
3981 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3982    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3983 
3984 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3985 {
3986     flag aSign;
3987     int aExp;
3988     uint32_t aSig;
3989 
3990     aSign = extractFloat16Sign(a);
3991     aExp = extractFloat16Exp(a);
3992     aSig = extractFloat16Frac(a);
3993 
3994     if (aExp == 0x1f && ieee) {
3995         if (aSig) {
3996             return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3997         }
3998         return packFloat32(aSign, 0xff, 0);
3999     }
4000     if (aExp == 0) {
4001         if (aSig == 0) {
4002             return packFloat32(aSign, 0, 0);
4003         }
4004 
4005         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
4006         aExp--;
4007     }
4008     return packFloat32( aSign, aExp + 0x70, aSig << 13);
4009 }
4010 
4011 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
4012 {
4013     flag aSign;
4014     int aExp;
4015     uint32_t aSig;
4016 
4017     a = float32_squash_input_denormal(a, status);
4018 
4019     aSig = extractFloat32Frac( a );
4020     aExp = extractFloat32Exp( a );
4021     aSign = extractFloat32Sign( a );
4022     if ( aExp == 0xFF ) {
4023         if (aSig) {
4024             /* Input is a NaN */
4025             if (!ieee) {
4026                 float_raise(float_flag_invalid, status);
4027                 return packFloat16(aSign, 0, 0);
4028             }
4029             return commonNaNToFloat16(
4030                 float32ToCommonNaN(a, status), status);
4031         }
4032         /* Infinity */
4033         if (!ieee) {
4034             float_raise(float_flag_invalid, status);
4035             return packFloat16(aSign, 0x1f, 0x3ff);
4036         }
4037         return packFloat16(aSign, 0x1f, 0);
4038     }
4039     if (aExp == 0 && aSig == 0) {
4040         return packFloat16(aSign, 0, 0);
4041     }
4042     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4043      * even if the input is denormal; however this is harmless because
4044      * the largest possible single-precision denormal is still smaller
4045      * than the smallest representable half-precision denormal, and so we
4046      * will end up ignoring aSig and returning via the "always return zero"
4047      * codepath.
4048      */
4049     aSig |= 0x00800000;
4050     aExp -= 0x71;
4051 
4052     return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
4053 }
4054 
4055 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
4056 {
4057     flag aSign;
4058     int aExp;
4059     uint32_t aSig;
4060 
4061     aSign = extractFloat16Sign(a);
4062     aExp = extractFloat16Exp(a);
4063     aSig = extractFloat16Frac(a);
4064 
4065     if (aExp == 0x1f && ieee) {
4066         if (aSig) {
4067             return commonNaNToFloat64(
4068                 float16ToCommonNaN(a, status), status);
4069         }
4070         return packFloat64(aSign, 0x7ff, 0);
4071     }
4072     if (aExp == 0) {
4073         if (aSig == 0) {
4074             return packFloat64(aSign, 0, 0);
4075         }
4076 
4077         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
4078         aExp--;
4079     }
4080     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
4081 }
4082 
4083 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
4084 {
4085     flag aSign;
4086     int aExp;
4087     uint64_t aSig;
4088     uint32_t zSig;
4089 
4090     a = float64_squash_input_denormal(a, status);
4091 
4092     aSig = extractFloat64Frac(a);
4093     aExp = extractFloat64Exp(a);
4094     aSign = extractFloat64Sign(a);
4095     if (aExp == 0x7FF) {
4096         if (aSig) {
4097             /* Input is a NaN */
4098             if (!ieee) {
4099                 float_raise(float_flag_invalid, status);
4100                 return packFloat16(aSign, 0, 0);
4101             }
4102             return commonNaNToFloat16(
4103                 float64ToCommonNaN(a, status), status);
4104         }
4105         /* Infinity */
4106         if (!ieee) {
4107             float_raise(float_flag_invalid, status);
4108             return packFloat16(aSign, 0x1f, 0x3ff);
4109         }
4110         return packFloat16(aSign, 0x1f, 0);
4111     }
4112     shift64RightJamming(aSig, 29, &aSig);
4113     zSig = aSig;
4114     if (aExp == 0 && zSig == 0) {
4115         return packFloat16(aSign, 0, 0);
4116     }
4117     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4118      * even if the input is denormal; however this is harmless because
4119      * the largest possible single-precision denormal is still smaller
4120      * than the smallest representable half-precision denormal, and so we
4121      * will end up ignoring aSig and returning via the "always return zero"
4122      * codepath.
4123      */
4124     zSig |= 0x00800000;
4125     aExp -= 0x3F1;
4126 
4127     return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
4128 }
4129 
4130 /*----------------------------------------------------------------------------
4131 | Returns the result of converting the double-precision floating-point value
4132 | `a' to the extended double-precision floating-point format.  The conversion
4133 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4134 | Arithmetic.
4135 *----------------------------------------------------------------------------*/
4136 
4137 floatx80 float64_to_floatx80(float64 a, float_status *status)
4138 {
4139     flag aSign;
4140     int aExp;
4141     uint64_t aSig;
4142 
4143     a = float64_squash_input_denormal(a, status);
4144     aSig = extractFloat64Frac( a );
4145     aExp = extractFloat64Exp( a );
4146     aSign = extractFloat64Sign( a );
4147     if ( aExp == 0x7FF ) {
4148         if (aSig) {
4149             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4150         }
4151         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4152     }
4153     if ( aExp == 0 ) {
4154         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4155         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4156     }
4157     return
4158         packFloatx80(
4159             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4160 
4161 }
4162 
4163 /*----------------------------------------------------------------------------
4164 | Returns the result of converting the double-precision floating-point value
4165 | `a' to the quadruple-precision floating-point format.  The conversion is
4166 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4167 | Arithmetic.
4168 *----------------------------------------------------------------------------*/
4169 
4170 float128 float64_to_float128(float64 a, float_status *status)
4171 {
4172     flag aSign;
4173     int aExp;
4174     uint64_t aSig, zSig0, zSig1;
4175 
4176     a = float64_squash_input_denormal(a, status);
4177     aSig = extractFloat64Frac( a );
4178     aExp = extractFloat64Exp( a );
4179     aSign = extractFloat64Sign( a );
4180     if ( aExp == 0x7FF ) {
4181         if (aSig) {
4182             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4183         }
4184         return packFloat128( aSign, 0x7FFF, 0, 0 );
4185     }
4186     if ( aExp == 0 ) {
4187         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4188         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4189         --aExp;
4190     }
4191     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4192     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4193 
4194 }
4195 
4196 
/*----------------------------------------------------------------------------
| Returns the remainder of the double-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases handled here: a NaN operand is propagated; an infinite `a'
| or a zero `b' raises the invalid exception and returns the default NaN;
| an infinite `b' or a zero `a' returns `a' unchanged.  Input denormals are
| squashed first when the status requests it.
*----------------------------------------------------------------------------*/

float64 float64_rem(float64 a, float64 b, float_status *status)
{
    flag aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    /* `a' is NaN or infinity. */
    if ( aExp == 0x7FF ) {
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        /* rem(inf, b) is invalid. */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    /* `b' is NaN or infinity; a finite `a' is its own remainder. */
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* rem(a, 0) is invalid. */
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the leading 1 explicit and left-justify both significands. */
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
    if ( expDiff < 0 ) {
        /* |a| < |b|/2 here, so `a' is already the remainder. */
        if ( expDiff < -1 ) return a;
        aSig >>= 1;
    }
    /* First quotient bit. */
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    expDiff -= 64;
    /*
     * Long reduction: each pass retires 62 bits of exponent difference.
     * The quotient estimate is lowered by 2 (clamped at 0) -- presumably so
     * that it never exceeds the true quotient.
     */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /* Final partial step covering the remaining expDiff bits. */
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /*
     * Keep subtracting `b' until the running remainder turns negative;
     * alternateASig then holds the last non-negative remainder and aSig
     * the first negative one.
     */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    /*
     * Choose whichever candidate is closer to zero; on an exact tie
     * (sigMean == 0) the choice depends on the parity of q.
     */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* A negative remainder flips the sign of the result relative to `a'. */
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);

}
4284 
4285 
/*----------------------------------------------------------------------------
| Returns the square root of the double-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| Special cases handled here: a NaN is propagated; +infinity and zeros of
| either sign are returned as-is (a positive zero as float64_zero); any other
| negative input raises the invalid exception and returns the default NaN.
| Input denormals are squashed first when the status requests it.
*----------------------------------------------------------------------------*/

float64 float64_sqrt(float64 a, float_status *status)
{
    flag aSign;
    int aExp, zExp;
    uint64_t aSig, zSig, doubleZSig;
    uint64_t rem0, rem1, term0, term1;
    a = float64_squash_input_denormal(a, status);

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    /* NaN or infinity. */
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return propagateFloat64NaN(a, a, status);
        }
        if ( ! aSign ) return a;
        /* sqrt(-inf) is invalid. */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( aSign ) {
        /* -0 is returned unchanged; any other negative value is invalid. */
        if ( ( aExp | aSig ) == 0 ) return a;
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return float64_zero;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    /* Halve the unbiased exponent and re-bias it for the result. */
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
    aSig |= LIT64( 0x0010000000000000 );
    /* Initial approximation from the top 32 bits of the significand. */
    zSig = estimateSqrt32( aExp, aSig>>21 );
    /* Alignment of the significand depends on the parity of the exponent. */
    aSig <<= 9 - ( aExp & 1 );
    /* Refine the approximation with one division step. */
    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
    /*
     * Only when the low 9 bits of the estimate are small (<= 5) is the
     * exact remainder aSig - zSig^2 computed: zSig is stepped down while
     * that remainder is negative, and a non-zero final remainder is
     * recorded in the lowest bit of zSig.
     */
    if ( ( zSig & 0x1FF ) <= 5 ) {
        doubleZSig = zSig<<1;
        mul64To128( zSig, zSig, &term0, &term1 );
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
        while ( (int64_t) rem0 < 0 ) {
            --zSig;
            doubleZSig -= 2;
            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
        }
        zSig |= ( ( rem0 | rem1 ) != 0 );
    }
    return roundAndPackFloat64(0, zExp, zSig, status);

}
4339 
4340 /*----------------------------------------------------------------------------
4341 | Returns the binary log of the double-precision floating-point value `a'.
4342 | The operation is performed according to the IEC/IEEE Standard for Binary
4343 | Floating-Point Arithmetic.
4344 *----------------------------------------------------------------------------*/
4345 float64 float64_log2(float64 a, float_status *status)
4346 {
4347     flag aSign, zSign;
4348     int aExp;
4349     uint64_t aSig, aSig0, aSig1, zSig, i;
4350     a = float64_squash_input_denormal(a, status);
4351 
4352     aSig = extractFloat64Frac( a );
4353     aExp = extractFloat64Exp( a );
4354     aSign = extractFloat64Sign( a );
4355 
4356     if ( aExp == 0 ) {
4357         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4358         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4359     }
4360     if ( aSign ) {
4361         float_raise(float_flag_invalid, status);
4362         return float64_default_nan(status);
4363     }
4364     if ( aExp == 0x7FF ) {
4365         if (aSig) {
4366             return propagateFloat64NaN(a, float64_zero, status);
4367         }
4368         return a;
4369     }
4370 
4371     aExp -= 0x3FF;
4372     aSig |= LIT64( 0x0010000000000000 );
4373     zSign = aExp < 0;
4374     zSig = (uint64_t)aExp << 52;
4375     for (i = 1LL << 51; i > 0; i >>= 1) {
4376         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4377         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4378         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4379             aSig >>= 1;
4380             zSig |= i;
4381         }
4382     }
4383 
4384     if ( zSign )
4385         zSig = -zSig;
4386     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4387 }
4388 
4389 /*----------------------------------------------------------------------------
4390 | Returns 1 if the double-precision floating-point value `a' is equal to the
4391 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4392 | if either operand is a NaN.  Otherwise, the comparison is performed
4393 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4394 *----------------------------------------------------------------------------*/
4395 
4396 int float64_eq(float64 a, float64 b, float_status *status)
4397 {
4398     uint64_t av, bv;
4399     a = float64_squash_input_denormal(a, status);
4400     b = float64_squash_input_denormal(b, status);
4401 
4402     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4403          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4404        ) {
4405         float_raise(float_flag_invalid, status);
4406         return 0;
4407     }
4408     av = float64_val(a);
4409     bv = float64_val(b);
4410     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4411 
4412 }
4413 
4414 /*----------------------------------------------------------------------------
4415 | Returns 1 if the double-precision floating-point value `a' is less than or
4416 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4417 | exception is raised if either operand is a NaN.  The comparison is performed
4418 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4419 *----------------------------------------------------------------------------*/
4420 
4421 int float64_le(float64 a, float64 b, float_status *status)
4422 {
4423     flag aSign, bSign;
4424     uint64_t av, bv;
4425     a = float64_squash_input_denormal(a, status);
4426     b = float64_squash_input_denormal(b, status);
4427 
4428     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4429          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4430        ) {
4431         float_raise(float_flag_invalid, status);
4432         return 0;
4433     }
4434     aSign = extractFloat64Sign( a );
4435     bSign = extractFloat64Sign( b );
4436     av = float64_val(a);
4437     bv = float64_val(b);
4438     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4439     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4440 
4441 }
4442 
4443 /*----------------------------------------------------------------------------
4444 | Returns 1 if the double-precision floating-point value `a' is less than
4445 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4446 | raised if either operand is a NaN.  The comparison is performed according
4447 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4448 *----------------------------------------------------------------------------*/
4449 
4450 int float64_lt(float64 a, float64 b, float_status *status)
4451 {
4452     flag aSign, bSign;
4453     uint64_t av, bv;
4454 
4455     a = float64_squash_input_denormal(a, status);
4456     b = float64_squash_input_denormal(b, status);
4457     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4458          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4459        ) {
4460         float_raise(float_flag_invalid, status);
4461         return 0;
4462     }
4463     aSign = extractFloat64Sign( a );
4464     bSign = extractFloat64Sign( b );
4465     av = float64_val(a);
4466     bv = float64_val(b);
4467     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4468     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4469 
4470 }
4471 
4472 /*----------------------------------------------------------------------------
4473 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4474 | be compared, and 0 otherwise.  The invalid exception is raised if either
4475 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4476 | Standard for Binary Floating-Point Arithmetic.
4477 *----------------------------------------------------------------------------*/
4478 
4479 int float64_unordered(float64 a, float64 b, float_status *status)
4480 {
4481     a = float64_squash_input_denormal(a, status);
4482     b = float64_squash_input_denormal(b, status);
4483 
4484     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4485          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4486        ) {
4487         float_raise(float_flag_invalid, status);
4488         return 1;
4489     }
4490     return 0;
4491 }
4492 
4493 /*----------------------------------------------------------------------------
4494 | Returns 1 if the double-precision floating-point value `a' is equal to the
4495 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4496 | exception.The comparison is performed according to the IEC/IEEE Standard
4497 | for Binary Floating-Point Arithmetic.
4498 *----------------------------------------------------------------------------*/
4499 
4500 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4501 {
4502     uint64_t av, bv;
4503     a = float64_squash_input_denormal(a, status);
4504     b = float64_squash_input_denormal(b, status);
4505 
4506     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4507          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4508        ) {
4509         if (float64_is_signaling_nan(a, status)
4510          || float64_is_signaling_nan(b, status)) {
4511             float_raise(float_flag_invalid, status);
4512         }
4513         return 0;
4514     }
4515     av = float64_val(a);
4516     bv = float64_val(b);
4517     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4518 
4519 }
4520 
4521 /*----------------------------------------------------------------------------
4522 | Returns 1 if the double-precision floating-point value `a' is less than or
4523 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4524 | cause an exception.  Otherwise, the comparison is performed according to the
4525 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4526 *----------------------------------------------------------------------------*/
4527 
4528 int float64_le_quiet(float64 a, float64 b, float_status *status)
4529 {
4530     flag aSign, bSign;
4531     uint64_t av, bv;
4532     a = float64_squash_input_denormal(a, status);
4533     b = float64_squash_input_denormal(b, status);
4534 
4535     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4536          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4537        ) {
4538         if (float64_is_signaling_nan(a, status)
4539          || float64_is_signaling_nan(b, status)) {
4540             float_raise(float_flag_invalid, status);
4541         }
4542         return 0;
4543     }
4544     aSign = extractFloat64Sign( a );
4545     bSign = extractFloat64Sign( b );
4546     av = float64_val(a);
4547     bv = float64_val(b);
4548     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4549     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4550 
4551 }
4552 
4553 /*----------------------------------------------------------------------------
4554 | Returns 1 if the double-precision floating-point value `a' is less than
4555 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4556 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4557 | Standard for Binary Floating-Point Arithmetic.
4558 *----------------------------------------------------------------------------*/
4559 
4560 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4561 {
4562     flag aSign, bSign;
4563     uint64_t av, bv;
4564     a = float64_squash_input_denormal(a, status);
4565     b = float64_squash_input_denormal(b, status);
4566 
4567     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4568          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4569        ) {
4570         if (float64_is_signaling_nan(a, status)
4571          || float64_is_signaling_nan(b, status)) {
4572             float_raise(float_flag_invalid, status);
4573         }
4574         return 0;
4575     }
4576     aSign = extractFloat64Sign( a );
4577     bSign = extractFloat64Sign( b );
4578     av = float64_val(a);
4579     bv = float64_val(b);
4580     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4581     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4582 
4583 }
4584 
4585 /*----------------------------------------------------------------------------
4586 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4587 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4588 | comparison is performed according to the IEC/IEEE Standard for Binary
4589 | Floating-Point Arithmetic.
4590 *----------------------------------------------------------------------------*/
4591 
4592 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4593 {
4594     a = float64_squash_input_denormal(a, status);
4595     b = float64_squash_input_denormal(b, status);
4596 
4597     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4598          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4599        ) {
4600         if (float64_is_signaling_nan(a, status)
4601          || float64_is_signaling_nan(b, status)) {
4602             float_raise(float_flag_invalid, status);
4603         }
4604         return 1;
4605     }
4606     return 0;
4607 }
4608 
4609 /*----------------------------------------------------------------------------
4610 | Returns the result of converting the extended double-precision floating-
4611 | point value `a' to the 32-bit two's complement integer format.  The
4612 | conversion is performed according to the IEC/IEEE Standard for Binary
4613 | Floating-Point Arithmetic---which means in particular that the conversion
4614 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4615 | largest positive integer is returned.  Otherwise, if the conversion
4616 | overflows, the largest integer with the same sign as `a' is returned.
4617 *----------------------------------------------------------------------------*/
4618 
4619 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4620 {
4621     flag aSign;
4622     int32_t aExp, shiftCount;
4623     uint64_t aSig;
4624 
4625     if (floatx80_invalid_encoding(a)) {
4626         float_raise(float_flag_invalid, status);
4627         return 1 << 31;
4628     }
4629     aSig = extractFloatx80Frac( a );
4630     aExp = extractFloatx80Exp( a );
4631     aSign = extractFloatx80Sign( a );
4632     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4633     shiftCount = 0x4037 - aExp;
4634     if ( shiftCount <= 0 ) shiftCount = 1;
4635     shift64RightJamming( aSig, shiftCount, &aSig );
4636     return roundAndPackInt32(aSign, aSig, status);
4637 
4638 }
4639 
4640 /*----------------------------------------------------------------------------
4641 | Returns the result of converting the extended double-precision floating-
4642 | point value `a' to the 32-bit two's complement integer format.  The
4643 | conversion is performed according to the IEC/IEEE Standard for Binary
4644 | Floating-Point Arithmetic, except that the conversion is always rounded
4645 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4646 | Otherwise, if the conversion overflows, the largest integer with the same
4647 | sign as `a' is returned.
4648 *----------------------------------------------------------------------------*/
4649 
4650 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4651 {
4652     flag aSign;
4653     int32_t aExp, shiftCount;
4654     uint64_t aSig, savedASig;
4655     int32_t z;
4656 
4657     if (floatx80_invalid_encoding(a)) {
4658         float_raise(float_flag_invalid, status);
4659         return 1 << 31;
4660     }
4661     aSig = extractFloatx80Frac( a );
4662     aExp = extractFloatx80Exp( a );
4663     aSign = extractFloatx80Sign( a );
4664     if ( 0x401E < aExp ) {
4665         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4666         goto invalid;
4667     }
4668     else if ( aExp < 0x3FFF ) {
4669         if (aExp || aSig) {
4670             status->float_exception_flags |= float_flag_inexact;
4671         }
4672         return 0;
4673     }
4674     shiftCount = 0x403E - aExp;
4675     savedASig = aSig;
4676     aSig >>= shiftCount;
4677     z = aSig;
4678     if ( aSign ) z = - z;
4679     if ( ( z < 0 ) ^ aSign ) {
4680  invalid:
4681         float_raise(float_flag_invalid, status);
4682         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4683     }
4684     if ( ( aSig<<shiftCount ) != savedASig ) {
4685         status->float_exception_flags |= float_flag_inexact;
4686     }
4687     return z;
4688 
4689 }
4690 
4691 /*----------------------------------------------------------------------------
4692 | Returns the result of converting the extended double-precision floating-
4693 | point value `a' to the 64-bit two's complement integer format.  The
4694 | conversion is performed according to the IEC/IEEE Standard for Binary
4695 | Floating-Point Arithmetic---which means in particular that the conversion
4696 | is rounded according to the current rounding mode.  If `a' is a NaN,
4697 | the largest positive integer is returned.  Otherwise, if the conversion
4698 | overflows, the largest integer with the same sign as `a' is returned.
4699 *----------------------------------------------------------------------------*/
4700 
4701 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4702 {
4703     flag aSign;
4704     int32_t aExp, shiftCount;
4705     uint64_t aSig, aSigExtra;
4706 
4707     if (floatx80_invalid_encoding(a)) {
4708         float_raise(float_flag_invalid, status);
4709         return 1ULL << 63;
4710     }
4711     aSig = extractFloatx80Frac( a );
4712     aExp = extractFloatx80Exp( a );
4713     aSign = extractFloatx80Sign( a );
4714     shiftCount = 0x403E - aExp;
4715     if ( shiftCount <= 0 ) {
4716         if ( shiftCount ) {
4717             float_raise(float_flag_invalid, status);
4718             if (    ! aSign
4719                  || (    ( aExp == 0x7FFF )
4720                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
4721                ) {
4722                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4723             }
4724             return (int64_t) LIT64( 0x8000000000000000 );
4725         }
4726         aSigExtra = 0;
4727     }
4728     else {
4729         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4730     }
4731     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4732 
4733 }
4734 
4735 /*----------------------------------------------------------------------------
4736 | Returns the result of converting the extended double-precision floating-
4737 | point value `a' to the 64-bit two's complement integer format.  The
4738 | conversion is performed according to the IEC/IEEE Standard for Binary
4739 | Floating-Point Arithmetic, except that the conversion is always rounded
4740 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4741 | Otherwise, if the conversion overflows, the largest integer with the same
4742 | sign as `a' is returned.
4743 *----------------------------------------------------------------------------*/
4744 
4745 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4746 {
4747     flag aSign;
4748     int32_t aExp, shiftCount;
4749     uint64_t aSig;
4750     int64_t z;
4751 
4752     if (floatx80_invalid_encoding(a)) {
4753         float_raise(float_flag_invalid, status);
4754         return 1ULL << 63;
4755     }
4756     aSig = extractFloatx80Frac( a );
4757     aExp = extractFloatx80Exp( a );
4758     aSign = extractFloatx80Sign( a );
4759     shiftCount = aExp - 0x403E;
4760     if ( 0 <= shiftCount ) {
4761         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4762         if ( ( a.high != 0xC03E ) || aSig ) {
4763             float_raise(float_flag_invalid, status);
4764             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4765                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4766             }
4767         }
4768         return (int64_t) LIT64( 0x8000000000000000 );
4769     }
4770     else if ( aExp < 0x3FFF ) {
4771         if (aExp | aSig) {
4772             status->float_exception_flags |= float_flag_inexact;
4773         }
4774         return 0;
4775     }
4776     z = aSig>>( - shiftCount );
4777     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4778         status->float_exception_flags |= float_flag_inexact;
4779     }
4780     if ( aSign ) z = - z;
4781     return z;
4782 
4783 }
4784 
4785 /*----------------------------------------------------------------------------
4786 | Returns the result of converting the extended double-precision floating-
4787 | point value `a' to the single-precision floating-point format.  The
4788 | conversion is performed according to the IEC/IEEE Standard for Binary
4789 | Floating-Point Arithmetic.
4790 *----------------------------------------------------------------------------*/
4791 
4792 float32 floatx80_to_float32(floatx80 a, float_status *status)
4793 {
4794     flag aSign;
4795     int32_t aExp;
4796     uint64_t aSig;
4797 
4798     if (floatx80_invalid_encoding(a)) {
4799         float_raise(float_flag_invalid, status);
4800         return float32_default_nan(status);
4801     }
4802     aSig = extractFloatx80Frac( a );
4803     aExp = extractFloatx80Exp( a );
4804     aSign = extractFloatx80Sign( a );
4805     if ( aExp == 0x7FFF ) {
4806         if ( (uint64_t) ( aSig<<1 ) ) {
4807             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4808         }
4809         return packFloat32( aSign, 0xFF, 0 );
4810     }
4811     shift64RightJamming( aSig, 33, &aSig );
4812     if ( aExp || aSig ) aExp -= 0x3F81;
4813     return roundAndPackFloat32(aSign, aExp, aSig, status);
4814 
4815 }
4816 
4817 /*----------------------------------------------------------------------------
4818 | Returns the result of converting the extended double-precision floating-
4819 | point value `a' to the double-precision floating-point format.  The
4820 | conversion is performed according to the IEC/IEEE Standard for Binary
4821 | Floating-Point Arithmetic.
4822 *----------------------------------------------------------------------------*/
4823 
4824 float64 floatx80_to_float64(floatx80 a, float_status *status)
4825 {
4826     flag aSign;
4827     int32_t aExp;
4828     uint64_t aSig, zSig;
4829 
4830     if (floatx80_invalid_encoding(a)) {
4831         float_raise(float_flag_invalid, status);
4832         return float64_default_nan(status);
4833     }
4834     aSig = extractFloatx80Frac( a );
4835     aExp = extractFloatx80Exp( a );
4836     aSign = extractFloatx80Sign( a );
4837     if ( aExp == 0x7FFF ) {
4838         if ( (uint64_t) ( aSig<<1 ) ) {
4839             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4840         }
4841         return packFloat64( aSign, 0x7FF, 0 );
4842     }
4843     shift64RightJamming( aSig, 1, &zSig );
4844     if ( aExp || aSig ) aExp -= 0x3C01;
4845     return roundAndPackFloat64(aSign, aExp, zSig, status);
4846 
4847 }
4848 
4849 /*----------------------------------------------------------------------------
4850 | Returns the result of converting the extended double-precision floating-
4851 | point value `a' to the quadruple-precision floating-point format.  The
4852 | conversion is performed according to the IEC/IEEE Standard for Binary
4853 | Floating-Point Arithmetic.
4854 *----------------------------------------------------------------------------*/
4855 
4856 float128 floatx80_to_float128(floatx80 a, float_status *status)
4857 {
4858     flag aSign;
4859     int aExp;
4860     uint64_t aSig, zSig0, zSig1;
4861 
4862     if (floatx80_invalid_encoding(a)) {
4863         float_raise(float_flag_invalid, status);
4864         return float128_default_nan(status);
4865     }
4866     aSig = extractFloatx80Frac( a );
4867     aExp = extractFloatx80Exp( a );
4868     aSign = extractFloatx80Sign( a );
4869     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4870         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4871     }
4872     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4873     return packFloat128( aSign, aExp, zSig0, zSig1 );
4874 
4875 }
4876 
4877 /*----------------------------------------------------------------------------
4878 | Rounds the extended double-precision floating-point value `a'
4879 | to the precision provided by floatx80_rounding_precision and returns the
4880 | result as an extended double-precision floating-point value.
4881 | The operation is performed according to the IEC/IEEE Standard for Binary
4882 | Floating-Point Arithmetic.
4883 *----------------------------------------------------------------------------*/
4884 
4885 floatx80 floatx80_round(floatx80 a, float_status *status)
4886 {
4887     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4888                                 extractFloatx80Sign(a),
4889                                 extractFloatx80Exp(a),
4890                                 extractFloatx80Frac(a), 0, status);
4891 }
4892 
4893 /*----------------------------------------------------------------------------
4894 | Rounds the extended double-precision floating-point value `a' to an integer,
4895 | and returns the result as an extended quadruple-precision floating-point
4896 | value.  The operation is performed according to the IEC/IEEE Standard for
4897 | Binary Floating-Point Arithmetic.
4898 *----------------------------------------------------------------------------*/
4899 
4900 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4901 {
4902     flag aSign;
4903     int32_t aExp;
4904     uint64_t lastBitMask, roundBitsMask;
4905     floatx80 z;
4906 
4907     if (floatx80_invalid_encoding(a)) {
4908         float_raise(float_flag_invalid, status);
4909         return floatx80_default_nan(status);
4910     }
4911     aExp = extractFloatx80Exp( a );
4912     if ( 0x403E <= aExp ) {
4913         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4914             return propagateFloatx80NaN(a, a, status);
4915         }
4916         return a;
4917     }
4918     if ( aExp < 0x3FFF ) {
4919         if (    ( aExp == 0 )
4920              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4921             return a;
4922         }
4923         status->float_exception_flags |= float_flag_inexact;
4924         aSign = extractFloatx80Sign( a );
4925         switch (status->float_rounding_mode) {
4926          case float_round_nearest_even:
4927             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4928                ) {
4929                 return
4930                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4931             }
4932             break;
4933         case float_round_ties_away:
4934             if (aExp == 0x3FFE) {
4935                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4936             }
4937             break;
4938          case float_round_down:
4939             return
4940                   aSign ?
4941                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4942                 : packFloatx80( 0, 0, 0 );
4943          case float_round_up:
4944             return
4945                   aSign ? packFloatx80( 1, 0, 0 )
4946                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4947         }
4948         return packFloatx80( aSign, 0, 0 );
4949     }
4950     lastBitMask = 1;
4951     lastBitMask <<= 0x403E - aExp;
4952     roundBitsMask = lastBitMask - 1;
4953     z = a;
4954     switch (status->float_rounding_mode) {
4955     case float_round_nearest_even:
4956         z.low += lastBitMask>>1;
4957         if ((z.low & roundBitsMask) == 0) {
4958             z.low &= ~lastBitMask;
4959         }
4960         break;
4961     case float_round_ties_away:
4962         z.low += lastBitMask >> 1;
4963         break;
4964     case float_round_to_zero:
4965         break;
4966     case float_round_up:
4967         if (!extractFloatx80Sign(z)) {
4968             z.low += roundBitsMask;
4969         }
4970         break;
4971     case float_round_down:
4972         if (extractFloatx80Sign(z)) {
4973             z.low += roundBitsMask;
4974         }
4975         break;
4976     default:
4977         abort();
4978     }
4979     z.low &= ~ roundBitsMask;
4980     if ( z.low == 0 ) {
4981         ++z.high;
4982         z.low = LIT64( 0x8000000000000000 );
4983     }
4984     if (z.low != a.low) {
4985         status->float_exception_flags |= float_flag_inexact;
4986     }
4987     return z;
4988 
4989 }
4990 
4991 /*----------------------------------------------------------------------------
4992 | Returns the result of adding the absolute values of the extended double-
4993 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4994 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4995 | The addition is performed according to the IEC/IEEE Standard for Binary
4996 | Floating-Point Arithmetic.
4997 *----------------------------------------------------------------------------*/
4998 
4999 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5000                                 float_status *status)
5001 {
5002     int32_t aExp, bExp, zExp;
5003     uint64_t aSig, bSig, zSig0, zSig1;
5004     int32_t expDiff;
5005 
5006     aSig = extractFloatx80Frac( a );
5007     aExp = extractFloatx80Exp( a );
5008     bSig = extractFloatx80Frac( b );
5009     bExp = extractFloatx80Exp( b );
5010     expDiff = aExp - bExp;
5011     if ( 0 < expDiff ) {
5012         if ( aExp == 0x7FFF ) {
5013             if ((uint64_t)(aSig << 1)) {
5014                 return propagateFloatx80NaN(a, b, status);
5015             }
5016             return a;
5017         }
5018         if ( bExp == 0 ) --expDiff;
5019         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5020         zExp = aExp;
5021     }
5022     else if ( expDiff < 0 ) {
5023         if ( bExp == 0x7FFF ) {
5024             if ((uint64_t)(bSig << 1)) {
5025                 return propagateFloatx80NaN(a, b, status);
5026             }
5027             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5028         }
5029         if ( aExp == 0 ) ++expDiff;
5030         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5031         zExp = bExp;
5032     }
5033     else {
5034         if ( aExp == 0x7FFF ) {
5035             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5036                 return propagateFloatx80NaN(a, b, status);
5037             }
5038             return a;
5039         }
5040         zSig1 = 0;
5041         zSig0 = aSig + bSig;
5042         if ( aExp == 0 ) {
5043             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5044             goto roundAndPack;
5045         }
5046         zExp = aExp;
5047         goto shiftRight1;
5048     }
5049     zSig0 = aSig + bSig;
5050     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5051  shiftRight1:
5052     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5053     zSig0 |= LIT64( 0x8000000000000000 );
5054     ++zExp;
5055  roundAndPack:
5056     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5057                                 zSign, zExp, zSig0, zSig1, status);
5058 }
5059 
5060 /*----------------------------------------------------------------------------
5061 | Returns the result of subtracting the absolute values of the extended
5062 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5063 | difference is negated before being returned.  `zSign' is ignored if the
5064 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5065 | Standard for Binary Floating-Point Arithmetic.
5066 *----------------------------------------------------------------------------*/
5067 
5068 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5069                                 float_status *status)
5070 {
5071     int32_t aExp, bExp, zExp;
5072     uint64_t aSig, bSig, zSig0, zSig1;
5073     int32_t expDiff;
5074 
5075     aSig = extractFloatx80Frac( a );
5076     aExp = extractFloatx80Exp( a );
5077     bSig = extractFloatx80Frac( b );
5078     bExp = extractFloatx80Exp( b );
5079     expDiff = aExp - bExp;
5080     if ( 0 < expDiff ) goto aExpBigger;
5081     if ( expDiff < 0 ) goto bExpBigger;
5082     if ( aExp == 0x7FFF ) {
5083         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5084             return propagateFloatx80NaN(a, b, status);
5085         }
5086         float_raise(float_flag_invalid, status);
5087         return floatx80_default_nan(status);
5088     }
5089     if ( aExp == 0 ) {
5090         aExp = 1;
5091         bExp = 1;
5092     }
5093     zSig1 = 0;
5094     if ( bSig < aSig ) goto aBigger;
5095     if ( aSig < bSig ) goto bBigger;
5096     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5097  bExpBigger:
5098     if ( bExp == 0x7FFF ) {
5099         if ((uint64_t)(bSig << 1)) {
5100             return propagateFloatx80NaN(a, b, status);
5101         }
5102         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5103     }
5104     if ( aExp == 0 ) ++expDiff;
5105     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5106  bBigger:
5107     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5108     zExp = bExp;
5109     zSign ^= 1;
5110     goto normalizeRoundAndPack;
5111  aExpBigger:
5112     if ( aExp == 0x7FFF ) {
5113         if ((uint64_t)(aSig << 1)) {
5114             return propagateFloatx80NaN(a, b, status);
5115         }
5116         return a;
5117     }
5118     if ( bExp == 0 ) --expDiff;
5119     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5120  aBigger:
5121     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5122     zExp = aExp;
5123  normalizeRoundAndPack:
5124     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5125                                          zSign, zExp, zSig0, zSig1, status);
5126 }
5127 
5128 /*----------------------------------------------------------------------------
5129 | Returns the result of adding the extended double-precision floating-point
5130 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5131 | Standard for Binary Floating-Point Arithmetic.
5132 *----------------------------------------------------------------------------*/
5133 
5134 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5135 {
5136     flag aSign, bSign;
5137 
5138     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5139         float_raise(float_flag_invalid, status);
5140         return floatx80_default_nan(status);
5141     }
5142     aSign = extractFloatx80Sign( a );
5143     bSign = extractFloatx80Sign( b );
5144     if ( aSign == bSign ) {
5145         return addFloatx80Sigs(a, b, aSign, status);
5146     }
5147     else {
5148         return subFloatx80Sigs(a, b, aSign, status);
5149     }
5150 
5151 }
5152 
5153 /*----------------------------------------------------------------------------
5154 | Returns the result of subtracting the extended double-precision floating-
5155 | point values `a' and `b'.  The operation is performed according to the
5156 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5157 *----------------------------------------------------------------------------*/
5158 
5159 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5160 {
5161     flag aSign, bSign;
5162 
5163     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5164         float_raise(float_flag_invalid, status);
5165         return floatx80_default_nan(status);
5166     }
5167     aSign = extractFloatx80Sign( a );
5168     bSign = extractFloatx80Sign( b );
5169     if ( aSign == bSign ) {
5170         return subFloatx80Sigs(a, b, aSign, status);
5171     }
5172     else {
5173         return addFloatx80Sigs(a, b, aSign, status);
5174     }
5175 
5176 }
5177 
5178 /*----------------------------------------------------------------------------
5179 | Returns the result of multiplying the extended double-precision floating-
5180 | point values `a' and `b'.  The operation is performed according to the
5181 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5182 *----------------------------------------------------------------------------*/
5183 
5184 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5185 {
5186     flag aSign, bSign, zSign;
5187     int32_t aExp, bExp, zExp;
5188     uint64_t aSig, bSig, zSig0, zSig1;
5189 
5190     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5191         float_raise(float_flag_invalid, status);
5192         return floatx80_default_nan(status);
5193     }
5194     aSig = extractFloatx80Frac( a );
5195     aExp = extractFloatx80Exp( a );
5196     aSign = extractFloatx80Sign( a );
5197     bSig = extractFloatx80Frac( b );
5198     bExp = extractFloatx80Exp( b );
5199     bSign = extractFloatx80Sign( b );
5200     zSign = aSign ^ bSign;
5201     if ( aExp == 0x7FFF ) {
5202         if (    (uint64_t) ( aSig<<1 )
5203              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5204             return propagateFloatx80NaN(a, b, status);
5205         }
5206         if ( ( bExp | bSig ) == 0 ) goto invalid;
5207         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5208     }
5209     if ( bExp == 0x7FFF ) {
5210         if ((uint64_t)(bSig << 1)) {
5211             return propagateFloatx80NaN(a, b, status);
5212         }
5213         if ( ( aExp | aSig ) == 0 ) {
5214  invalid:
5215             float_raise(float_flag_invalid, status);
5216             return floatx80_default_nan(status);
5217         }
5218         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5219     }
5220     if ( aExp == 0 ) {
5221         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5222         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5223     }
5224     if ( bExp == 0 ) {
5225         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5226         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5227     }
5228     zExp = aExp + bExp - 0x3FFE;
5229     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5230     if ( 0 < (int64_t) zSig0 ) {
5231         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5232         --zExp;
5233     }
5234     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5235                                 zSign, zExp, zSig0, zSig1, status);
5236 }
5237 
5238 /*----------------------------------------------------------------------------
5239 | Returns the result of dividing the extended double-precision floating-point
5240 | value `a' by the corresponding value `b'.  The operation is performed
5241 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5242 *----------------------------------------------------------------------------*/
5243 
5244 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5245 {
5246     flag aSign, bSign, zSign;
5247     int32_t aExp, bExp, zExp;
5248     uint64_t aSig, bSig, zSig0, zSig1;
5249     uint64_t rem0, rem1, rem2, term0, term1, term2;
5250 
5251     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5252         float_raise(float_flag_invalid, status);
5253         return floatx80_default_nan(status);
5254     }
5255     aSig = extractFloatx80Frac( a );
5256     aExp = extractFloatx80Exp( a );
5257     aSign = extractFloatx80Sign( a );
5258     bSig = extractFloatx80Frac( b );
5259     bExp = extractFloatx80Exp( b );
5260     bSign = extractFloatx80Sign( b );
5261     zSign = aSign ^ bSign;
5262     if ( aExp == 0x7FFF ) {
5263         if ((uint64_t)(aSig << 1)) {
5264             return propagateFloatx80NaN(a, b, status);
5265         }
5266         if ( bExp == 0x7FFF ) {
5267             if ((uint64_t)(bSig << 1)) {
5268                 return propagateFloatx80NaN(a, b, status);
5269             }
5270             goto invalid;
5271         }
5272         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5273     }
5274     if ( bExp == 0x7FFF ) {
5275         if ((uint64_t)(bSig << 1)) {
5276             return propagateFloatx80NaN(a, b, status);
5277         }
5278         return packFloatx80( zSign, 0, 0 );
5279     }
5280     if ( bExp == 0 ) {
5281         if ( bSig == 0 ) {
5282             if ( ( aExp | aSig ) == 0 ) {
5283  invalid:
5284                 float_raise(float_flag_invalid, status);
5285                 return floatx80_default_nan(status);
5286             }
5287             float_raise(float_flag_divbyzero, status);
5288             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5289         }
5290         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5291     }
5292     if ( aExp == 0 ) {
5293         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5294         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5295     }
5296     zExp = aExp - bExp + 0x3FFE;
5297     rem1 = 0;
5298     if ( bSig <= aSig ) {
5299         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5300         ++zExp;
5301     }
5302     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5303     mul64To128( bSig, zSig0, &term0, &term1 );
5304     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5305     while ( (int64_t) rem0 < 0 ) {
5306         --zSig0;
5307         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5308     }
5309     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5310     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5311         mul64To128( bSig, zSig1, &term1, &term2 );
5312         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5313         while ( (int64_t) rem1 < 0 ) {
5314             --zSig1;
5315             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5316         }
5317         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5318     }
5319     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5320                                 zSign, zExp, zSig0, zSig1, status);
5321 }
5322 
5323 /*----------------------------------------------------------------------------
5324 | Returns the remainder of the extended double-precision floating-point value
5325 | `a' with respect to the corresponding value `b'.  The operation is performed
5326 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5327 *----------------------------------------------------------------------------*/
5328 
5329 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5330 {
5331     flag aSign, zSign;
5332     int32_t aExp, bExp, expDiff;
5333     uint64_t aSig0, aSig1, bSig;
5334     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5335 
5336     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5337         float_raise(float_flag_invalid, status);
5338         return floatx80_default_nan(status);
5339     }
5340     aSig0 = extractFloatx80Frac( a );
5341     aExp = extractFloatx80Exp( a );
5342     aSign = extractFloatx80Sign( a );
5343     bSig = extractFloatx80Frac( b );
5344     bExp = extractFloatx80Exp( b );
5345     if ( aExp == 0x7FFF ) {
5346         if (    (uint64_t) ( aSig0<<1 )
5347              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5348             return propagateFloatx80NaN(a, b, status);
5349         }
5350         goto invalid;
5351     }
5352     if ( bExp == 0x7FFF ) {
5353         if ((uint64_t)(bSig << 1)) {
5354             return propagateFloatx80NaN(a, b, status);
5355         }
5356         return a;
5357     }
5358     if ( bExp == 0 ) {
5359         if ( bSig == 0 ) {
5360  invalid:
5361             float_raise(float_flag_invalid, status);
5362             return floatx80_default_nan(status);
5363         }
5364         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5365     }
5366     if ( aExp == 0 ) {
5367         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5368         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5369     }
5370     bSig |= LIT64( 0x8000000000000000 );
5371     zSign = aSign;
5372     expDiff = aExp - bExp;
5373     aSig1 = 0;
5374     if ( expDiff < 0 ) {
5375         if ( expDiff < -1 ) return a;
5376         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5377         expDiff = 0;
5378     }
5379     q = ( bSig <= aSig0 );
5380     if ( q ) aSig0 -= bSig;
5381     expDiff -= 64;
5382     while ( 0 < expDiff ) {
5383         q = estimateDiv128To64( aSig0, aSig1, bSig );
5384         q = ( 2 < q ) ? q - 2 : 0;
5385         mul64To128( bSig, q, &term0, &term1 );
5386         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5387         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5388         expDiff -= 62;
5389     }
5390     expDiff += 64;
5391     if ( 0 < expDiff ) {
5392         q = estimateDiv128To64( aSig0, aSig1, bSig );
5393         q = ( 2 < q ) ? q - 2 : 0;
5394         q >>= 64 - expDiff;
5395         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5396         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5397         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5398         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5399             ++q;
5400             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5401         }
5402     }
5403     else {
5404         term1 = 0;
5405         term0 = bSig;
5406     }
5407     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5408     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5409          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5410               && ( q & 1 ) )
5411        ) {
5412         aSig0 = alternateASig0;
5413         aSig1 = alternateASig1;
5414         zSign = ! zSign;
5415     }
5416     return
5417         normalizeRoundAndPackFloatx80(
5418             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5419 
5420 }
5421 
5422 /*----------------------------------------------------------------------------
5423 | Returns the square root of the extended double-precision floating-point
5424 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5425 | for Binary Floating-Point Arithmetic.
5426 *----------------------------------------------------------------------------*/
5427 
5428 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5429 {
5430     flag aSign;
5431     int32_t aExp, zExp;
5432     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5433     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5434 
5435     if (floatx80_invalid_encoding(a)) {
5436         float_raise(float_flag_invalid, status);
5437         return floatx80_default_nan(status);
5438     }
5439     aSig0 = extractFloatx80Frac( a );
5440     aExp = extractFloatx80Exp( a );
5441     aSign = extractFloatx80Sign( a );
5442     if ( aExp == 0x7FFF ) {
5443         if ((uint64_t)(aSig0 << 1)) {
5444             return propagateFloatx80NaN(a, a, status);
5445         }
5446         if ( ! aSign ) return a;
5447         goto invalid;
5448     }
5449     if ( aSign ) {
5450         if ( ( aExp | aSig0 ) == 0 ) return a;
5451  invalid:
5452         float_raise(float_flag_invalid, status);
5453         return floatx80_default_nan(status);
5454     }
5455     if ( aExp == 0 ) {
5456         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5457         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5458     }
5459     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5460     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5461     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5462     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5463     doubleZSig0 = zSig0<<1;
5464     mul64To128( zSig0, zSig0, &term0, &term1 );
5465     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5466     while ( (int64_t) rem0 < 0 ) {
5467         --zSig0;
5468         doubleZSig0 -= 2;
5469         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5470     }
5471     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5472     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5473         if ( zSig1 == 0 ) zSig1 = 1;
5474         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5475         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5476         mul64To128( zSig1, zSig1, &term2, &term3 );
5477         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5478         while ( (int64_t) rem1 < 0 ) {
5479             --zSig1;
5480             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5481             term3 |= 1;
5482             term2 |= doubleZSig0;
5483             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5484         }
5485         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5486     }
5487     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5488     zSig0 |= doubleZSig0;
5489     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5490                                 0, zExp, zSig0, zSig1, status);
5491 }
5492 
5493 /*----------------------------------------------------------------------------
5494 | Returns 1 if the extended double-precision floating-point value `a' is equal
5495 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5496 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5497 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5498 *----------------------------------------------------------------------------*/
5499 
5500 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5501 {
5502 
5503     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5504         || (extractFloatx80Exp(a) == 0x7FFF
5505             && (uint64_t) (extractFloatx80Frac(a) << 1))
5506         || (extractFloatx80Exp(b) == 0x7FFF
5507             && (uint64_t) (extractFloatx80Frac(b) << 1))
5508        ) {
5509         float_raise(float_flag_invalid, status);
5510         return 0;
5511     }
5512     return
5513            ( a.low == b.low )
5514         && (    ( a.high == b.high )
5515              || (    ( a.low == 0 )
5516                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5517            );
5518 
5519 }
5520 
5521 /*----------------------------------------------------------------------------
5522 | Returns 1 if the extended double-precision floating-point value `a' is
5523 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5524 | invalid exception is raised if either operand is a NaN.  The comparison is
5525 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5526 | Arithmetic.
5527 *----------------------------------------------------------------------------*/
5528 
5529 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5530 {
5531     flag aSign, bSign;
5532 
5533     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5534         || (extractFloatx80Exp(a) == 0x7FFF
5535             && (uint64_t) (extractFloatx80Frac(a) << 1))
5536         || (extractFloatx80Exp(b) == 0x7FFF
5537             && (uint64_t) (extractFloatx80Frac(b) << 1))
5538        ) {
5539         float_raise(float_flag_invalid, status);
5540         return 0;
5541     }
5542     aSign = extractFloatx80Sign( a );
5543     bSign = extractFloatx80Sign( b );
5544     if ( aSign != bSign ) {
5545         return
5546                aSign
5547             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5548                  == 0 );
5549     }
5550     return
5551           aSign ? le128( b.high, b.low, a.high, a.low )
5552         : le128( a.high, a.low, b.high, b.low );
5553 
5554 }
5555 
5556 /*----------------------------------------------------------------------------
5557 | Returns 1 if the extended double-precision floating-point value `a' is
5558 | less than the corresponding value `b', and 0 otherwise.  The invalid
5559 | exception is raised if either operand is a NaN.  The comparison is performed
5560 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5561 *----------------------------------------------------------------------------*/
5562 
5563 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5564 {
5565     flag aSign, bSign;
5566 
5567     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5568         || (extractFloatx80Exp(a) == 0x7FFF
5569             && (uint64_t) (extractFloatx80Frac(a) << 1))
5570         || (extractFloatx80Exp(b) == 0x7FFF
5571             && (uint64_t) (extractFloatx80Frac(b) << 1))
5572        ) {
5573         float_raise(float_flag_invalid, status);
5574         return 0;
5575     }
5576     aSign = extractFloatx80Sign( a );
5577     bSign = extractFloatx80Sign( b );
5578     if ( aSign != bSign ) {
5579         return
5580                aSign
5581             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5582                  != 0 );
5583     }
5584     return
5585           aSign ? lt128( b.high, b.low, a.high, a.low )
5586         : lt128( a.high, a.low, b.high, b.low );
5587 
5588 }
5589 
5590 /*----------------------------------------------------------------------------
5591 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5592 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5593 | either operand is a NaN.   The comparison is performed according to the
5594 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5595 *----------------------------------------------------------------------------*/
5596 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5597 {
5598     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5599         || (extractFloatx80Exp(a) == 0x7FFF
5600             && (uint64_t) (extractFloatx80Frac(a) << 1))
5601         || (extractFloatx80Exp(b) == 0x7FFF
5602             && (uint64_t) (extractFloatx80Frac(b) << 1))
5603        ) {
5604         float_raise(float_flag_invalid, status);
5605         return 1;
5606     }
5607     return 0;
5608 }
5609 
5610 /*----------------------------------------------------------------------------
5611 | Returns 1 if the extended double-precision floating-point value `a' is
5612 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5613 | cause an exception.  The comparison is performed according to the IEC/IEEE
5614 | Standard for Binary Floating-Point Arithmetic.
5615 *----------------------------------------------------------------------------*/
5616 
5617 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5618 {
5619 
5620     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5621         float_raise(float_flag_invalid, status);
5622         return 0;
5623     }
5624     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5625               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5626          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5627               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5628        ) {
5629         if (floatx80_is_signaling_nan(a, status)
5630          || floatx80_is_signaling_nan(b, status)) {
5631             float_raise(float_flag_invalid, status);
5632         }
5633         return 0;
5634     }
5635     return
5636            ( a.low == b.low )
5637         && (    ( a.high == b.high )
5638              || (    ( a.low == 0 )
5639                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5640            );
5641 
5642 }
5643 
5644 /*----------------------------------------------------------------------------
5645 | Returns 1 if the extended double-precision floating-point value `a' is less
5646 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5647 | do not cause an exception.  Otherwise, the comparison is performed according
5648 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5649 *----------------------------------------------------------------------------*/
5650 
5651 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5652 {
5653     flag aSign, bSign;
5654 
5655     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5656         float_raise(float_flag_invalid, status);
5657         return 0;
5658     }
5659     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5660               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5661          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5662               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5663        ) {
5664         if (floatx80_is_signaling_nan(a, status)
5665          || floatx80_is_signaling_nan(b, status)) {
5666             float_raise(float_flag_invalid, status);
5667         }
5668         return 0;
5669     }
5670     aSign = extractFloatx80Sign( a );
5671     bSign = extractFloatx80Sign( b );
5672     if ( aSign != bSign ) {
5673         return
5674                aSign
5675             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5676                  == 0 );
5677     }
5678     return
5679           aSign ? le128( b.high, b.low, a.high, a.low )
5680         : le128( a.high, a.low, b.high, b.low );
5681 
5682 }
5683 
5684 /*----------------------------------------------------------------------------
5685 | Returns 1 if the extended double-precision floating-point value `a' is less
5686 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5687 | an exception.  Otherwise, the comparison is performed according to the
5688 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5689 *----------------------------------------------------------------------------*/
5690 
5691 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5692 {
5693     flag aSign, bSign;
5694 
5695     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5696         float_raise(float_flag_invalid, status);
5697         return 0;
5698     }
5699     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5700               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5701          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5702               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5703        ) {
5704         if (floatx80_is_signaling_nan(a, status)
5705          || floatx80_is_signaling_nan(b, status)) {
5706             float_raise(float_flag_invalid, status);
5707         }
5708         return 0;
5709     }
5710     aSign = extractFloatx80Sign( a );
5711     bSign = extractFloatx80Sign( b );
5712     if ( aSign != bSign ) {
5713         return
5714                aSign
5715             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5716                  != 0 );
5717     }
5718     return
5719           aSign ? lt128( b.high, b.low, a.high, a.low )
5720         : lt128( a.high, a.low, b.high, b.low );
5721 
5722 }
5723 
5724 /*----------------------------------------------------------------------------
5725 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5726 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5727 | The comparison is performed according to the IEC/IEEE Standard for Binary
5728 | Floating-Point Arithmetic.
5729 *----------------------------------------------------------------------------*/
5730 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5731 {
5732     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5733         float_raise(float_flag_invalid, status);
5734         return 1;
5735     }
5736     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5737               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5738          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5739               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5740        ) {
5741         if (floatx80_is_signaling_nan(a, status)
5742          || floatx80_is_signaling_nan(b, status)) {
5743             float_raise(float_flag_invalid, status);
5744         }
5745         return 1;
5746     }
5747     return 0;
5748 }
5749 
5750 /*----------------------------------------------------------------------------
5751 | Returns the result of converting the quadruple-precision floating-point
5752 | value `a' to the 32-bit two's complement integer format.  The conversion
5753 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5754 | Arithmetic---which means in particular that the conversion is rounded
5755 | according to the current rounding mode.  If `a' is a NaN, the largest
5756 | positive integer is returned.  Otherwise, if the conversion overflows, the
5757 | largest integer with the same sign as `a' is returned.
5758 *----------------------------------------------------------------------------*/
5759 
5760 int32_t float128_to_int32(float128 a, float_status *status)
5761 {
5762     flag aSign;
5763     int32_t aExp, shiftCount;
5764     uint64_t aSig0, aSig1;
5765 
5766     aSig1 = extractFloat128Frac1( a );
5767     aSig0 = extractFloat128Frac0( a );
5768     aExp = extractFloat128Exp( a );
5769     aSign = extractFloat128Sign( a );
5770     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5771     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5772     aSig0 |= ( aSig1 != 0 );
5773     shiftCount = 0x4028 - aExp;
5774     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5775     return roundAndPackInt32(aSign, aSig0, status);
5776 
5777 }
5778 
5779 /*----------------------------------------------------------------------------
5780 | Returns the result of converting the quadruple-precision floating-point
5781 | value `a' to the 32-bit two's complement integer format.  The conversion
5782 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5783 | Arithmetic, except that the conversion is always rounded toward zero.  If
5784 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5785 | conversion overflows, the largest integer with the same sign as `a' is
5786 | returned.
5787 *----------------------------------------------------------------------------*/
5788 
5789 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5790 {
5791     flag aSign;
5792     int32_t aExp, shiftCount;
5793     uint64_t aSig0, aSig1, savedASig;
5794     int32_t z;
5795 
5796     aSig1 = extractFloat128Frac1( a );
5797     aSig0 = extractFloat128Frac0( a );
5798     aExp = extractFloat128Exp( a );
5799     aSign = extractFloat128Sign( a );
5800     aSig0 |= ( aSig1 != 0 );
5801     if ( 0x401E < aExp ) {
5802         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5803         goto invalid;
5804     }
5805     else if ( aExp < 0x3FFF ) {
5806         if (aExp || aSig0) {
5807             status->float_exception_flags |= float_flag_inexact;
5808         }
5809         return 0;
5810     }
5811     aSig0 |= LIT64( 0x0001000000000000 );
5812     shiftCount = 0x402F - aExp;
5813     savedASig = aSig0;
5814     aSig0 >>= shiftCount;
5815     z = aSig0;
5816     if ( aSign ) z = - z;
5817     if ( ( z < 0 ) ^ aSign ) {
5818  invalid:
5819         float_raise(float_flag_invalid, status);
5820         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5821     }
5822     if ( ( aSig0<<shiftCount ) != savedASig ) {
5823         status->float_exception_flags |= float_flag_inexact;
5824     }
5825     return z;
5826 
5827 }
5828 
5829 /*----------------------------------------------------------------------------
5830 | Returns the result of converting the quadruple-precision floating-point
5831 | value `a' to the 64-bit two's complement integer format.  The conversion
5832 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5833 | Arithmetic---which means in particular that the conversion is rounded
5834 | according to the current rounding mode.  If `a' is a NaN, the largest
5835 | positive integer is returned.  Otherwise, if the conversion overflows, the
5836 | largest integer with the same sign as `a' is returned.
5837 *----------------------------------------------------------------------------*/
5838 
5839 int64_t float128_to_int64(float128 a, float_status *status)
5840 {
5841     flag aSign;
5842     int32_t aExp, shiftCount;
5843     uint64_t aSig0, aSig1;
5844 
5845     aSig1 = extractFloat128Frac1( a );
5846     aSig0 = extractFloat128Frac0( a );
5847     aExp = extractFloat128Exp( a );
5848     aSign = extractFloat128Sign( a );
5849     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5850     shiftCount = 0x402F - aExp;
5851     if ( shiftCount <= 0 ) {
5852         if ( 0x403E < aExp ) {
5853             float_raise(float_flag_invalid, status);
5854             if (    ! aSign
5855                  || (    ( aExp == 0x7FFF )
5856                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5857                     )
5858                ) {
5859                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5860             }
5861             return (int64_t) LIT64( 0x8000000000000000 );
5862         }
5863         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5864     }
5865     else {
5866         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5867     }
5868     return roundAndPackInt64(aSign, aSig0, aSig1, status);
5869 
5870 }
5871 
5872 /*----------------------------------------------------------------------------
5873 | Returns the result of converting the quadruple-precision floating-point
5874 | value `a' to the 64-bit two's complement integer format.  The conversion
5875 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5876 | Arithmetic, except that the conversion is always rounded toward zero.
5877 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5878 | the conversion overflows, the largest integer with the same sign as `a' is
5879 | returned.
5880 *----------------------------------------------------------------------------*/
5881 
5882 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5883 {
5884     flag aSign;
5885     int32_t aExp, shiftCount;
5886     uint64_t aSig0, aSig1;
5887     int64_t z;
5888 
5889     aSig1 = extractFloat128Frac1( a );
5890     aSig0 = extractFloat128Frac0( a );
5891     aExp = extractFloat128Exp( a );
5892     aSign = extractFloat128Sign( a );
5893     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5894     shiftCount = aExp - 0x402F;
5895     if ( 0 < shiftCount ) {
5896         if ( 0x403E <= aExp ) {
5897             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5898             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5899                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5900                 if (aSig1) {
5901                     status->float_exception_flags |= float_flag_inexact;
5902                 }
5903             }
5904             else {
5905                 float_raise(float_flag_invalid, status);
5906                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5907                     return LIT64( 0x7FFFFFFFFFFFFFFF );
5908                 }
5909             }
5910             return (int64_t) LIT64( 0x8000000000000000 );
5911         }
5912         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5913         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5914             status->float_exception_flags |= float_flag_inexact;
5915         }
5916     }
5917     else {
5918         if ( aExp < 0x3FFF ) {
5919             if ( aExp | aSig0 | aSig1 ) {
5920                 status->float_exception_flags |= float_flag_inexact;
5921             }
5922             return 0;
5923         }
5924         z = aSig0>>( - shiftCount );
5925         if (    aSig1
5926              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5927             status->float_exception_flags |= float_flag_inexact;
5928         }
5929     }
5930     if ( aSign ) z = - z;
5931     return z;
5932 
5933 }
5934 
5935 /*----------------------------------------------------------------------------
5936 | Returns the result of converting the quadruple-precision floating-point value
5937 | `a' to the 64-bit unsigned integer format.  The conversion is
5938 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5939 | Arithmetic---which means in particular that the conversion is rounded
5940 | according to the current rounding mode.  If `a' is a NaN, the largest
5941 | positive integer is returned.  If the conversion overflows, the
5942 | largest unsigned integer is returned.  If 'a' is negative, the value is
5943 | rounded and zero is returned; negative values that do not round to zero
5944 | will raise the inexact exception.
5945 *----------------------------------------------------------------------------*/
5946 
5947 uint64_t float128_to_uint64(float128 a, float_status *status)
5948 {
5949     flag aSign;
5950     int aExp;
5951     int shiftCount;
5952     uint64_t aSig0, aSig1;
5953 
5954     aSig0 = extractFloat128Frac0(a);
5955     aSig1 = extractFloat128Frac1(a);
5956     aExp = extractFloat128Exp(a);
5957     aSign = extractFloat128Sign(a);
5958     if (aSign && (aExp > 0x3FFE)) {
5959         float_raise(float_flag_invalid, status);
5960         if (float128_is_any_nan(a)) {
5961             return LIT64(0xFFFFFFFFFFFFFFFF);
5962         } else {
5963             return 0;
5964         }
5965     }
5966     if (aExp) {
5967         aSig0 |= LIT64(0x0001000000000000);
5968     }
5969     shiftCount = 0x402F - aExp;
5970     if (shiftCount <= 0) {
5971         if (0x403E < aExp) {
5972             float_raise(float_flag_invalid, status);
5973             return LIT64(0xFFFFFFFFFFFFFFFF);
5974         }
5975         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5976     } else {
5977         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5978     }
5979     return roundAndPackUint64(aSign, aSig0, aSig1, status);
5980 }
5981 
5982 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5983 {
5984     uint64_t v;
5985     signed char current_rounding_mode = status->float_rounding_mode;
5986 
5987     set_float_rounding_mode(float_round_to_zero, status);
5988     v = float128_to_uint64(a, status);
5989     set_float_rounding_mode(current_rounding_mode, status);
5990 
5991     return v;
5992 }
5993 
5994 /*----------------------------------------------------------------------------
5995 | Returns the result of converting the quadruple-precision floating-point
5996 | value `a' to the 32-bit unsigned integer format.  The conversion
5997 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5998 | Arithmetic except that the conversion is always rounded toward zero.
5999 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6000 | if the conversion overflows, the largest unsigned integer is returned.
6001 | If 'a' is negative, the value is rounded and zero is returned; negative
6002 | values that do not round to zero will raise the inexact exception.
6003 *----------------------------------------------------------------------------*/
6004 
6005 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6006 {
6007     uint64_t v;
6008     uint32_t res;
6009     int old_exc_flags = get_float_exception_flags(status);
6010 
6011     v = float128_to_uint64_round_to_zero(a, status);
6012     if (v > 0xffffffff) {
6013         res = 0xffffffff;
6014     } else {
6015         return v;
6016     }
6017     set_float_exception_flags(old_exc_flags, status);
6018     float_raise(float_flag_invalid, status);
6019     return res;
6020 }
6021 
6022 /*----------------------------------------------------------------------------
6023 | Returns the result of converting the quadruple-precision floating-point
6024 | value `a' to the single-precision floating-point format.  The conversion
6025 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6026 | Arithmetic.
6027 *----------------------------------------------------------------------------*/
6028 
6029 float32 float128_to_float32(float128 a, float_status *status)
6030 {
6031     flag aSign;
6032     int32_t aExp;
6033     uint64_t aSig0, aSig1;
6034     uint32_t zSig;
6035 
6036     aSig1 = extractFloat128Frac1( a );
6037     aSig0 = extractFloat128Frac0( a );
6038     aExp = extractFloat128Exp( a );
6039     aSign = extractFloat128Sign( a );
6040     if ( aExp == 0x7FFF ) {
6041         if ( aSig0 | aSig1 ) {
6042             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6043         }
6044         return packFloat32( aSign, 0xFF, 0 );
6045     }
6046     aSig0 |= ( aSig1 != 0 );
6047     shift64RightJamming( aSig0, 18, &aSig0 );
6048     zSig = aSig0;
6049     if ( aExp || zSig ) {
6050         zSig |= 0x40000000;
6051         aExp -= 0x3F81;
6052     }
6053     return roundAndPackFloat32(aSign, aExp, zSig, status);
6054 
6055 }
6056 
6057 /*----------------------------------------------------------------------------
6058 | Returns the result of converting the quadruple-precision floating-point
6059 | value `a' to the double-precision floating-point format.  The conversion
6060 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6061 | Arithmetic.
6062 *----------------------------------------------------------------------------*/
6063 
6064 float64 float128_to_float64(float128 a, float_status *status)
6065 {
6066     flag aSign;
6067     int32_t aExp;
6068     uint64_t aSig0, aSig1;
6069 
6070     aSig1 = extractFloat128Frac1( a );
6071     aSig0 = extractFloat128Frac0( a );
6072     aExp = extractFloat128Exp( a );
6073     aSign = extractFloat128Sign( a );
6074     if ( aExp == 0x7FFF ) {
6075         if ( aSig0 | aSig1 ) {
6076             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6077         }
6078         return packFloat64( aSign, 0x7FF, 0 );
6079     }
6080     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6081     aSig0 |= ( aSig1 != 0 );
6082     if ( aExp || aSig0 ) {
6083         aSig0 |= LIT64( 0x4000000000000000 );
6084         aExp -= 0x3C01;
6085     }
6086     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6087 
6088 }
6089 
6090 /*----------------------------------------------------------------------------
6091 | Returns the result of converting the quadruple-precision floating-point
6092 | value `a' to the extended double-precision floating-point format.  The
6093 | conversion is performed according to the IEC/IEEE Standard for Binary
6094 | Floating-Point Arithmetic.
6095 *----------------------------------------------------------------------------*/
6096 
6097 floatx80 float128_to_floatx80(float128 a, float_status *status)
6098 {
6099     flag aSign;
6100     int32_t aExp;
6101     uint64_t aSig0, aSig1;
6102 
6103     aSig1 = extractFloat128Frac1( a );
6104     aSig0 = extractFloat128Frac0( a );
6105     aExp = extractFloat128Exp( a );
6106     aSign = extractFloat128Sign( a );
6107     if ( aExp == 0x7FFF ) {
6108         if ( aSig0 | aSig1 ) {
6109             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6110         }
6111         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6112     }
6113     if ( aExp == 0 ) {
6114         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6115         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6116     }
6117     else {
6118         aSig0 |= LIT64( 0x0001000000000000 );
6119     }
6120     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6121     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6122 
6123 }
6124 
6125 /*----------------------------------------------------------------------------
6126 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6127 | returns the result as a quadruple-precision floating-point value.  The
6128 | operation is performed according to the IEC/IEEE Standard for Binary
6129 | Floating-Point Arithmetic.
6130 *----------------------------------------------------------------------------*/
6131 
6132 float128 float128_round_to_int(float128 a, float_status *status)
6133 {
6134     flag aSign;
6135     int32_t aExp;
6136     uint64_t lastBitMask, roundBitsMask;
6137     float128 z;
6138 
6139     aExp = extractFloat128Exp( a );
6140     if ( 0x402F <= aExp ) {
6141         if ( 0x406F <= aExp ) {
6142             if (    ( aExp == 0x7FFF )
6143                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6144                ) {
6145                 return propagateFloat128NaN(a, a, status);
6146             }
6147             return a;
6148         }
6149         lastBitMask = 1;
6150         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6151         roundBitsMask = lastBitMask - 1;
6152         z = a;
6153         switch (status->float_rounding_mode) {
6154         case float_round_nearest_even:
6155             if ( lastBitMask ) {
6156                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6157                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6158             }
6159             else {
6160                 if ( (int64_t) z.low < 0 ) {
6161                     ++z.high;
6162                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6163                 }
6164             }
6165             break;
6166         case float_round_ties_away:
6167             if (lastBitMask) {
6168                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6169             } else {
6170                 if ((int64_t) z.low < 0) {
6171                     ++z.high;
6172                 }
6173             }
6174             break;
6175         case float_round_to_zero:
6176             break;
6177         case float_round_up:
6178             if (!extractFloat128Sign(z)) {
6179                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6180             }
6181             break;
6182         case float_round_down:
6183             if (extractFloat128Sign(z)) {
6184                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6185             }
6186             break;
6187         default:
6188             abort();
6189         }
6190         z.low &= ~ roundBitsMask;
6191     }
6192     else {
6193         if ( aExp < 0x3FFF ) {
6194             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6195             status->float_exception_flags |= float_flag_inexact;
6196             aSign = extractFloat128Sign( a );
6197             switch (status->float_rounding_mode) {
6198              case float_round_nearest_even:
6199                 if (    ( aExp == 0x3FFE )
6200                      && (   extractFloat128Frac0( a )
6201                           | extractFloat128Frac1( a ) )
6202                    ) {
6203                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6204                 }
6205                 break;
6206             case float_round_ties_away:
6207                 if (aExp == 0x3FFE) {
6208                     return packFloat128(aSign, 0x3FFF, 0, 0);
6209                 }
6210                 break;
6211              case float_round_down:
6212                 return
6213                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6214                     : packFloat128( 0, 0, 0, 0 );
6215              case float_round_up:
6216                 return
6217                       aSign ? packFloat128( 1, 0, 0, 0 )
6218                     : packFloat128( 0, 0x3FFF, 0, 0 );
6219             }
6220             return packFloat128( aSign, 0, 0, 0 );
6221         }
6222         lastBitMask = 1;
6223         lastBitMask <<= 0x402F - aExp;
6224         roundBitsMask = lastBitMask - 1;
6225         z.low = 0;
6226         z.high = a.high;
6227         switch (status->float_rounding_mode) {
6228         case float_round_nearest_even:
6229             z.high += lastBitMask>>1;
6230             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6231                 z.high &= ~ lastBitMask;
6232             }
6233             break;
6234         case float_round_ties_away:
6235             z.high += lastBitMask>>1;
6236             break;
6237         case float_round_to_zero:
6238             break;
6239         case float_round_up:
6240             if (!extractFloat128Sign(z)) {
6241                 z.high |= ( a.low != 0 );
6242                 z.high += roundBitsMask;
6243             }
6244             break;
6245         case float_round_down:
6246             if (extractFloat128Sign(z)) {
6247                 z.high |= (a.low != 0);
6248                 z.high += roundBitsMask;
6249             }
6250             break;
6251         default:
6252             abort();
6253         }
6254         z.high &= ~ roundBitsMask;
6255     }
6256     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6257         status->float_exception_flags |= float_flag_inexact;
6258     }
6259     return z;
6260 
6261 }
6262 
6263 /*----------------------------------------------------------------------------
6264 | Returns the result of adding the absolute values of the quadruple-precision
6265 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6266 | before being returned.  `zSign' is ignored if the result is a NaN.
6267 | The addition is performed according to the IEC/IEEE Standard for Binary
6268 | Floating-Point Arithmetic.
6269 *----------------------------------------------------------------------------*/
6270 
6271 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6272                                 float_status *status)
6273 {
6274     int32_t aExp, bExp, zExp;
6275     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6276     int32_t expDiff;
6277 
6278     aSig1 = extractFloat128Frac1( a );
6279     aSig0 = extractFloat128Frac0( a );
6280     aExp = extractFloat128Exp( a );
6281     bSig1 = extractFloat128Frac1( b );
6282     bSig0 = extractFloat128Frac0( b );
6283     bExp = extractFloat128Exp( b );
6284     expDiff = aExp - bExp;
6285     if ( 0 < expDiff ) {
6286         if ( aExp == 0x7FFF ) {
6287             if (aSig0 | aSig1) {
6288                 return propagateFloat128NaN(a, b, status);
6289             }
6290             return a;
6291         }
6292         if ( bExp == 0 ) {
6293             --expDiff;
6294         }
6295         else {
6296             bSig0 |= LIT64( 0x0001000000000000 );
6297         }
6298         shift128ExtraRightJamming(
6299             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6300         zExp = aExp;
6301     }
6302     else if ( expDiff < 0 ) {
6303         if ( bExp == 0x7FFF ) {
6304             if (bSig0 | bSig1) {
6305                 return propagateFloat128NaN(a, b, status);
6306             }
6307             return packFloat128( zSign, 0x7FFF, 0, 0 );
6308         }
6309         if ( aExp == 0 ) {
6310             ++expDiff;
6311         }
6312         else {
6313             aSig0 |= LIT64( 0x0001000000000000 );
6314         }
6315         shift128ExtraRightJamming(
6316             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6317         zExp = bExp;
6318     }
6319     else {
6320         if ( aExp == 0x7FFF ) {
6321             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6322                 return propagateFloat128NaN(a, b, status);
6323             }
6324             return a;
6325         }
6326         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6327         if ( aExp == 0 ) {
6328             if (status->flush_to_zero) {
6329                 if (zSig0 | zSig1) {
6330                     float_raise(float_flag_output_denormal, status);
6331                 }
6332                 return packFloat128(zSign, 0, 0, 0);
6333             }
6334             return packFloat128( zSign, 0, zSig0, zSig1 );
6335         }
6336         zSig2 = 0;
6337         zSig0 |= LIT64( 0x0002000000000000 );
6338         zExp = aExp;
6339         goto shiftRight1;
6340     }
6341     aSig0 |= LIT64( 0x0001000000000000 );
6342     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6343     --zExp;
6344     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6345     ++zExp;
6346  shiftRight1:
6347     shift128ExtraRightJamming(
6348         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6349  roundAndPack:
6350     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6351 
6352 }
6353 
6354 /*----------------------------------------------------------------------------
6355 | Returns the result of subtracting the absolute values of the quadruple-
6356 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6357 | difference is negated before being returned.  `zSign' is ignored if the
6358 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6359 | Standard for Binary Floating-Point Arithmetic.
6360 *----------------------------------------------------------------------------*/
6361 
6362 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6363                                 float_status *status)
6364 {
6365     int32_t aExp, bExp, zExp;
6366     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6367     int32_t expDiff;
6368 
6369     aSig1 = extractFloat128Frac1( a );
6370     aSig0 = extractFloat128Frac0( a );
6371     aExp = extractFloat128Exp( a );
6372     bSig1 = extractFloat128Frac1( b );
6373     bSig0 = extractFloat128Frac0( b );
6374     bExp = extractFloat128Exp( b );
6375     expDiff = aExp - bExp;
6376     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6377     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6378     if ( 0 < expDiff ) goto aExpBigger;
6379     if ( expDiff < 0 ) goto bExpBigger;
6380     if ( aExp == 0x7FFF ) {
6381         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6382             return propagateFloat128NaN(a, b, status);
6383         }
6384         float_raise(float_flag_invalid, status);
6385         return float128_default_nan(status);
6386     }
6387     if ( aExp == 0 ) {
6388         aExp = 1;
6389         bExp = 1;
6390     }
6391     if ( bSig0 < aSig0 ) goto aBigger;
6392     if ( aSig0 < bSig0 ) goto bBigger;
6393     if ( bSig1 < aSig1 ) goto aBigger;
6394     if ( aSig1 < bSig1 ) goto bBigger;
6395     return packFloat128(status->float_rounding_mode == float_round_down,
6396                         0, 0, 0);
6397  bExpBigger:
6398     if ( bExp == 0x7FFF ) {
6399         if (bSig0 | bSig1) {
6400             return propagateFloat128NaN(a, b, status);
6401         }
6402         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6403     }
6404     if ( aExp == 0 ) {
6405         ++expDiff;
6406     }
6407     else {
6408         aSig0 |= LIT64( 0x4000000000000000 );
6409     }
6410     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6411     bSig0 |= LIT64( 0x4000000000000000 );
6412  bBigger:
6413     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6414     zExp = bExp;
6415     zSign ^= 1;
6416     goto normalizeRoundAndPack;
6417  aExpBigger:
6418     if ( aExp == 0x7FFF ) {
6419         if (aSig0 | aSig1) {
6420             return propagateFloat128NaN(a, b, status);
6421         }
6422         return a;
6423     }
6424     if ( bExp == 0 ) {
6425         --expDiff;
6426     }
6427     else {
6428         bSig0 |= LIT64( 0x4000000000000000 );
6429     }
6430     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6431     aSig0 |= LIT64( 0x4000000000000000 );
6432  aBigger:
6433     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6434     zExp = aExp;
6435  normalizeRoundAndPack:
6436     --zExp;
6437     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6438                                          status);
6439 
6440 }
6441 
6442 /*----------------------------------------------------------------------------
6443 | Returns the result of adding the quadruple-precision floating-point values
6444 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6445 | for Binary Floating-Point Arithmetic.
6446 *----------------------------------------------------------------------------*/
6447 
6448 float128 float128_add(float128 a, float128 b, float_status *status)
6449 {
6450     flag aSign, bSign;
6451 
6452     aSign = extractFloat128Sign( a );
6453     bSign = extractFloat128Sign( b );
6454     if ( aSign == bSign ) {
6455         return addFloat128Sigs(a, b, aSign, status);
6456     }
6457     else {
6458         return subFloat128Sigs(a, b, aSign, status);
6459     }
6460 
6461 }
6462 
6463 /*----------------------------------------------------------------------------
6464 | Returns the result of subtracting the quadruple-precision floating-point
6465 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6466 | Standard for Binary Floating-Point Arithmetic.
6467 *----------------------------------------------------------------------------*/
6468 
6469 float128 float128_sub(float128 a, float128 b, float_status *status)
6470 {
6471     flag aSign, bSign;
6472 
6473     aSign = extractFloat128Sign( a );
6474     bSign = extractFloat128Sign( b );
6475     if ( aSign == bSign ) {
6476         return subFloat128Sigs(a, b, aSign, status);
6477     }
6478     else {
6479         return addFloat128Sigs(a, b, aSign, status);
6480     }
6481 
6482 }
6483 
6484 /*----------------------------------------------------------------------------
6485 | Returns the result of multiplying the quadruple-precision floating-point
6486 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6487 | Standard for Binary Floating-Point Arithmetic.
6488 *----------------------------------------------------------------------------*/
6489 
6490 float128 float128_mul(float128 a, float128 b, float_status *status)
6491 {
6492     flag aSign, bSign, zSign;
6493     int32_t aExp, bExp, zExp;
6494     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6495 
6496     aSig1 = extractFloat128Frac1( a );
6497     aSig0 = extractFloat128Frac0( a );
6498     aExp = extractFloat128Exp( a );
6499     aSign = extractFloat128Sign( a );
6500     bSig1 = extractFloat128Frac1( b );
6501     bSig0 = extractFloat128Frac0( b );
6502     bExp = extractFloat128Exp( b );
6503     bSign = extractFloat128Sign( b );
6504     zSign = aSign ^ bSign;
6505     if ( aExp == 0x7FFF ) {
6506         if (    ( aSig0 | aSig1 )
6507              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6508             return propagateFloat128NaN(a, b, status);
6509         }
6510         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6511         return packFloat128( zSign, 0x7FFF, 0, 0 );
6512     }
6513     if ( bExp == 0x7FFF ) {
6514         if (bSig0 | bSig1) {
6515             return propagateFloat128NaN(a, b, status);
6516         }
6517         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6518  invalid:
6519             float_raise(float_flag_invalid, status);
6520             return float128_default_nan(status);
6521         }
6522         return packFloat128( zSign, 0x7FFF, 0, 0 );
6523     }
6524     if ( aExp == 0 ) {
6525         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6526         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6527     }
6528     if ( bExp == 0 ) {
6529         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6530         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6531     }
6532     zExp = aExp + bExp - 0x4000;
6533     aSig0 |= LIT64( 0x0001000000000000 );
6534     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6535     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6536     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6537     zSig2 |= ( zSig3 != 0 );
6538     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6539         shift128ExtraRightJamming(
6540             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6541         ++zExp;
6542     }
6543     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6544 
6545 }
6546 
6547 /*----------------------------------------------------------------------------
6548 | Returns the result of dividing the quadruple-precision floating-point value
6549 | `a' by the corresponding value `b'.  The operation is performed according to
6550 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6551 *----------------------------------------------------------------------------*/
6552 
6553 float128 float128_div(float128 a, float128 b, float_status *status)
6554 {
6555     flag aSign, bSign, zSign;
6556     int32_t aExp, bExp, zExp;
6557     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6558     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6559 
6560     aSig1 = extractFloat128Frac1( a );
6561     aSig0 = extractFloat128Frac0( a );
6562     aExp = extractFloat128Exp( a );
6563     aSign = extractFloat128Sign( a );
6564     bSig1 = extractFloat128Frac1( b );
6565     bSig0 = extractFloat128Frac0( b );
6566     bExp = extractFloat128Exp( b );
6567     bSign = extractFloat128Sign( b );
6568     zSign = aSign ^ bSign;
6569     if ( aExp == 0x7FFF ) {
6570         if (aSig0 | aSig1) {
6571             return propagateFloat128NaN(a, b, status);
6572         }
6573         if ( bExp == 0x7FFF ) {
6574             if (bSig0 | bSig1) {
6575                 return propagateFloat128NaN(a, b, status);
6576             }
6577             goto invalid;
6578         }
6579         return packFloat128( zSign, 0x7FFF, 0, 0 );
6580     }
6581     if ( bExp == 0x7FFF ) {
6582         if (bSig0 | bSig1) {
6583             return propagateFloat128NaN(a, b, status);
6584         }
6585         return packFloat128( zSign, 0, 0, 0 );
6586     }
6587     if ( bExp == 0 ) {
6588         if ( ( bSig0 | bSig1 ) == 0 ) {
6589             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6590  invalid:
6591                 float_raise(float_flag_invalid, status);
6592                 return float128_default_nan(status);
6593             }
6594             float_raise(float_flag_divbyzero, status);
6595             return packFloat128( zSign, 0x7FFF, 0, 0 );
6596         }
6597         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6598     }
6599     if ( aExp == 0 ) {
6600         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6601         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6602     }
6603     zExp = aExp - bExp + 0x3FFD;
6604     shortShift128Left(
6605         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6606     shortShift128Left(
6607         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6608     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6609         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6610         ++zExp;
6611     }
6612     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6613     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6614     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6615     while ( (int64_t) rem0 < 0 ) {
6616         --zSig0;
6617         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6618     }
6619     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6620     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6621         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6622         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6623         while ( (int64_t) rem1 < 0 ) {
6624             --zSig1;
6625             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6626         }
6627         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6628     }
6629     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6630     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6631 
6632 }
6633 
6634 /*----------------------------------------------------------------------------
6635 | Returns the remainder of the quadruple-precision floating-point value `a'
6636 | with respect to the corresponding value `b'.  The operation is performed
6637 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6638 *----------------------------------------------------------------------------*/
6639 
6640 float128 float128_rem(float128 a, float128 b, float_status *status)
6641 {
6642     flag aSign, zSign;
6643     int32_t aExp, bExp, expDiff;
6644     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6645     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6646     int64_t sigMean0;
6647 
6648     aSig1 = extractFloat128Frac1( a );
6649     aSig0 = extractFloat128Frac0( a );
6650     aExp = extractFloat128Exp( a );
6651     aSign = extractFloat128Sign( a );
6652     bSig1 = extractFloat128Frac1( b );
6653     bSig0 = extractFloat128Frac0( b );
6654     bExp = extractFloat128Exp( b );
6655     if ( aExp == 0x7FFF ) {
6656         if (    ( aSig0 | aSig1 )
6657              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6658             return propagateFloat128NaN(a, b, status);
6659         }
6660         goto invalid;
6661     }
6662     if ( bExp == 0x7FFF ) {
6663         if (bSig0 | bSig1) {
6664             return propagateFloat128NaN(a, b, status);
6665         }
6666         return a;
6667     }
6668     if ( bExp == 0 ) {
6669         if ( ( bSig0 | bSig1 ) == 0 ) {
6670  invalid:
6671             float_raise(float_flag_invalid, status);
6672             return float128_default_nan(status);
6673         }
6674         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6675     }
6676     if ( aExp == 0 ) {
6677         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6678         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6679     }
6680     expDiff = aExp - bExp;
6681     if ( expDiff < -1 ) return a;
6682     shortShift128Left(
6683         aSig0 | LIT64( 0x0001000000000000 ),
6684         aSig1,
6685         15 - ( expDiff < 0 ),
6686         &aSig0,
6687         &aSig1
6688     );
6689     shortShift128Left(
6690         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6691     q = le128( bSig0, bSig1, aSig0, aSig1 );
6692     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6693     expDiff -= 64;
6694     while ( 0 < expDiff ) {
6695         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6696         q = ( 4 < q ) ? q - 4 : 0;
6697         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6698         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6699         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6700         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6701         expDiff -= 61;
6702     }
6703     if ( -64 < expDiff ) {
6704         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6705         q = ( 4 < q ) ? q - 4 : 0;
6706         q >>= - expDiff;
6707         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6708         expDiff += 52;
6709         if ( expDiff < 0 ) {
6710             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6711         }
6712         else {
6713             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6714         }
6715         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6716         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6717     }
6718     else {
6719         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6720         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6721     }
6722     do {
6723         alternateASig0 = aSig0;
6724         alternateASig1 = aSig1;
6725         ++q;
6726         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6727     } while ( 0 <= (int64_t) aSig0 );
6728     add128(
6729         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6730     if (    ( sigMean0 < 0 )
6731          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6732         aSig0 = alternateASig0;
6733         aSig1 = alternateASig1;
6734     }
6735     zSign = ( (int64_t) aSig0 < 0 );
6736     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6737     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6738                                          status);
6739 }
6740 
6741 /*----------------------------------------------------------------------------
6742 | Returns the square root of the quadruple-precision floating-point value `a'.
6743 | The operation is performed according to the IEC/IEEE Standard for Binary
6744 | Floating-Point Arithmetic.
6745 *----------------------------------------------------------------------------*/
6746 
6747 float128 float128_sqrt(float128 a, float_status *status)
6748 {
6749     flag aSign;
6750     int32_t aExp, zExp;
6751     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6752     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6753 
6754     aSig1 = extractFloat128Frac1( a );
6755     aSig0 = extractFloat128Frac0( a );
6756     aExp = extractFloat128Exp( a );
6757     aSign = extractFloat128Sign( a );
6758     if ( aExp == 0x7FFF ) {
6759         if (aSig0 | aSig1) {
6760             return propagateFloat128NaN(a, a, status);
6761         }
6762         if ( ! aSign ) return a;
6763         goto invalid;
6764     }
6765     if ( aSign ) {
6766         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6767  invalid:
6768         float_raise(float_flag_invalid, status);
6769         return float128_default_nan(status);
6770     }
6771     if ( aExp == 0 ) {
6772         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6773         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6774     }
6775     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6776     aSig0 |= LIT64( 0x0001000000000000 );
6777     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6778     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6779     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6780     doubleZSig0 = zSig0<<1;
6781     mul64To128( zSig0, zSig0, &term0, &term1 );
6782     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6783     while ( (int64_t) rem0 < 0 ) {
6784         --zSig0;
6785         doubleZSig0 -= 2;
6786         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6787     }
6788     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6789     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6790         if ( zSig1 == 0 ) zSig1 = 1;
6791         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6792         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6793         mul64To128( zSig1, zSig1, &term2, &term3 );
6794         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6795         while ( (int64_t) rem1 < 0 ) {
6796             --zSig1;
6797             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6798             term3 |= 1;
6799             term2 |= doubleZSig0;
6800             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6801         }
6802         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6803     }
6804     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6805     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6806 
6807 }
6808 
6809 /*----------------------------------------------------------------------------
6810 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6811 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6812 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6813 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6814 *----------------------------------------------------------------------------*/
6815 
6816 int float128_eq(float128 a, float128 b, float_status *status)
6817 {
6818 
6819     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6820               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6821          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6822               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6823        ) {
6824         float_raise(float_flag_invalid, status);
6825         return 0;
6826     }
6827     return
6828            ( a.low == b.low )
6829         && (    ( a.high == b.high )
6830              || (    ( a.low == 0 )
6831                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6832            );
6833 
6834 }
6835 
6836 /*----------------------------------------------------------------------------
6837 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6838 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6839 | exception is raised if either operand is a NaN.  The comparison is performed
6840 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6841 *----------------------------------------------------------------------------*/
6842 
6843 int float128_le(float128 a, float128 b, float_status *status)
6844 {
6845     flag aSign, bSign;
6846 
6847     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6848               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6849          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6850               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6851        ) {
6852         float_raise(float_flag_invalid, status);
6853         return 0;
6854     }
6855     aSign = extractFloat128Sign( a );
6856     bSign = extractFloat128Sign( b );
6857     if ( aSign != bSign ) {
6858         return
6859                aSign
6860             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6861                  == 0 );
6862     }
6863     return
6864           aSign ? le128( b.high, b.low, a.high, a.low )
6865         : le128( a.high, a.low, b.high, b.low );
6866 
6867 }
6868 
6869 /*----------------------------------------------------------------------------
6870 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6871 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6872 | raised if either operand is a NaN.  The comparison is performed according
6873 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6874 *----------------------------------------------------------------------------*/
6875 
6876 int float128_lt(float128 a, float128 b, float_status *status)
6877 {
6878     flag aSign, bSign;
6879 
6880     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6881               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6882          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6883               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6884        ) {
6885         float_raise(float_flag_invalid, status);
6886         return 0;
6887     }
6888     aSign = extractFloat128Sign( a );
6889     bSign = extractFloat128Sign( b );
6890     if ( aSign != bSign ) {
6891         return
6892                aSign
6893             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6894                  != 0 );
6895     }
6896     return
6897           aSign ? lt128( b.high, b.low, a.high, a.low )
6898         : lt128( a.high, a.low, b.high, b.low );
6899 
6900 }
6901 
6902 /*----------------------------------------------------------------------------
6903 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6904 | be compared, and 0 otherwise.  The invalid exception is raised if either
6905 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6906 | Standard for Binary Floating-Point Arithmetic.
6907 *----------------------------------------------------------------------------*/
6908 
6909 int float128_unordered(float128 a, float128 b, float_status *status)
6910 {
6911     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6912               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6913          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6914               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6915        ) {
6916         float_raise(float_flag_invalid, status);
6917         return 1;
6918     }
6919     return 0;
6920 }
6921 
6922 /*----------------------------------------------------------------------------
6923 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6924 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6925 | exception.  The comparison is performed according to the IEC/IEEE Standard
6926 | for Binary Floating-Point Arithmetic.
6927 *----------------------------------------------------------------------------*/
6928 
6929 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6930 {
6931 
6932     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6933               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6934          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6935               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6936        ) {
6937         if (float128_is_signaling_nan(a, status)
6938          || float128_is_signaling_nan(b, status)) {
6939             float_raise(float_flag_invalid, status);
6940         }
6941         return 0;
6942     }
6943     return
6944            ( a.low == b.low )
6945         && (    ( a.high == b.high )
6946              || (    ( a.low == 0 )
6947                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6948            );
6949 
6950 }
6951 
6952 /*----------------------------------------------------------------------------
6953 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6954 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6955 | cause an exception.  Otherwise, the comparison is performed according to the
6956 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6957 *----------------------------------------------------------------------------*/
6958 
6959 int float128_le_quiet(float128 a, float128 b, float_status *status)
6960 {
6961     flag aSign, bSign;
6962 
6963     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6964               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6965          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6966               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6967        ) {
6968         if (float128_is_signaling_nan(a, status)
6969          || float128_is_signaling_nan(b, status)) {
6970             float_raise(float_flag_invalid, status);
6971         }
6972         return 0;
6973     }
6974     aSign = extractFloat128Sign( a );
6975     bSign = extractFloat128Sign( b );
6976     if ( aSign != bSign ) {
6977         return
6978                aSign
6979             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6980                  == 0 );
6981     }
6982     return
6983           aSign ? le128( b.high, b.low, a.high, a.low )
6984         : le128( a.high, a.low, b.high, b.low );
6985 
6986 }
6987 
6988 /*----------------------------------------------------------------------------
6989 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6990 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6991 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6992 | Standard for Binary Floating-Point Arithmetic.
6993 *----------------------------------------------------------------------------*/
6994 
6995 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6996 {
6997     flag aSign, bSign;
6998 
6999     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7000               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7001          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7002               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7003        ) {
7004         if (float128_is_signaling_nan(a, status)
7005          || float128_is_signaling_nan(b, status)) {
7006             float_raise(float_flag_invalid, status);
7007         }
7008         return 0;
7009     }
7010     aSign = extractFloat128Sign( a );
7011     bSign = extractFloat128Sign( b );
7012     if ( aSign != bSign ) {
7013         return
7014                aSign
7015             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7016                  != 0 );
7017     }
7018     return
7019           aSign ? lt128( b.high, b.low, a.high, a.low )
7020         : lt128( a.high, a.low, b.high, b.low );
7021 
7022 }
7023 
7024 /*----------------------------------------------------------------------------
7025 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7026 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7027 | comparison is performed according to the IEC/IEEE Standard for Binary
7028 | Floating-Point Arithmetic.
7029 *----------------------------------------------------------------------------*/
7030 
7031 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7032 {
7033     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7034               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7035          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7036               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7037        ) {
7038         if (float128_is_signaling_nan(a, status)
7039          || float128_is_signaling_nan(b, status)) {
7040             float_raise(float_flag_invalid, status);
7041         }
7042         return 1;
7043     }
7044     return 0;
7045 }
7046 
7047 /* misc functions */
7048 float32 uint32_to_float32(uint32_t a, float_status *status)
7049 {
7050     return int64_to_float32(a, status);
7051 }
7052 
7053 float64 uint32_to_float64(uint32_t a, float_status *status)
7054 {
7055     return int64_to_float64(a, status);
7056 }
7057 
7058 uint32_t float32_to_uint32(float32 a, float_status *status)
7059 {
7060     int64_t v;
7061     uint32_t res;
7062     int old_exc_flags = get_float_exception_flags(status);
7063 
7064     v = float32_to_int64(a, status);
7065     if (v < 0) {
7066         res = 0;
7067     } else if (v > 0xffffffff) {
7068         res = 0xffffffff;
7069     } else {
7070         return v;
7071     }
7072     set_float_exception_flags(old_exc_flags, status);
7073     float_raise(float_flag_invalid, status);
7074     return res;
7075 }
7076 
7077 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status)
7078 {
7079     int64_t v;
7080     uint32_t res;
7081     int old_exc_flags = get_float_exception_flags(status);
7082 
7083     v = float32_to_int64_round_to_zero(a, status);
7084     if (v < 0) {
7085         res = 0;
7086     } else if (v > 0xffffffff) {
7087         res = 0xffffffff;
7088     } else {
7089         return v;
7090     }
7091     set_float_exception_flags(old_exc_flags, status);
7092     float_raise(float_flag_invalid, status);
7093     return res;
7094 }
7095 
7096 int16_t float32_to_int16(float32 a, float_status *status)
7097 {
7098     int32_t v;
7099     int16_t res;
7100     int old_exc_flags = get_float_exception_flags(status);
7101 
7102     v = float32_to_int32(a, status);
7103     if (v < -0x8000) {
7104         res = -0x8000;
7105     } else if (v > 0x7fff) {
7106         res = 0x7fff;
7107     } else {
7108         return v;
7109     }
7110 
7111     set_float_exception_flags(old_exc_flags, status);
7112     float_raise(float_flag_invalid, status);
7113     return res;
7114 }
7115 
7116 uint16_t float32_to_uint16(float32 a, float_status *status)
7117 {
7118     int32_t v;
7119     uint16_t res;
7120     int old_exc_flags = get_float_exception_flags(status);
7121 
7122     v = float32_to_int32(a, status);
7123     if (v < 0) {
7124         res = 0;
7125     } else if (v > 0xffff) {
7126         res = 0xffff;
7127     } else {
7128         return v;
7129     }
7130 
7131     set_float_exception_flags(old_exc_flags, status);
7132     float_raise(float_flag_invalid, status);
7133     return res;
7134 }
7135 
7136 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
7137 {
7138     int64_t v;
7139     uint16_t res;
7140     int old_exc_flags = get_float_exception_flags(status);
7141 
7142     v = float32_to_int64_round_to_zero(a, status);
7143     if (v < 0) {
7144         res = 0;
7145     } else if (v > 0xffff) {
7146         res = 0xffff;
7147     } else {
7148         return v;
7149     }
7150     set_float_exception_flags(old_exc_flags, status);
7151     float_raise(float_flag_invalid, status);
7152     return res;
7153 }
7154 
7155 uint32_t float64_to_uint32(float64 a, float_status *status)
7156 {
7157     uint64_t v;
7158     uint32_t res;
7159     int old_exc_flags = get_float_exception_flags(status);
7160 
7161     v = float64_to_uint64(a, status);
7162     if (v > 0xffffffff) {
7163         res = 0xffffffff;
7164     } else {
7165         return v;
7166     }
7167     set_float_exception_flags(old_exc_flags, status);
7168     float_raise(float_flag_invalid, status);
7169     return res;
7170 }
7171 
7172 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status)
7173 {
7174     uint64_t v;
7175     uint32_t res;
7176     int old_exc_flags = get_float_exception_flags(status);
7177 
7178     v = float64_to_uint64_round_to_zero(a, status);
7179     if (v > 0xffffffff) {
7180         res = 0xffffffff;
7181     } else {
7182         return v;
7183     }
7184     set_float_exception_flags(old_exc_flags, status);
7185     float_raise(float_flag_invalid, status);
7186     return res;
7187 }
7188 
7189 int16_t float64_to_int16(float64 a, float_status *status)
7190 {
7191     int64_t v;
7192     int16_t res;
7193     int old_exc_flags = get_float_exception_flags(status);
7194 
7195     v = float64_to_int32(a, status);
7196     if (v < -0x8000) {
7197         res = -0x8000;
7198     } else if (v > 0x7fff) {
7199         res = 0x7fff;
7200     } else {
7201         return v;
7202     }
7203 
7204     set_float_exception_flags(old_exc_flags, status);
7205     float_raise(float_flag_invalid, status);
7206     return res;
7207 }
7208 
7209 uint16_t float64_to_uint16(float64 a, float_status *status)
7210 {
7211     int64_t v;
7212     uint16_t res;
7213     int old_exc_flags = get_float_exception_flags(status);
7214 
7215     v = float64_to_int32(a, status);
7216     if (v < 0) {
7217         res = 0;
7218     } else if (v > 0xffff) {
7219         res = 0xffff;
7220     } else {
7221         return v;
7222     }
7223 
7224     set_float_exception_flags(old_exc_flags, status);
7225     float_raise(float_flag_invalid, status);
7226     return res;
7227 }
7228 
7229 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
7230 {
7231     int64_t v;
7232     uint16_t res;
7233     int old_exc_flags = get_float_exception_flags(status);
7234 
7235     v = float64_to_int64_round_to_zero(a, status);
7236     if (v < 0) {
7237         res = 0;
7238     } else if (v > 0xffff) {
7239         res = 0xffff;
7240     } else {
7241         return v;
7242     }
7243     set_float_exception_flags(old_exc_flags, status);
7244     float_raise(float_flag_invalid, status);
7245     return res;
7246 }
7247 
7248 /*----------------------------------------------------------------------------
7249 | Returns the result of converting the double-precision floating-point value
7250 | `a' to the 64-bit unsigned integer format.  The conversion is
7251 | performed according to the IEC/IEEE Standard for Binary Floating-Point
7252 | Arithmetic---which means in particular that the conversion is rounded
7253 | according to the current rounding mode.  If `a' is a NaN, the largest
7254 | positive integer is returned.  If the conversion overflows, the
7255 | largest unsigned integer is returned.  If 'a' is negative, the value is
7256 | rounded and zero is returned; negative values that do not round to zero
7257 | will raise the inexact exception.
7258 *----------------------------------------------------------------------------*/
7259 
7260 uint64_t float64_to_uint64(float64 a, float_status *status)
7261 {
7262     flag aSign;
7263     int aExp;
7264     int shiftCount;
7265     uint64_t aSig, aSigExtra;
7266     a = float64_squash_input_denormal(a, status);
7267 
7268     aSig = extractFloat64Frac(a);
7269     aExp = extractFloat64Exp(a);
7270     aSign = extractFloat64Sign(a);
7271     if (aSign && (aExp > 1022)) {
7272         float_raise(float_flag_invalid, status);
7273         if (float64_is_any_nan(a)) {
7274             return LIT64(0xFFFFFFFFFFFFFFFF);
7275         } else {
7276             return 0;
7277         }
7278     }
7279     if (aExp) {
7280         aSig |= LIT64(0x0010000000000000);
7281     }
7282     shiftCount = 0x433 - aExp;
7283     if (shiftCount <= 0) {
7284         if (0x43E < aExp) {
7285             float_raise(float_flag_invalid, status);
7286             return LIT64(0xFFFFFFFFFFFFFFFF);
7287         }
7288         aSigExtra = 0;
7289         aSig <<= -shiftCount;
7290     } else {
7291         shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7292     }
7293     return roundAndPackUint64(aSign, aSig, aSigExtra, status);
7294 }
7295 
7296 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
7297 {
7298     signed char current_rounding_mode = status->float_rounding_mode;
7299     set_float_rounding_mode(float_round_to_zero, status);
7300     uint64_t v = float64_to_uint64(a, status);
7301     set_float_rounding_mode(current_rounding_mode, status);
7302     return v;
7303 }
7304 
7305 #define COMPARE(s, nan_exp)                                                  \
7306 static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7307                                       int is_quiet, float_status *status)    \
7308 {                                                                            \
7309     flag aSign, bSign;                                                       \
7310     uint ## s ## _t av, bv;                                                  \
7311     a = float ## s ## _squash_input_denormal(a, status);                     \
7312     b = float ## s ## _squash_input_denormal(b, status);                     \
7313                                                                              \
7314     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
7315          extractFloat ## s ## Frac( a ) ) ||                                 \
7316         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
7317           extractFloat ## s ## Frac( b ) )) {                                \
7318         if (!is_quiet ||                                                     \
7319             float ## s ## _is_signaling_nan(a, status) ||                  \
7320             float ## s ## _is_signaling_nan(b, status)) {                 \
7321             float_raise(float_flag_invalid, status);                         \
7322         }                                                                    \
7323         return float_relation_unordered;                                     \
7324     }                                                                        \
7325     aSign = extractFloat ## s ## Sign( a );                                  \
7326     bSign = extractFloat ## s ## Sign( b );                                  \
7327     av = float ## s ## _val(a);                                              \
7328     bv = float ## s ## _val(b);                                              \
7329     if ( aSign != bSign ) {                                                  \
7330         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
7331             /* zero case */                                                  \
7332             return float_relation_equal;                                     \
7333         } else {                                                             \
7334             return 1 - (2 * aSign);                                          \
7335         }                                                                    \
7336     } else {                                                                 \
7337         if (av == bv) {                                                      \
7338             return float_relation_equal;                                     \
7339         } else {                                                             \
7340             return 1 - 2 * (aSign ^ ( av < bv ));                            \
7341         }                                                                    \
7342     }                                                                        \
7343 }                                                                            \
7344                                                                              \
7345 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
7346 {                                                                            \
7347     return float ## s ## _compare_internal(a, b, 0, status);                 \
7348 }                                                                            \
7349                                                                              \
7350 int float ## s ## _compare_quiet(float ## s a, float ## s b,                 \
7351                                  float_status *status)                       \
7352 {                                                                            \
7353     return float ## s ## _compare_internal(a, b, 1, status);                 \
7354 }
7355 
7356 COMPARE(32, 0xff)
7357 COMPARE(64, 0x7ff)
7358 
7359 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7360                                             int is_quiet, float_status *status)
7361 {
7362     flag aSign, bSign;
7363 
7364     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7365         float_raise(float_flag_invalid, status);
7366         return float_relation_unordered;
7367     }
7368     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7369           ( extractFloatx80Frac( a )<<1 ) ) ||
7370         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7371           ( extractFloatx80Frac( b )<<1 ) )) {
7372         if (!is_quiet ||
7373             floatx80_is_signaling_nan(a, status) ||
7374             floatx80_is_signaling_nan(b, status)) {
7375             float_raise(float_flag_invalid, status);
7376         }
7377         return float_relation_unordered;
7378     }
7379     aSign = extractFloatx80Sign( a );
7380     bSign = extractFloatx80Sign( b );
7381     if ( aSign != bSign ) {
7382 
7383         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7384              ( ( a.low | b.low ) == 0 ) ) {
7385             /* zero case */
7386             return float_relation_equal;
7387         } else {
7388             return 1 - (2 * aSign);
7389         }
7390     } else {
7391         if (a.low == b.low && a.high == b.high) {
7392             return float_relation_equal;
7393         } else {
7394             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7395         }
7396     }
7397 }
7398 
7399 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7400 {
7401     return floatx80_compare_internal(a, b, 0, status);
7402 }
7403 
7404 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7405 {
7406     return floatx80_compare_internal(a, b, 1, status);
7407 }
7408 
7409 static inline int float128_compare_internal(float128 a, float128 b,
7410                                             int is_quiet, float_status *status)
7411 {
7412     flag aSign, bSign;
7413 
7414     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7415           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7416         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7417           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7418         if (!is_quiet ||
7419             float128_is_signaling_nan(a, status) ||
7420             float128_is_signaling_nan(b, status)) {
7421             float_raise(float_flag_invalid, status);
7422         }
7423         return float_relation_unordered;
7424     }
7425     aSign = extractFloat128Sign( a );
7426     bSign = extractFloat128Sign( b );
7427     if ( aSign != bSign ) {
7428         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7429             /* zero case */
7430             return float_relation_equal;
7431         } else {
7432             return 1 - (2 * aSign);
7433         }
7434     } else {
7435         if (a.low == b.low && a.high == b.high) {
7436             return float_relation_equal;
7437         } else {
7438             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7439         }
7440     }
7441 }
7442 
7443 int float128_compare(float128 a, float128 b, float_status *status)
7444 {
7445     return float128_compare_internal(a, b, 0, status);
7446 }
7447 
7448 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7449 {
7450     return float128_compare_internal(a, b, 1, status);
7451 }
7452 
7453 /* min() and max() functions. These can't be implemented as
7454  * 'compare and pick one input' because that would mishandle
7455  * NaNs and +0 vs -0.
7456  *
7457  * minnum() and maxnum() functions. These are similar to the min()
7458  * and max() functions but if one of the arguments is a QNaN and
7459  * the other is numerical then the numerical argument is returned.
7460  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7461  * and maxNum() operations. min() and max() are the typical min/max
7462  * semantics provided by many CPUs which predate that specification.
7463  *
7464  * minnummag() and maxnummag() functions correspond to minNumMag()
7465  * and maxNumMag() from IEEE 754-2008.
7466  */
7467 #define MINMAX(s)                                                       \
7468 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
7469                                                int ismin, int isieee,   \
7470                                                int ismag,               \
7471                                                float_status *status)    \
7472 {                                                                       \
7473     flag aSign, bSign;                                                  \
7474     uint ## s ## _t av, bv, aav, abv;                                   \
7475     a = float ## s ## _squash_input_denormal(a, status);                \
7476     b = float ## s ## _squash_input_denormal(b, status);                \
7477     if (float ## s ## _is_any_nan(a) ||                                 \
7478         float ## s ## _is_any_nan(b)) {                                 \
7479         if (isieee) {                                                   \
7480             if (float ## s ## _is_quiet_nan(a, status) &&               \
7481                 !float ## s ##_is_any_nan(b)) {                         \
7482                 return b;                                               \
7483             } else if (float ## s ## _is_quiet_nan(b, status) &&        \
7484                        !float ## s ## _is_any_nan(a)) {                \
7485                 return a;                                               \
7486             }                                                           \
7487         }                                                               \
7488         return propagateFloat ## s ## NaN(a, b, status);                \
7489     }                                                                   \
7490     aSign = extractFloat ## s ## Sign(a);                               \
7491     bSign = extractFloat ## s ## Sign(b);                               \
7492     av = float ## s ## _val(a);                                         \
7493     bv = float ## s ## _val(b);                                         \
7494     if (ismag) {                                                        \
7495         aav = float ## s ## _abs(av);                                   \
7496         abv = float ## s ## _abs(bv);                                   \
7497         if (aav != abv) {                                               \
7498             if (ismin) {                                                \
7499                 return (aav < abv) ? a : b;                             \
7500             } else {                                                    \
7501                 return (aav < abv) ? b : a;                             \
7502             }                                                           \
7503         }                                                               \
7504     }                                                                   \
7505     if (aSign != bSign) {                                               \
7506         if (ismin) {                                                    \
7507             return aSign ? a : b;                                       \
7508         } else {                                                        \
7509             return aSign ? b : a;                                       \
7510         }                                                               \
7511     } else {                                                            \
7512         if (ismin) {                                                    \
7513             return (aSign ^ (av < bv)) ? a : b;                         \
7514         } else {                                                        \
7515             return (aSign ^ (av < bv)) ? b : a;                         \
7516         }                                                               \
7517     }                                                                   \
7518 }                                                                       \
7519                                                                         \
7520 float ## s float ## s ## _min(float ## s a, float ## s b,               \
7521                               float_status *status)                     \
7522 {                                                                       \
7523     return float ## s ## _minmax(a, b, 1, 0, 0, status);                \
7524 }                                                                       \
7525                                                                         \
7526 float ## s float ## s ## _max(float ## s a, float ## s b,               \
7527                               float_status *status)                     \
7528 {                                                                       \
7529     return float ## s ## _minmax(a, b, 0, 0, 0, status);                \
7530 }                                                                       \
7531                                                                         \
7532 float ## s float ## s ## _minnum(float ## s a, float ## s b,            \
7533                                  float_status *status)                  \
7534 {                                                                       \
7535     return float ## s ## _minmax(a, b, 1, 1, 0, status);                \
7536 }                                                                       \
7537                                                                         \
7538 float ## s float ## s ## _maxnum(float ## s a, float ## s b,            \
7539                                  float_status *status)                  \
7540 {                                                                       \
7541     return float ## s ## _minmax(a, b, 0, 1, 0, status);                \
7542 }                                                                       \
7543                                                                         \
7544 float ## s float ## s ## _minnummag(float ## s a, float ## s b,         \
7545                                     float_status *status)               \
7546 {                                                                       \
7547     return float ## s ## _minmax(a, b, 1, 1, 1, status);                \
7548 }                                                                       \
7549                                                                         \
7550 float ## s float ## s ## _maxnummag(float ## s a, float ## s b,         \
7551                                     float_status *status)               \
7552 {                                                                       \
7553     return float ## s ## _minmax(a, b, 0, 1, 1, status);                \
7554 }
7555 
7556 MINMAX(32)
7557 MINMAX(64)
7558 
7559 
7560 /* Multiply A by 2 raised to the power N.  */
7561 float32 float32_scalbn(float32 a, int n, float_status *status)
7562 {
7563     flag aSign;
7564     int16_t aExp;
7565     uint32_t aSig;
7566 
7567     a = float32_squash_input_denormal(a, status);
7568     aSig = extractFloat32Frac( a );
7569     aExp = extractFloat32Exp( a );
7570     aSign = extractFloat32Sign( a );
7571 
7572     if ( aExp == 0xFF ) {
7573         if ( aSig ) {
7574             return propagateFloat32NaN(a, a, status);
7575         }
7576         return a;
7577     }
7578     if (aExp != 0) {
7579         aSig |= 0x00800000;
7580     } else if (aSig == 0) {
7581         return a;
7582     } else {
7583         aExp++;
7584     }
7585 
7586     if (n > 0x200) {
7587         n = 0x200;
7588     } else if (n < -0x200) {
7589         n = -0x200;
7590     }
7591 
7592     aExp += n - 1;
7593     aSig <<= 7;
7594     return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
7595 }
7596 
7597 float64 float64_scalbn(float64 a, int n, float_status *status)
7598 {
7599     flag aSign;
7600     int16_t aExp;
7601     uint64_t aSig;
7602 
7603     a = float64_squash_input_denormal(a, status);
7604     aSig = extractFloat64Frac( a );
7605     aExp = extractFloat64Exp( a );
7606     aSign = extractFloat64Sign( a );
7607 
7608     if ( aExp == 0x7FF ) {
7609         if ( aSig ) {
7610             return propagateFloat64NaN(a, a, status);
7611         }
7612         return a;
7613     }
7614     if (aExp != 0) {
7615         aSig |= LIT64( 0x0010000000000000 );
7616     } else if (aSig == 0) {
7617         return a;
7618     } else {
7619         aExp++;
7620     }
7621 
7622     if (n > 0x1000) {
7623         n = 0x1000;
7624     } else if (n < -0x1000) {
7625         n = -0x1000;
7626     }
7627 
7628     aExp += n - 1;
7629     aSig <<= 10;
7630     return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
7631 }
7632 
7633 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7634 {
7635     flag aSign;
7636     int32_t aExp;
7637     uint64_t aSig;
7638 
7639     if (floatx80_invalid_encoding(a)) {
7640         float_raise(float_flag_invalid, status);
7641         return floatx80_default_nan(status);
7642     }
7643     aSig = extractFloatx80Frac( a );
7644     aExp = extractFloatx80Exp( a );
7645     aSign = extractFloatx80Sign( a );
7646 
7647     if ( aExp == 0x7FFF ) {
7648         if ( aSig<<1 ) {
7649             return propagateFloatx80NaN(a, a, status);
7650         }
7651         return a;
7652     }
7653 
7654     if (aExp == 0) {
7655         if (aSig == 0) {
7656             return a;
7657         }
7658         aExp++;
7659     }
7660 
7661     if (n > 0x10000) {
7662         n = 0x10000;
7663     } else if (n < -0x10000) {
7664         n = -0x10000;
7665     }
7666 
7667     aExp += n;
7668     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7669                                          aSign, aExp, aSig, 0, status);
7670 }
7671 
7672 float128 float128_scalbn(float128 a, int n, float_status *status)
7673 {
7674     flag aSign;
7675     int32_t aExp;
7676     uint64_t aSig0, aSig1;
7677 
7678     aSig1 = extractFloat128Frac1( a );
7679     aSig0 = extractFloat128Frac0( a );
7680     aExp = extractFloat128Exp( a );
7681     aSign = extractFloat128Sign( a );
7682     if ( aExp == 0x7FFF ) {
7683         if ( aSig0 | aSig1 ) {
7684             return propagateFloat128NaN(a, a, status);
7685         }
7686         return a;
7687     }
7688     if (aExp != 0) {
7689         aSig0 |= LIT64( 0x0001000000000000 );
7690     } else if (aSig0 == 0 && aSig1 == 0) {
7691         return a;
7692     } else {
7693         aExp++;
7694     }
7695 
7696     if (n > 0x10000) {
7697         n = 0x10000;
7698     } else if (n < -0x10000) {
7699         n = -0x10000;
7700     }
7701 
7702     aExp += n - 1;
7703     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7704                                          , status);
7705 
7706 }
7707