xref: /qemu/fpu/softfloat.c (revision c02e1fb80b553d47420f7492de4bc590c2461a86)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
88 
89 /* We only need stdlib for abort() */
90 
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations.  (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "softfloat-macros.h"
97 
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine:  (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output.  These details are target-
104 | specific.
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
107 
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
111 
112 static inline uint32_t extractFloat16Frac(float16 a)
113 {
114     return float16_val(a) & 0x3ff;
115 }
116 
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
120 
121 static inline int extractFloat16Exp(float16 a)
122 {
123     return (float16_val(a) >> 10) & 0x1f;
124 }
125 
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
129 
130 static inline flag extractFloat16Sign(float16 a)
131 {
132     return float16_val(a)>>15;
133 }
134 
135 /*----------------------------------------------------------------------------
136 | Returns the fraction bits of the single-precision floating-point value `a'.
137 *----------------------------------------------------------------------------*/
138 
139 static inline uint32_t extractFloat32Frac(float32 a)
140 {
141     return float32_val(a) & 0x007FFFFF;
142 }
143 
144 /*----------------------------------------------------------------------------
145 | Returns the exponent bits of the single-precision floating-point value `a'.
146 *----------------------------------------------------------------------------*/
147 
148 static inline int extractFloat32Exp(float32 a)
149 {
150     return (float32_val(a) >> 23) & 0xFF;
151 }
152 
153 /*----------------------------------------------------------------------------
154 | Returns the sign bit of the single-precision floating-point value `a'.
155 *----------------------------------------------------------------------------*/
156 
157 static inline flag extractFloat32Sign(float32 a)
158 {
159     return float32_val(a) >> 31;
160 }
161 
162 /*----------------------------------------------------------------------------
163 | Returns the fraction bits of the double-precision floating-point value `a'.
164 *----------------------------------------------------------------------------*/
165 
166 static inline uint64_t extractFloat64Frac(float64 a)
167 {
168     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
169 }
170 
171 /*----------------------------------------------------------------------------
172 | Returns the exponent bits of the double-precision floating-point value `a'.
173 *----------------------------------------------------------------------------*/
174 
175 static inline int extractFloat64Exp(float64 a)
176 {
177     return (float64_val(a) >> 52) & 0x7FF;
178 }
179 
180 /*----------------------------------------------------------------------------
181 | Returns the sign bit of the double-precision floating-point value `a'.
182 *----------------------------------------------------------------------------*/
183 
184 static inline flag extractFloat64Sign(float64 a)
185 {
186     return float64_val(a) >> 63;
187 }
188 
/*
 * Classification of a decomposed floating point number.  The NaN kinds
 * are deliberately placed last: every enumerator from float_class_qnan
 * onwards is some kind of NaN, so "cls >= float_class_qnan" tests for
 * any NaN.  The enum is packed to keep it as small as possible.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified = 0,
    float_class_zero         = 1,
    float_class_normal       = 2,
    float_class_inf          = 3,
    float_class_qnan         = 4,  /* all NaNs from here */
    float_class_snan         = 5,
    float_class_dnan         = 6,
    float_class_msnan        = 7,  /* maybe silenced */
} FloatClass;
204 
205 /*
206  * Structure holding all of the decomposed parts of a float. The
207  * exponent is unbiased and the fraction is normalized. All
208  * calculations are done with a 64 bit fraction and then rounded as
209  * appropriate for the final format.
210  *
211  * Thanks to the packed FloatClass a decent compiler should be able to
212  * fit the whole structure into registers and avoid using the stack
213  * for parameter passing.
214  */
215 
216 typedef struct {
217     uint64_t frac;
218     int32_t  exp;
219     FloatClass cls;
220     bool sign;
221 } FloatParts;
222 
/*
 * Layout of the canonical 64 bit fraction: the implicit (integer) bit
 * lives at bit 62 and bit 63 is kept free to catch a carry out of it.
 */
#define DECOMPOSED_BINARY_POINT    62
#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
#define DECOMPOSED_OVERFLOW_BIT    (1ull << (DECOMPOSED_BINARY_POINT + 1))
226 
/* Description of one binary floating point format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 */
typedef struct FloatFmt {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
} FloatFmt;
249 
/*
 * Expand to the designated initializers of a FloatFmt for a format with
 * an E bit exponent field and an F bit fraction field.
 *
 * Every use of E and F is parenthesized so that the macro still expands
 * correctly when an argument is an expression (e.g. "20 + 3") rather
 * than a plain literal.
 */
#define FLOAT_PARAMS(E, F)                                             \
    .exp_size       = (E),                                             \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                           \
    .exp_max        = (1 << (E)) - 1,                                  \
    .frac_size      = (F),                                             \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                   \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),         \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1),   \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1,   \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1
261 
262 static const FloatFmt float16_params = {
263     FLOAT_PARAMS(5, 10)
264 };
265 
266 static const FloatFmt float32_params = {
267     FLOAT_PARAMS(8, 23)
268 };
269 
270 static const FloatFmt float64_params = {
271     FLOAT_PARAMS(11, 52)
272 };
273 
274 /* Unpack a float to parts, but do not canonicalize.  */
275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
276 {
277     const int sign_pos = fmt.frac_size + fmt.exp_size;
278 
279     return (FloatParts) {
280         .cls = float_class_unclassified,
281         .sign = extract64(raw, sign_pos, 1),
282         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
283         .frac = extract64(raw, 0, fmt.frac_size),
284     };
285 }
286 
287 static inline FloatParts float16_unpack_raw(float16 f)
288 {
289     return unpack_raw(float16_params, f);
290 }
291 
292 static inline FloatParts float32_unpack_raw(float32 f)
293 {
294     return unpack_raw(float32_params, f);
295 }
296 
297 static inline FloatParts float64_unpack_raw(float64 f)
298 {
299     return unpack_raw(float64_params, f);
300 }
301 
302 /* Pack a float from parts, but do not canonicalize.  */
303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
304 {
305     const int sign_pos = fmt.frac_size + fmt.exp_size;
306     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
307     return deposit64(ret, sign_pos, 1, p.sign);
308 }
309 
310 static inline float16 float16_pack_raw(FloatParts p)
311 {
312     return make_float16(pack_raw(float16_params, p));
313 }
314 
315 static inline float32 float32_pack_raw(FloatParts p)
316 {
317     return make_float32(pack_raw(float32_params, p));
318 }
319 
320 static inline float64 float64_pack_raw(FloatParts p)
321 {
322     return make_float64(pack_raw(float64_params, p));
323 }
324 
325 /* Canonicalize EXP and FRAC, setting CLS.  */
326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
327                                float_status *status)
328 {
329     if (part.exp == parm->exp_max) {
330         if (part.frac == 0) {
331             part.cls = float_class_inf;
332         } else {
333 #ifdef NO_SIGNALING_NANS
334             part.cls = float_class_qnan;
335 #else
336             int64_t msb = part.frac << (parm->frac_shift + 2);
337             if ((msb < 0) == status->snan_bit_is_one) {
338                 part.cls = float_class_snan;
339             } else {
340                 part.cls = float_class_qnan;
341             }
342 #endif
343         }
344     } else if (part.exp == 0) {
345         if (likely(part.frac == 0)) {
346             part.cls = float_class_zero;
347         } else if (status->flush_inputs_to_zero) {
348             float_raise(float_flag_input_denormal, status);
349             part.cls = float_class_zero;
350             part.frac = 0;
351         } else {
352             int shift = clz64(part.frac) - 1;
353             part.cls = float_class_normal;
354             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
355             part.frac <<= shift;
356         }
357     } else {
358         part.cls = float_class_normal;
359         part.exp -= parm->exp_bias;
360         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
361     }
362     return part;
363 }
364 
365 /* Round and uncanonicalize a floating-point number by parts. There
366  * are FRAC_SHIFT bits that may require rounding at the bottom of the
367  * fraction; these bits will be removed. The exponent will be biased
368  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
369  */
370 
371 static FloatParts round_canonical(FloatParts p, float_status *s,
372                                   const FloatFmt *parm)
373 {
374     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
375     const uint64_t round_mask = parm->round_mask;
376     const uint64_t roundeven_mask = parm->roundeven_mask;
377     const int exp_max = parm->exp_max;
378     const int frac_shift = parm->frac_shift;
379     uint64_t frac, inc;
380     int exp, flags = 0;
381     bool overflow_norm;
382 
383     frac = p.frac;
384     exp = p.exp;
385 
386     switch (p.cls) {
387     case float_class_normal:
388         switch (s->float_rounding_mode) {
389         case float_round_nearest_even:
390             overflow_norm = false;
391             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
392             break;
393         case float_round_ties_away:
394             overflow_norm = false;
395             inc = frac_lsbm1;
396             break;
397         case float_round_to_zero:
398             overflow_norm = true;
399             inc = 0;
400             break;
401         case float_round_up:
402             inc = p.sign ? 0 : round_mask;
403             overflow_norm = p.sign;
404             break;
405         case float_round_down:
406             inc = p.sign ? round_mask : 0;
407             overflow_norm = !p.sign;
408             break;
409         default:
410             g_assert_not_reached();
411         }
412 
413         exp += parm->exp_bias;
414         if (likely(exp > 0)) {
415             if (frac & round_mask) {
416                 flags |= float_flag_inexact;
417                 frac += inc;
418                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
419                     frac >>= 1;
420                     exp++;
421                 }
422             }
423             frac >>= frac_shift;
424 
425             if (unlikely(exp >= exp_max)) {
426                 flags |= float_flag_overflow | float_flag_inexact;
427                 if (overflow_norm) {
428                     exp = exp_max - 1;
429                     frac = -1;
430                 } else {
431                     p.cls = float_class_inf;
432                     goto do_inf;
433                 }
434             }
435         } else if (s->flush_to_zero) {
436             flags |= float_flag_output_denormal;
437             p.cls = float_class_zero;
438             goto do_zero;
439         } else {
440             bool is_tiny = (s->float_detect_tininess
441                             == float_tininess_before_rounding)
442                         || (exp < 0)
443                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
444 
445             shift64RightJamming(frac, 1 - exp, &frac);
446             if (frac & round_mask) {
447                 /* Need to recompute round-to-even.  */
448                 if (s->float_rounding_mode == float_round_nearest_even) {
449                     inc = ((frac & roundeven_mask) != frac_lsbm1
450                            ? frac_lsbm1 : 0);
451                 }
452                 flags |= float_flag_inexact;
453                 frac += inc;
454             }
455 
456             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
457             frac >>= frac_shift;
458 
459             if (is_tiny && (flags & float_flag_inexact)) {
460                 flags |= float_flag_underflow;
461             }
462             if (exp == 0 && frac == 0) {
463                 p.cls = float_class_zero;
464             }
465         }
466         break;
467 
468     case float_class_zero:
469     do_zero:
470         exp = 0;
471         frac = 0;
472         break;
473 
474     case float_class_inf:
475     do_inf:
476         exp = exp_max;
477         frac = 0;
478         break;
479 
480     case float_class_qnan:
481     case float_class_snan:
482         exp = exp_max;
483         break;
484 
485     default:
486         g_assert_not_reached();
487     }
488 
489     float_raise(flags, s);
490     p.exp = exp;
491     p.frac = frac;
492     return p;
493 }
494 
495 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
496 {
497     return canonicalize(float16_unpack_raw(f), &float16_params, s);
498 }
499 
500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
501 {
502     switch (p.cls) {
503     case float_class_dnan:
504         return float16_default_nan(s);
505     case float_class_msnan:
506         return float16_maybe_silence_nan(float16_pack_raw(p), s);
507     default:
508         p = round_canonical(p, s, &float16_params);
509         return float16_pack_raw(p);
510     }
511 }
512 
513 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
514 {
515     return canonicalize(float32_unpack_raw(f), &float32_params, s);
516 }
517 
518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
519 {
520     switch (p.cls) {
521     case float_class_dnan:
522         return float32_default_nan(s);
523     case float_class_msnan:
524         return float32_maybe_silence_nan(float32_pack_raw(p), s);
525     default:
526         p = round_canonical(p, s, &float32_params);
527         return float32_pack_raw(p);
528     }
529 }
530 
531 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
532 {
533     return canonicalize(float64_unpack_raw(f), &float64_params, s);
534 }
535 
536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
537 {
538     switch (p.cls) {
539     case float_class_dnan:
540         return float64_default_nan(s);
541     case float_class_msnan:
542         return float64_maybe_silence_nan(float64_pack_raw(p), s);
543     default:
544         p = round_canonical(p, s, &float64_params);
545         return float64_pack_raw(p);
546     }
547 }
548 
549 /* Simple helpers for checking if what NaN we have */
550 static bool is_nan(FloatClass c)
551 {
552     return unlikely(c >= float_class_qnan);
553 }
554 static bool is_snan(FloatClass c)
555 {
556     return c == float_class_snan;
557 }
558 static bool is_qnan(FloatClass c)
559 {
560     return c == float_class_qnan;
561 }
562 
563 static FloatParts return_nan(FloatParts a, float_status *s)
564 {
565     switch (a.cls) {
566     case float_class_snan:
567         s->float_exception_flags |= float_flag_invalid;
568         a.cls = float_class_msnan;
569         /* fall through */
570     case float_class_qnan:
571         if (s->default_nan_mode) {
572             a.cls = float_class_dnan;
573         }
574         break;
575 
576     default:
577         g_assert_not_reached();
578     }
579     return a;
580 }
581 
582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
583 {
584     if (is_snan(a.cls) || is_snan(b.cls)) {
585         s->float_exception_flags |= float_flag_invalid;
586     }
587 
588     if (s->default_nan_mode) {
589         a.cls = float_class_dnan;
590     } else {
591         if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
592                     is_qnan(b.cls), is_snan(b.cls),
593                     a.frac > b.frac ||
594                     (a.frac == b.frac && a.sign < b.sign))) {
595             a = b;
596         }
597         a.cls = float_class_msnan;
598     }
599     return a;
600 }
601 
602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
603                                   bool inf_zero, float_status *s)
604 {
605     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
606         s->float_exception_flags |= float_flag_invalid;
607     }
608 
609     if (s->default_nan_mode) {
610         a.cls = float_class_dnan;
611     } else {
612         switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
613                               is_qnan(b.cls), is_snan(b.cls),
614                               is_qnan(c.cls), is_snan(c.cls),
615                               inf_zero, s)) {
616         case 0:
617             break;
618         case 1:
619             a = b;
620             break;
621         case 2:
622             a = c;
623             break;
624         case 3:
625             a.cls = float_class_dnan;
626             return a;
627         default:
628             g_assert_not_reached();
629         }
630 
631         a.cls = float_class_msnan;
632     }
633     return a;
634 }
635 
636 /*
637  * Returns the result of adding or subtracting the values of the
638  * floating-point values `a' and `b'. The operation is performed
639  * according to the IEC/IEEE Standard for Binary Floating-Point
640  * Arithmetic.
641  */
642 
643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
644                                 float_status *s)
645 {
646     bool a_sign = a.sign;
647     bool b_sign = b.sign ^ subtract;
648 
649     if (a_sign != b_sign) {
650         /* Subtraction */
651 
652         if (a.cls == float_class_normal && b.cls == float_class_normal) {
653             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
654                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
655                 a.frac = a.frac - b.frac;
656             } else {
657                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
658                 a.frac = b.frac - a.frac;
659                 a.exp = b.exp;
660                 a_sign ^= 1;
661             }
662 
663             if (a.frac == 0) {
664                 a.cls = float_class_zero;
665                 a.sign = s->float_rounding_mode == float_round_down;
666             } else {
667                 int shift = clz64(a.frac) - 1;
668                 a.frac = a.frac << shift;
669                 a.exp = a.exp - shift;
670                 a.sign = a_sign;
671             }
672             return a;
673         }
674         if (is_nan(a.cls) || is_nan(b.cls)) {
675             return pick_nan(a, b, s);
676         }
677         if (a.cls == float_class_inf) {
678             if (b.cls == float_class_inf) {
679                 float_raise(float_flag_invalid, s);
680                 a.cls = float_class_dnan;
681             }
682             return a;
683         }
684         if (a.cls == float_class_zero && b.cls == float_class_zero) {
685             a.sign = s->float_rounding_mode == float_round_down;
686             return a;
687         }
688         if (a.cls == float_class_zero || b.cls == float_class_inf) {
689             b.sign = a_sign ^ 1;
690             return b;
691         }
692         if (b.cls == float_class_zero) {
693             return a;
694         }
695     } else {
696         /* Addition */
697         if (a.cls == float_class_normal && b.cls == float_class_normal) {
698             if (a.exp > b.exp) {
699                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
700             } else if (a.exp < b.exp) {
701                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
702                 a.exp = b.exp;
703             }
704             a.frac += b.frac;
705             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
706                 a.frac >>= 1;
707                 a.exp += 1;
708             }
709             return a;
710         }
711         if (is_nan(a.cls) || is_nan(b.cls)) {
712             return pick_nan(a, b, s);
713         }
714         if (a.cls == float_class_inf || b.cls == float_class_zero) {
715             return a;
716         }
717         if (b.cls == float_class_inf || a.cls == float_class_zero) {
718             b.sign = b_sign;
719             return b;
720         }
721     }
722     g_assert_not_reached();
723 }
724 
725 /*
726  * Returns the result of adding or subtracting the floating-point
727  * values `a' and `b'. The operation is performed according to the
728  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
729  */
730 
731 float16  __attribute__((flatten)) float16_add(float16 a, float16 b,
732                                               float_status *status)
733 {
734     FloatParts pa = float16_unpack_canonical(a, status);
735     FloatParts pb = float16_unpack_canonical(b, status);
736     FloatParts pr = addsub_floats(pa, pb, false, status);
737 
738     return float16_round_pack_canonical(pr, status);
739 }
740 
741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
742                                              float_status *status)
743 {
744     FloatParts pa = float32_unpack_canonical(a, status);
745     FloatParts pb = float32_unpack_canonical(b, status);
746     FloatParts pr = addsub_floats(pa, pb, false, status);
747 
748     return float32_round_pack_canonical(pr, status);
749 }
750 
751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
752                                              float_status *status)
753 {
754     FloatParts pa = float64_unpack_canonical(a, status);
755     FloatParts pb = float64_unpack_canonical(b, status);
756     FloatParts pr = addsub_floats(pa, pb, false, status);
757 
758     return float64_round_pack_canonical(pr, status);
759 }
760 
761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
762                                              float_status *status)
763 {
764     FloatParts pa = float16_unpack_canonical(a, status);
765     FloatParts pb = float16_unpack_canonical(b, status);
766     FloatParts pr = addsub_floats(pa, pb, true, status);
767 
768     return float16_round_pack_canonical(pr, status);
769 }
770 
771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
772                                              float_status *status)
773 {
774     FloatParts pa = float32_unpack_canonical(a, status);
775     FloatParts pb = float32_unpack_canonical(b, status);
776     FloatParts pr = addsub_floats(pa, pb, true, status);
777 
778     return float32_round_pack_canonical(pr, status);
779 }
780 
781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
782                                              float_status *status)
783 {
784     FloatParts pa = float64_unpack_canonical(a, status);
785     FloatParts pb = float64_unpack_canonical(b, status);
786     FloatParts pr = addsub_floats(pa, pb, true, status);
787 
788     return float64_round_pack_canonical(pr, status);
789 }
790 
791 /*
792  * Returns the result of multiplying the floating-point values `a' and
793  * `b'. The operation is performed according to the IEC/IEEE Standard
794  * for Binary Floating-Point Arithmetic.
795  */
796 
797 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
798 {
799     bool sign = a.sign ^ b.sign;
800 
801     if (a.cls == float_class_normal && b.cls == float_class_normal) {
802         uint64_t hi, lo;
803         int exp = a.exp + b.exp;
804 
805         mul64To128(a.frac, b.frac, &hi, &lo);
806         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
807         if (lo & DECOMPOSED_OVERFLOW_BIT) {
808             shift64RightJamming(lo, 1, &lo);
809             exp += 1;
810         }
811 
812         /* Re-use a */
813         a.exp = exp;
814         a.sign = sign;
815         a.frac = lo;
816         return a;
817     }
818     /* handle all the NaN cases */
819     if (is_nan(a.cls) || is_nan(b.cls)) {
820         return pick_nan(a, b, s);
821     }
822     /* Inf * Zero == NaN */
823     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
824         (a.cls == float_class_zero && b.cls == float_class_inf)) {
825         s->float_exception_flags |= float_flag_invalid;
826         a.cls = float_class_dnan;
827         a.sign = sign;
828         return a;
829     }
830     /* Multiply by 0 or Inf */
831     if (a.cls == float_class_inf || a.cls == float_class_zero) {
832         a.sign = sign;
833         return a;
834     }
835     if (b.cls == float_class_inf || b.cls == float_class_zero) {
836         b.sign = sign;
837         return b;
838     }
839     g_assert_not_reached();
840 }
841 
842 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
843                                              float_status *status)
844 {
845     FloatParts pa = float16_unpack_canonical(a, status);
846     FloatParts pb = float16_unpack_canonical(b, status);
847     FloatParts pr = mul_floats(pa, pb, status);
848 
849     return float16_round_pack_canonical(pr, status);
850 }
851 
852 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
853                                              float_status *status)
854 {
855     FloatParts pa = float32_unpack_canonical(a, status);
856     FloatParts pb = float32_unpack_canonical(b, status);
857     FloatParts pr = mul_floats(pa, pb, status);
858 
859     return float32_round_pack_canonical(pr, status);
860 }
861 
862 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
863                                              float_status *status)
864 {
865     FloatParts pa = float64_unpack_canonical(a, status);
866     FloatParts pb = float64_unpack_canonical(b, status);
867     FloatParts pr = mul_floats(pa, pb, status);
868 
869     return float64_round_pack_canonical(pr, status);
870 }
871 
872 /*
873  * Returns the result of multiplying the floating-point values `a' and
874  * `b' then adding 'c', with no intermediate rounding step after the
875  * multiplication. The operation is performed according to the
876  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
877  * The flags argument allows the caller to select negation of the
878  * addend, the intermediate product, or the final result. (The
879  * difference between this and having the caller do a separate
880  * negation is that negating externally will flip the sign bit on
881  * NaNs.)
882  */
883 
/*
 * Fused multiply-add on decomposed operands: computes (a * b) + c as a
 * single FloatParts value; the floatN_muladd() wrappers round and pack it.
 *
 * a, b:  multiplicands.
 * c:     addend.
 * flags: float_muladd_* bits; the ones examined here are negate_c,
 *        negate_product, negate_result and halve_result.
 * s:     status block; float_flag_invalid may be raised in it, and its
 *        rounding mode decides the sign of an exactly-zero result.
 */
static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
                                int flags, float_status *s)
{
    /* True when one of a/b is an infinity and the other is a zero,
     * in either order (the class pair is compared as a bit set).
     */
    bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
                    ((1 << float_class_inf) | (1 << float_class_zero));
    bool p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
        return pick_nan_muladd(a, b, c, inf_zero, s);
    }

    /* Inf * 0 with a non-NaN addend: invalid, result is the default NaN */
    if (inf_zero) {
        s->float_exception_flags |= float_flag_invalid;
        a.cls = float_class_dnan;
        return a;
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    /* p_sign / p_class / p_exp describe the intermediate product a * b */
    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    if (a.cls == float_class_inf || b.cls == float_class_inf) {
        p_class = float_class_inf;
    } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    /* Infinite addend: Inf + (-Inf) is invalid, anything else gives an
     * infinity carrying c's sign (after the optional result negation).
     */
    if (c.cls == float_class_inf) {
        if (p_class == float_class_inf && p_sign != c.sign) {
            s->float_exception_flags |= float_flag_invalid;
            a.cls = float_class_dnan;
        } else {
            a.cls = float_class_inf;
            a.sign = c.sign ^ sign_flip;
        }
        return a;
    }

    /* Infinite product with a finite addend: the product wins */
    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    /* Zero product: the result is c.  When c is also zero and the signs
     * disagree, the zero is negative only when rounding down.  A
     * non-zero c is halved, if requested, by decrementing its exponent.
     */
    if (p_class == float_class_zero) {
        if (c.cls == float_class_zero) {
            if (p_sign != c.sign) {
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
     * result.
     */
    mul64To128(a.frac, b.frac, &hi, &lo);
    /* binary point now at bit 124 */

    /* check for overflow */
    if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
        shift128RightJamming(hi, lo, 1, &hi, &lo);
        p_exp += 1;
    }

    /* + add/sub */
    if (c.cls == float_class_zero) {
        /* move binary point back to 62 */
        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
    } else {
        /* exp_diff > 0: product has the larger exponent, so c is shifted
         * down to it; otherwise the product is shifted down to c.
         */
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                /* Align the product to c and drop to a 62-bit binary
                 * point in a single jamming shift, then add in 64 bits.
                 */
                shift128RightJamming(hi, lo,
                                     DECOMPOSED_BINARY_POINT - exp_diff,
                                     &hi, &lo);
                lo += c.frac;
                p_exp = c.exp;
            } else {
                uint64_t c_hi, c_lo;
                /* shift c to the same binary point as the product (124) */
                c_hi = c.frac >> 2;
                c_lo = 0;
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                add128(hi, lo, c_hi, c_lo, &hi, &lo);
                /* move binary point back to 62 */
                shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
            }

            /* The sum may have carried into the overflow bit: renormalise */
            if (lo & DECOMPOSED_OVERFLOW_BIT) {
                shift64RightJamming(lo, 1, &lo);
                p_exp += 1;
            }

        } else {
            /* Subtraction */
            uint64_t c_hi, c_lo;
            /* make C binary point match product at bit 124 */
            c_hi = c.frac >> 2;
            c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                /* Only with equal exponents can the product still be the
                 * larger magnitude; otherwise compute c - product and
                 * take c's sign (the opposite of p_sign) and exponent.
                 */
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            if (hi == 0 && lo == 0) {
                /* Exact cancellation: zero, negative only when rounding
                 * down, then subject to the result negation flag.
                 */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                /* Count leading zeros across the 128-bit difference */
                int shift;
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Normalizing to a binary point of 124 is the
                   correct adjust for the exponent.  However since we're
                   shifting, we might as well put the binary point back
                   at 62 where we really want it.  Therefore shift as
                   if we're leaving 1 bit at the top of the word, but
                   adjust the exponent as if we're leaving 3 bits.  */
                shift -= 1;
                if (shift >= 64) {
                    lo = lo << (shift - 64);
                } else {
                    hi = (hi << shift) | (lo >> (64 - shift));
                    /* fold any bits shifted out below into a sticky LSB */
                    lo = hi | ((lo << shift) != 0);
                }
                p_exp -= shift - 2;
            }
        }
    }

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = lo;

    return a;
}
1075 
1076 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1077                                                 int flags, float_status *status)
1078 {
1079     FloatParts pa = float16_unpack_canonical(a, status);
1080     FloatParts pb = float16_unpack_canonical(b, status);
1081     FloatParts pc = float16_unpack_canonical(c, status);
1082     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1083 
1084     return float16_round_pack_canonical(pr, status);
1085 }
1086 
1087 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1088                                                 int flags, float_status *status)
1089 {
1090     FloatParts pa = float32_unpack_canonical(a, status);
1091     FloatParts pb = float32_unpack_canonical(b, status);
1092     FloatParts pc = float32_unpack_canonical(c, status);
1093     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1094 
1095     return float32_round_pack_canonical(pr, status);
1096 }
1097 
1098 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1099                                                 int flags, float_status *status)
1100 {
1101     FloatParts pa = float64_unpack_canonical(a, status);
1102     FloatParts pb = float64_unpack_canonical(b, status);
1103     FloatParts pc = float64_unpack_canonical(c, status);
1104     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1105 
1106     return float64_round_pack_canonical(pr, status);
1107 }
1108 
1109 /*
1110  * Returns the result of dividing the floating-point value `a' by the
1111  * corresponding value `b'. The operation is performed according to
1112  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1113  */
1114 
1115 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1116 {
1117     bool sign = a.sign ^ b.sign;
1118 
1119     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1120         uint64_t temp_lo, temp_hi;
1121         int exp = a.exp - b.exp;
1122         if (a.frac < b.frac) {
1123             exp -= 1;
1124             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1125                               &temp_hi, &temp_lo);
1126         } else {
1127             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1128                               &temp_hi, &temp_lo);
1129         }
1130         /* LSB of quot is set if inexact which roundandpack will use
1131          * to set flags. Yet again we re-use a for the result */
1132         a.frac = div128To64(temp_lo, temp_hi, b.frac);
1133         a.sign = sign;
1134         a.exp = exp;
1135         return a;
1136     }
1137     /* handle all the NaN cases */
1138     if (is_nan(a.cls) || is_nan(b.cls)) {
1139         return pick_nan(a, b, s);
1140     }
1141     /* 0/0 or Inf/Inf */
1142     if (a.cls == b.cls
1143         &&
1144         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1145         s->float_exception_flags |= float_flag_invalid;
1146         a.cls = float_class_dnan;
1147         return a;
1148     }
1149     /* Div 0 => Inf */
1150     if (b.cls == float_class_zero) {
1151         s->float_exception_flags |= float_flag_divbyzero;
1152         a.cls = float_class_inf;
1153         a.sign = sign;
1154         return a;
1155     }
1156     /* Inf / x or 0 / x */
1157     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1158         a.sign = sign;
1159         return a;
1160     }
1161     /* Div by Inf */
1162     if (b.cls == float_class_inf) {
1163         a.cls = float_class_zero;
1164         a.sign = sign;
1165         return a;
1166     }
1167     g_assert_not_reached();
1168 }
1169 
1170 float16 float16_div(float16 a, float16 b, float_status *status)
1171 {
1172     FloatParts pa = float16_unpack_canonical(a, status);
1173     FloatParts pb = float16_unpack_canonical(b, status);
1174     FloatParts pr = div_floats(pa, pb, status);
1175 
1176     return float16_round_pack_canonical(pr, status);
1177 }
1178 
1179 float32 float32_div(float32 a, float32 b, float_status *status)
1180 {
1181     FloatParts pa = float32_unpack_canonical(a, status);
1182     FloatParts pb = float32_unpack_canonical(b, status);
1183     FloatParts pr = div_floats(pa, pb, status);
1184 
1185     return float32_round_pack_canonical(pr, status);
1186 }
1187 
1188 float64 float64_div(float64 a, float64 b, float_status *status)
1189 {
1190     FloatParts pa = float64_unpack_canonical(a, status);
1191     FloatParts pb = float64_unpack_canonical(b, status);
1192     FloatParts pr = div_floats(pa, pb, status);
1193 
1194     return float64_round_pack_canonical(pr, status);
1195 }
1196 
1197 /*
1198  * Rounds the floating-point value `a' to an integer, and returns the
1199  * result as a floating-point value. The operation is performed
1200  * according to the IEC/IEEE Standard for Binary Floating-Point
1201  * Arithmetic.
1202  */
1203 
1204 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1205 {
1206     if (is_nan(a.cls)) {
1207         return return_nan(a, s);
1208     }
1209 
1210     switch (a.cls) {
1211     case float_class_zero:
1212     case float_class_inf:
1213     case float_class_qnan:
1214         /* already "integral" */
1215         break;
1216     case float_class_normal:
1217         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1218             /* already integral */
1219             break;
1220         }
1221         if (a.exp < 0) {
1222             bool one;
1223             /* all fractional */
1224             s->float_exception_flags |= float_flag_inexact;
1225             switch (rounding_mode) {
1226             case float_round_nearest_even:
1227                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1228                 break;
1229             case float_round_ties_away:
1230                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1231                 break;
1232             case float_round_to_zero:
1233                 one = false;
1234                 break;
1235             case float_round_up:
1236                 one = !a.sign;
1237                 break;
1238             case float_round_down:
1239                 one = a.sign;
1240                 break;
1241             default:
1242                 g_assert_not_reached();
1243             }
1244 
1245             if (one) {
1246                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1247                 a.exp = 0;
1248             } else {
1249                 a.cls = float_class_zero;
1250             }
1251         } else {
1252             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1253             uint64_t frac_lsbm1 = frac_lsb >> 1;
1254             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1255             uint64_t rnd_mask = rnd_even_mask >> 1;
1256             uint64_t inc;
1257 
1258             switch (rounding_mode) {
1259             case float_round_nearest_even:
1260                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1261                 break;
1262             case float_round_ties_away:
1263                 inc = frac_lsbm1;
1264                 break;
1265             case float_round_to_zero:
1266                 inc = 0;
1267                 break;
1268             case float_round_up:
1269                 inc = a.sign ? 0 : rnd_mask;
1270                 break;
1271             case float_round_down:
1272                 inc = a.sign ? rnd_mask : 0;
1273                 break;
1274             default:
1275                 g_assert_not_reached();
1276             }
1277 
1278             if (a.frac & rnd_mask) {
1279                 s->float_exception_flags |= float_flag_inexact;
1280                 a.frac += inc;
1281                 a.frac &= ~rnd_mask;
1282                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1283                     a.frac >>= 1;
1284                     a.exp++;
1285                 }
1286             }
1287         }
1288         break;
1289     default:
1290         g_assert_not_reached();
1291     }
1292     return a;
1293 }
1294 
1295 float16 float16_round_to_int(float16 a, float_status *s)
1296 {
1297     FloatParts pa = float16_unpack_canonical(a, s);
1298     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1299     return float16_round_pack_canonical(pr, s);
1300 }
1301 
1302 float32 float32_round_to_int(float32 a, float_status *s)
1303 {
1304     FloatParts pa = float32_unpack_canonical(a, s);
1305     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1306     return float32_round_pack_canonical(pr, s);
1307 }
1308 
1309 float64 float64_round_to_int(float64 a, float_status *s)
1310 {
1311     FloatParts pa = float64_unpack_canonical(a, s);
1312     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1313     return float64_round_pack_canonical(pr, s);
1314 }
1315 
1316 float64 float64_trunc_to_int(float64 a, float_status *s)
1317 {
1318     FloatParts pa = float64_unpack_canonical(a, s);
1319     FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1320     return float64_round_pack_canonical(pr, s);
1321 }
1322 
1323 /*
1324  * Returns the result of converting the floating-point value `a' to
1325  * the two's complement integer format. The conversion is performed
1326  * according to the IEC/IEEE Standard for Binary Floating-Point
1327  * Arithmetic---which means in particular that the conversion is
1328  * rounded according to the current rounding mode. If `a' is a NaN,
1329  * the largest positive integer is returned. Otherwise, if the
1330  * conversion overflows, the largest integer with the same sign as `a'
1331  * is returned.
1332 */
1333 
1334 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1335                                      int64_t min, int64_t max,
1336                                      float_status *s)
1337 {
1338     uint64_t r;
1339     int orig_flags = get_float_exception_flags(s);
1340     FloatParts p = round_to_int(in, rmode, s);
1341 
1342     switch (p.cls) {
1343     case float_class_snan:
1344     case float_class_qnan:
1345         return max;
1346     case float_class_inf:
1347         return p.sign ? min : max;
1348     case float_class_zero:
1349         return 0;
1350     case float_class_normal:
1351         if (p.exp < DECOMPOSED_BINARY_POINT) {
1352             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1353         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1354             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1355         } else {
1356             r = UINT64_MAX;
1357         }
1358         if (p.sign) {
1359             if (r < -(uint64_t) min) {
1360                 return -r;
1361             } else {
1362                 s->float_exception_flags = orig_flags | float_flag_invalid;
1363                 return min;
1364             }
1365         } else {
1366             if (r < max) {
1367                 return r;
1368             } else {
1369                 s->float_exception_flags = orig_flags | float_flag_invalid;
1370                 return max;
1371             }
1372         }
1373     default:
1374         g_assert_not_reached();
1375     }
1376 }
1377 
1378 #define FLOAT_TO_INT(fsz, isz)                                          \
1379 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a,         \
1380                                                 float_status *s)        \
1381 {                                                                       \
1382     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1383     return round_to_int_and_pack(p, s->float_rounding_mode,             \
1384                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1385                                  s);                                    \
1386 }                                                                       \
1387                                                                         \
1388 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero       \
1389  (float ## fsz a, float_status *s)                                      \
1390 {                                                                       \
1391     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1392     return round_to_int_and_pack(p, float_round_to_zero,                \
1393                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1394                                  s);                                    \
1395 }
1396 
1397 FLOAT_TO_INT(16, 16)
1398 FLOAT_TO_INT(16, 32)
1399 FLOAT_TO_INT(16, 64)
1400 
1401 FLOAT_TO_INT(32, 16)
1402 FLOAT_TO_INT(32, 32)
1403 FLOAT_TO_INT(32, 64)
1404 
1405 FLOAT_TO_INT(64, 16)
1406 FLOAT_TO_INT(64, 32)
1407 FLOAT_TO_INT(64, 64)
1408 
1409 #undef FLOAT_TO_INT
1410 
1411 /*
1412  *  Returns the result of converting the floating-point value `a' to
1413  *  the unsigned integer format. The conversion is performed according
1414  *  to the IEC/IEEE Standard for Binary Floating-Point
1415  *  Arithmetic---which means in particular that the conversion is
1416  *  rounded according to the current rounding mode. If `a' is a NaN,
1417  *  the largest unsigned integer is returned. Otherwise, if the
1418  *  conversion overflows, the largest unsigned integer is returned. If
 *  `a' is negative, the result is rounded and zero is returned;
 *  values that do not round to zero will raise the invalid exception
 *  flag.
1422  */
1423 
1424 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1425                                        float_status *s)
1426 {
1427     int orig_flags = get_float_exception_flags(s);
1428     FloatParts p = round_to_int(in, rmode, s);
1429 
1430     switch (p.cls) {
1431     case float_class_snan:
1432     case float_class_qnan:
1433         s->float_exception_flags = orig_flags | float_flag_invalid;
1434         return max;
1435     case float_class_inf:
1436         return p.sign ? 0 : max;
1437     case float_class_zero:
1438         return 0;
1439     case float_class_normal:
1440     {
1441         uint64_t r;
1442         if (p.sign) {
1443             s->float_exception_flags = orig_flags | float_flag_invalid;
1444             return 0;
1445         }
1446 
1447         if (p.exp < DECOMPOSED_BINARY_POINT) {
1448             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1449         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1450             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1451         } else {
1452             s->float_exception_flags = orig_flags | float_flag_invalid;
1453             return max;
1454         }
1455 
1456         /* For uint64 this will never trip, but if p.exp is too large
1457          * to shift a decomposed fraction we shall have exited via the
1458          * 3rd leg above.
1459          */
1460         if (r > max) {
1461             s->float_exception_flags = orig_flags | float_flag_invalid;
1462             return max;
1463         } else {
1464             return r;
1465         }
1466     }
1467     default:
1468         g_assert_not_reached();
1469     }
1470 }
1471 
1472 #define FLOAT_TO_UINT(fsz, isz) \
1473 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a,       \
1474                                                   float_status *s)      \
1475 {                                                                       \
1476     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1477     return round_to_uint_and_pack(p, s->float_rounding_mode,            \
1478                                  UINT ## isz ## _MAX, s);               \
1479 }                                                                       \
1480                                                                         \
1481 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero     \
1482  (float ## fsz a, float_status *s)                                      \
1483 {                                                                       \
1484     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1485     return round_to_uint_and_pack(p, s->float_rounding_mode,            \
1486                                  UINT ## isz ## _MAX, s);               \
1487 }
1488 
1489 FLOAT_TO_UINT(16, 16)
1490 FLOAT_TO_UINT(16, 32)
1491 FLOAT_TO_UINT(16, 64)
1492 
1493 FLOAT_TO_UINT(32, 16)
1494 FLOAT_TO_UINT(32, 32)
1495 FLOAT_TO_UINT(32, 64)
1496 
1497 FLOAT_TO_UINT(64, 16)
1498 FLOAT_TO_UINT(64, 32)
1499 FLOAT_TO_UINT(64, 64)
1500 
1501 #undef FLOAT_TO_UINT
1502 
1503 /*
1504  * Integer to float conversions
1505  *
1506  * Returns the result of converting the two's complement integer `a'
1507  * to the floating-point format. The conversion is performed according
1508  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1509  */
1510 
1511 static FloatParts int_to_float(int64_t a, float_status *status)
1512 {
1513     FloatParts r;
1514     if (a == 0) {
1515         r.cls = float_class_zero;
1516         r.sign = false;
1517     } else if (a == (1ULL << 63)) {
1518         r.cls = float_class_normal;
1519         r.sign = true;
1520         r.frac = DECOMPOSED_IMPLICIT_BIT;
1521         r.exp = 63;
1522     } else {
1523         uint64_t f;
1524         if (a < 0) {
1525             f = -a;
1526             r.sign = true;
1527         } else {
1528             f = a;
1529             r.sign = false;
1530         }
1531         int shift = clz64(f) - 1;
1532         r.cls = float_class_normal;
1533         r.exp = (DECOMPOSED_BINARY_POINT - shift);
1534         r.frac = f << shift;
1535     }
1536 
1537     return r;
1538 }
1539 
1540 float16 int64_to_float16(int64_t a, float_status *status)
1541 {
1542     FloatParts pa = int_to_float(a, status);
1543     return float16_round_pack_canonical(pa, status);
1544 }
1545 
1546 float16 int32_to_float16(int32_t a, float_status *status)
1547 {
1548     return int64_to_float16(a, status);
1549 }
1550 
1551 float16 int16_to_float16(int16_t a, float_status *status)
1552 {
1553     return int64_to_float16(a, status);
1554 }
1555 
1556 float32 int64_to_float32(int64_t a, float_status *status)
1557 {
1558     FloatParts pa = int_to_float(a, status);
1559     return float32_round_pack_canonical(pa, status);
1560 }
1561 
1562 float32 int32_to_float32(int32_t a, float_status *status)
1563 {
1564     return int64_to_float32(a, status);
1565 }
1566 
1567 float32 int16_to_float32(int16_t a, float_status *status)
1568 {
1569     return int64_to_float32(a, status);
1570 }
1571 
1572 float64 int64_to_float64(int64_t a, float_status *status)
1573 {
1574     FloatParts pa = int_to_float(a, status);
1575     return float64_round_pack_canonical(pa, status);
1576 }
1577 
1578 float64 int32_to_float64(int32_t a, float_status *status)
1579 {
1580     return int64_to_float64(a, status);
1581 }
1582 
1583 float64 int16_to_float64(int16_t a, float_status *status)
1584 {
1585     return int64_to_float64(a, status);
1586 }
1587 
1588 
1589 /*
1590  * Unsigned Integer to float conversions
1591  *
1592  * Returns the result of converting the unsigned integer `a' to the
1593  * floating-point format. The conversion is performed according to the
1594  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1595  */
1596 
1597 static FloatParts uint_to_float(uint64_t a, float_status *status)
1598 {
1599     FloatParts r = { .sign = false};
1600 
1601     if (a == 0) {
1602         r.cls = float_class_zero;
1603     } else {
1604         int spare_bits = clz64(a) - 1;
1605         r.cls = float_class_normal;
1606         r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1607         if (spare_bits < 0) {
1608             shift64RightJamming(a, -spare_bits, &a);
1609             r.frac = a;
1610         } else {
1611             r.frac = a << spare_bits;
1612         }
1613     }
1614 
1615     return r;
1616 }
1617 
1618 float16 uint64_to_float16(uint64_t a, float_status *status)
1619 {
1620     FloatParts pa = uint_to_float(a, status);
1621     return float16_round_pack_canonical(pa, status);
1622 }
1623 
1624 float16 uint32_to_float16(uint32_t a, float_status *status)
1625 {
1626     return uint64_to_float16(a, status);
1627 }
1628 
1629 float16 uint16_to_float16(uint16_t a, float_status *status)
1630 {
1631     return uint64_to_float16(a, status);
1632 }
1633 
1634 float32 uint64_to_float32(uint64_t a, float_status *status)
1635 {
1636     FloatParts pa = uint_to_float(a, status);
1637     return float32_round_pack_canonical(pa, status);
1638 }
1639 
1640 float32 uint32_to_float32(uint32_t a, float_status *status)
1641 {
1642     return uint64_to_float32(a, status);
1643 }
1644 
1645 float32 uint16_to_float32(uint16_t a, float_status *status)
1646 {
1647     return uint64_to_float32(a, status);
1648 }
1649 
1650 float64 uint64_to_float64(uint64_t a, float_status *status)
1651 {
1652     FloatParts pa = uint_to_float(a, status);
1653     return float64_round_pack_canonical(pa, status);
1654 }
1655 
1656 float64 uint32_to_float64(uint32_t a, float_status *status)
1657 {
1658     return uint64_to_float64(a, status);
1659 }
1660 
1661 float64 uint16_to_float64(uint16_t a, float_status *status)
1662 {
1663     return uint64_to_float64(a, status);
1664 }
1665 
1666 /*----------------------------------------------------------------------------
1667 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
1668 | and 7, and returns the properly rounded 32-bit integer corresponding to the
1669 | input.  If `zSign' is 1, the input is negated before being converted to an
1670 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
1671 | is simply rounded to an integer, with the inexact exception raised if the
1672 | input cannot be represented exactly as an integer.  However, if the fixed-
1673 | point input is too large, the invalid exception is raised and the largest
1674 | positive or negative integer is returned.
1675 *----------------------------------------------------------------------------*/
1676 
1677 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
1678 {
1679     int8_t roundingMode;
1680     flag roundNearestEven;
1681     int8_t roundIncrement, roundBits;
1682     int32_t z;
1683 
1684     roundingMode = status->float_rounding_mode;
1685     roundNearestEven = ( roundingMode == float_round_nearest_even );
1686     switch (roundingMode) {
1687     case float_round_nearest_even:
1688     case float_round_ties_away:
1689         roundIncrement = 0x40;
1690         break;
1691     case float_round_to_zero:
1692         roundIncrement = 0;
1693         break;
1694     case float_round_up:
1695         roundIncrement = zSign ? 0 : 0x7f;
1696         break;
1697     case float_round_down:
1698         roundIncrement = zSign ? 0x7f : 0;
1699         break;
1700     default:
1701         abort();
1702     }
1703     roundBits = absZ & 0x7F;
1704     absZ = ( absZ + roundIncrement )>>7;
1705     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
1706     z = absZ;
1707     if ( zSign ) z = - z;
1708     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
1709         float_raise(float_flag_invalid, status);
1710         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
1711     }
1712     if (roundBits) {
1713         status->float_exception_flags |= float_flag_inexact;
1714     }
1715     return z;
1716 
1717 }
1718 
1719 /*----------------------------------------------------------------------------
1720 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
1721 | `absZ1', with binary point between bits 63 and 64 (between the input words),
1722 | and returns the properly rounded 64-bit integer corresponding to the input.
1723 | If `zSign' is 1, the input is negated before being converted to an integer.
1724 | Ordinarily, the fixed-point input is simply rounded to an integer, with
1725 | the inexact exception raised if the input cannot be represented exactly as
1726 | an integer.  However, if the fixed-point input is too large, the invalid
1727 | exception is raised and the largest positive or negative integer is
1728 | returned.
1729 *----------------------------------------------------------------------------*/
1730 
1731 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
1732                                float_status *status)
1733 {
1734     int8_t roundingMode;
1735     flag roundNearestEven, increment;
1736     int64_t z;
1737 
1738     roundingMode = status->float_rounding_mode;
1739     roundNearestEven = ( roundingMode == float_round_nearest_even );
1740     switch (roundingMode) {
1741     case float_round_nearest_even:
1742     case float_round_ties_away:
1743         increment = ((int64_t) absZ1 < 0);
1744         break;
1745     case float_round_to_zero:
1746         increment = 0;
1747         break;
1748     case float_round_up:
1749         increment = !zSign && absZ1;
1750         break;
1751     case float_round_down:
1752         increment = zSign && absZ1;
1753         break;
1754     default:
1755         abort();
1756     }
1757     if ( increment ) {
1758         ++absZ0;
1759         if ( absZ0 == 0 ) goto overflow;
1760         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
1761     }
1762     z = absZ0;
1763     if ( zSign ) z = - z;
1764     if ( z && ( ( z < 0 ) ^ zSign ) ) {
1765  overflow:
1766         float_raise(float_flag_invalid, status);
1767         return
1768               zSign ? (int64_t) LIT64( 0x8000000000000000 )
1769             : LIT64( 0x7FFFFFFFFFFFFFFF );
1770     }
1771     if (absZ1) {
1772         status->float_exception_flags |= float_flag_inexact;
1773     }
1774     return z;
1775 
1776 }
1777 
1778 /*----------------------------------------------------------------------------
1779 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
1780 | `absZ1', with binary point between bits 63 and 64 (between the input words),
1781 | and returns the properly rounded 64-bit unsigned integer corresponding to the
1782 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
1783 | with the inexact exception raised if the input cannot be represented exactly
1784 | as an integer.  However, if the fixed-point input is too large, the invalid
1785 | exception is raised and the largest unsigned integer is returned.
1786 *----------------------------------------------------------------------------*/
1787 
1788 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
1789                                 uint64_t absZ1, float_status *status)
1790 {
1791     int8_t roundingMode;
1792     flag roundNearestEven, increment;
1793 
1794     roundingMode = status->float_rounding_mode;
1795     roundNearestEven = (roundingMode == float_round_nearest_even);
1796     switch (roundingMode) {
1797     case float_round_nearest_even:
1798     case float_round_ties_away:
1799         increment = ((int64_t)absZ1 < 0);
1800         break;
1801     case float_round_to_zero:
1802         increment = 0;
1803         break;
1804     case float_round_up:
1805         increment = !zSign && absZ1;
1806         break;
1807     case float_round_down:
1808         increment = zSign && absZ1;
1809         break;
1810     default:
1811         abort();
1812     }
1813     if (increment) {
1814         ++absZ0;
1815         if (absZ0 == 0) {
1816             float_raise(float_flag_invalid, status);
1817             return LIT64(0xFFFFFFFFFFFFFFFF);
1818         }
1819         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
1820     }
1821 
1822     if (zSign && absZ0) {
1823         float_raise(float_flag_invalid, status);
1824         return 0;
1825     }
1826 
1827     if (absZ1) {
1828         status->float_exception_flags |= float_flag_inexact;
1829     }
1830     return absZ0;
1831 }
1832 
1833 /*----------------------------------------------------------------------------
1834 | If `a' is denormal and we are in flush-to-zero mode then set the
1835 | input-denormal exception and return zero. Otherwise just return the value.
1836 *----------------------------------------------------------------------------*/
1837 float32 float32_squash_input_denormal(float32 a, float_status *status)
1838 {
1839     if (status->flush_inputs_to_zero) {
1840         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
1841             float_raise(float_flag_input_denormal, status);
1842             return make_float32(float32_val(a) & 0x80000000);
1843         }
1844     }
1845     return a;
1846 }
1847 
1848 /*----------------------------------------------------------------------------
1849 | Normalizes the subnormal single-precision floating-point value represented
1850 | by the denormalized significand `aSig'.  The normalized exponent and
1851 | significand are stored at the locations pointed to by `zExpPtr' and
1852 | `zSigPtr', respectively.
1853 *----------------------------------------------------------------------------*/
1854 
static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /* Shift that leaves exactly 8 leading zero bits in the significand. */
    const int8_t lead = countLeadingZeros32(aSig) - 8;

    *zSigPtr = aSig << lead;
    /* The exponent is lowered by one for every bit shifted in. */
    *zExpPtr = 1 - lead;
}
1865 
1866 /*----------------------------------------------------------------------------
1867 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
1868 | single-precision floating-point value, returning the result.  After being
1869 | shifted into the proper positions, the three fields are simply added
1870 | together to form the result.  This means that any integer portion of `zSig'
1871 | will be added into the exponent.  Since a properly normalized significand
1872 | will have an integer portion equal to 1, the `zExp' input should be 1 less
1873 | than the desired result exponent whenever `zSig' is a complete, normalized
1874 | significand.
1875 *----------------------------------------------------------------------------*/
1876 
1877 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
1878 {
1879 
1880     return make_float32(
1881           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
1882 
1883 }
1884 
1885 /*----------------------------------------------------------------------------
1886 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1887 | and significand `zSig', and returns the proper single-precision floating-
1888 | point value corresponding to the abstract input.  Ordinarily, the abstract
1889 | value is simply rounded and packed into the single-precision format, with
1890 | the inexact exception raised if the abstract input cannot be represented
1891 | exactly.  However, if the abstract value is too large, the overflow and
1892 | inexact exceptions are raised and an infinity or maximal finite value is
1893 | returned.  If the abstract value is too small, the input value is rounded to
1894 | a subnormal number, and the underflow and inexact exceptions are raised if
1895 | the abstract input cannot be represented exactly as a subnormal single-
1896 | precision floating-point number.
1897 |     The input significand `zSig' has its binary point between bits 30
1898 | and 29, which is 7 bits to the left of the usual location.  This shifted
1899 | significand must be normalized or smaller.  If `zSig' is not normalized,
1900 | `zExp' must be 0; in that case, the result returned is a subnormal number,
1901 | and it must not require rounding.  In the usual case that `zSig' is
1902 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
1903 | The handling of underflow and overflow follows the IEC/IEEE Standard for
1904 | Binary Floating-Point Arithmetic.
1905 *----------------------------------------------------------------------------*/
1906 
1907 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
1908                                    float_status *status)
1909 {
1910     int8_t roundingMode;
1911     flag roundNearestEven;
1912     int8_t roundIncrement, roundBits;
1913     flag isTiny;
1914 
1915     roundingMode = status->float_rounding_mode;
1916     roundNearestEven = ( roundingMode == float_round_nearest_even );
1917     switch (roundingMode) {
1918     case float_round_nearest_even:
1919     case float_round_ties_away:
1920         roundIncrement = 0x40;
1921         break;
1922     case float_round_to_zero:
1923         roundIncrement = 0;
1924         break;
1925     case float_round_up:
1926         roundIncrement = zSign ? 0 : 0x7f;
1927         break;
1928     case float_round_down:
1929         roundIncrement = zSign ? 0x7f : 0;
1930         break;
1931     default:
1932         abort();
1933         break;
1934     }
1935     roundBits = zSig & 0x7F;
1936     if ( 0xFD <= (uint16_t) zExp ) {
1937         if (    ( 0xFD < zExp )
1938              || (    ( zExp == 0xFD )
1939                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
1940            ) {
1941             float_raise(float_flag_overflow | float_flag_inexact, status);
1942             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
1943         }
1944         if ( zExp < 0 ) {
1945             if (status->flush_to_zero) {
1946                 float_raise(float_flag_output_denormal, status);
1947                 return packFloat32(zSign, 0, 0);
1948             }
1949             isTiny =
1950                 (status->float_detect_tininess
1951                  == float_tininess_before_rounding)
1952                 || ( zExp < -1 )
1953                 || ( zSig + roundIncrement < 0x80000000 );
1954             shift32RightJamming( zSig, - zExp, &zSig );
1955             zExp = 0;
1956             roundBits = zSig & 0x7F;
1957             if (isTiny && roundBits) {
1958                 float_raise(float_flag_underflow, status);
1959             }
1960         }
1961     }
1962     if (roundBits) {
1963         status->float_exception_flags |= float_flag_inexact;
1964     }
1965     zSig = ( zSig + roundIncrement )>>7;
1966     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
1967     if ( zSig == 0 ) zExp = 0;
1968     return packFloat32( zSign, zExp, zSig );
1969 
1970 }
1971 
1972 /*----------------------------------------------------------------------------
1973 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1974 | and significand `zSig', and returns the proper single-precision floating-
1975 | point value corresponding to the abstract input.  This routine is just like
1976 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
1977 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
1978 | floating-point exponent.
1979 *----------------------------------------------------------------------------*/
1980 
1981 static float32
1982  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
1983                               float_status *status)
1984 {
1985     int8_t shiftCount;
1986 
1987     shiftCount = countLeadingZeros32( zSig ) - 1;
1988     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
1989                                status);
1990 
1991 }
1992 
1993 /*----------------------------------------------------------------------------
1994 | If `a' is denormal and we are in flush-to-zero mode then set the
1995 | input-denormal exception and return zero. Otherwise just return the value.
1996 *----------------------------------------------------------------------------*/
1997 float64 float64_squash_input_denormal(float64 a, float_status *status)
1998 {
1999     if (status->flush_inputs_to_zero) {
2000         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2001             float_raise(float_flag_input_denormal, status);
2002             return make_float64(float64_val(a) & (1ULL << 63));
2003         }
2004     }
2005     return a;
2006 }
2007 
2008 /*----------------------------------------------------------------------------
2009 | Normalizes the subnormal double-precision floating-point value represented
2010 | by the denormalized significand `aSig'.  The normalized exponent and
2011 | significand are stored at the locations pointed to by `zExpPtr' and
2012 | `zSigPtr', respectively.
2013 *----------------------------------------------------------------------------*/
2014 
static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    /* Shift that leaves exactly 11 leading zero bits in the significand. */
    const int8_t lead = countLeadingZeros64(aSig) - 11;

    *zSigPtr = aSig << lead;
    /* The exponent is lowered by one for every bit shifted in. */
    *zExpPtr = 1 - lead;
}
2025 
2026 /*----------------------------------------------------------------------------
2027 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2028 | double-precision floating-point value, returning the result.  After being
2029 | shifted into the proper positions, the three fields are simply added
2030 | together to form the result.  This means that any integer portion of `zSig'
2031 | will be added into the exponent.  Since a properly normalized significand
2032 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2033 | than the desired result exponent whenever `zSig' is a complete, normalized
2034 | significand.
2035 *----------------------------------------------------------------------------*/
2036 
2037 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2038 {
2039 
2040     return make_float64(
2041         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2042 
2043 }
2044 
2045 /*----------------------------------------------------------------------------
2046 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2047 | and significand `zSig', and returns the proper double-precision floating-
2048 | point value corresponding to the abstract input.  Ordinarily, the abstract
2049 | value is simply rounded and packed into the double-precision format, with
2050 | the inexact exception raised if the abstract input cannot be represented
2051 | exactly.  However, if the abstract value is too large, the overflow and
2052 | inexact exceptions are raised and an infinity or maximal finite value is
2053 | returned.  If the abstract value is too small, the input value is rounded to
2054 | a subnormal number, and the underflow and inexact exceptions are raised if
2055 | the abstract input cannot be represented exactly as a subnormal double-
2056 | precision floating-point number.
2057 |     The input significand `zSig' has its binary point between bits 62
2058 | and 61, which is 10 bits to the left of the usual location.  This shifted
2059 | significand must be normalized or smaller.  If `zSig' is not normalized,
2060 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2061 | and it must not require rounding.  In the usual case that `zSig' is
2062 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2063 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2064 | Binary Floating-Point Arithmetic.
2065 *----------------------------------------------------------------------------*/
2066 
2067 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2068                                    float_status *status)
2069 {
2070     int8_t roundingMode;
2071     flag roundNearestEven;
2072     int roundIncrement, roundBits;
2073     flag isTiny;
2074 
2075     roundingMode = status->float_rounding_mode;
2076     roundNearestEven = ( roundingMode == float_round_nearest_even );
2077     switch (roundingMode) {
2078     case float_round_nearest_even:
2079     case float_round_ties_away:
2080         roundIncrement = 0x200;
2081         break;
2082     case float_round_to_zero:
2083         roundIncrement = 0;
2084         break;
2085     case float_round_up:
2086         roundIncrement = zSign ? 0 : 0x3ff;
2087         break;
2088     case float_round_down:
2089         roundIncrement = zSign ? 0x3ff : 0;
2090         break;
2091     case float_round_to_odd:
2092         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2093         break;
2094     default:
2095         abort();
2096     }
2097     roundBits = zSig & 0x3FF;
2098     if ( 0x7FD <= (uint16_t) zExp ) {
2099         if (    ( 0x7FD < zExp )
2100              || (    ( zExp == 0x7FD )
2101                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2102            ) {
2103             bool overflow_to_inf = roundingMode != float_round_to_odd &&
2104                                    roundIncrement != 0;
2105             float_raise(float_flag_overflow | float_flag_inexact, status);
2106             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2107         }
2108         if ( zExp < 0 ) {
2109             if (status->flush_to_zero) {
2110                 float_raise(float_flag_output_denormal, status);
2111                 return packFloat64(zSign, 0, 0);
2112             }
2113             isTiny =
2114                    (status->float_detect_tininess
2115                     == float_tininess_before_rounding)
2116                 || ( zExp < -1 )
2117                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2118             shift64RightJamming( zSig, - zExp, &zSig );
2119             zExp = 0;
2120             roundBits = zSig & 0x3FF;
2121             if (isTiny && roundBits) {
2122                 float_raise(float_flag_underflow, status);
2123             }
2124             if (roundingMode == float_round_to_odd) {
2125                 /*
2126                  * For round-to-odd case, the roundIncrement depends on
2127                  * zSig which just changed.
2128                  */
2129                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2130             }
2131         }
2132     }
2133     if (roundBits) {
2134         status->float_exception_flags |= float_flag_inexact;
2135     }
2136     zSig = ( zSig + roundIncrement )>>10;
2137     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2138     if ( zSig == 0 ) zExp = 0;
2139     return packFloat64( zSign, zExp, zSig );
2140 
2141 }
2142 
2143 /*----------------------------------------------------------------------------
2144 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2145 | and significand `zSig', and returns the proper double-precision floating-
2146 | point value corresponding to the abstract input.  This routine is just like
2147 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2148 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2149 | floating-point exponent.
2150 *----------------------------------------------------------------------------*/
2151 
2152 static float64
2153  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2154                               float_status *status)
2155 {
2156     int8_t shiftCount;
2157 
2158     shiftCount = countLeadingZeros64( zSig ) - 1;
2159     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2160                                status);
2161 
2162 }
2163 
2164 /*----------------------------------------------------------------------------
2165 | Returns the fraction bits of the extended double-precision floating-point
2166 | value `a'.
2167 *----------------------------------------------------------------------------*/
2168 
2169 static inline uint64_t extractFloatx80Frac( floatx80 a )
2170 {
2171 
2172     return a.low;
2173 
2174 }
2175 
2176 /*----------------------------------------------------------------------------
2177 | Returns the exponent bits of the extended double-precision floating-point
2178 | value `a'.
2179 *----------------------------------------------------------------------------*/
2180 
2181 static inline int32_t extractFloatx80Exp( floatx80 a )
2182 {
2183 
2184     return a.high & 0x7FFF;
2185 
2186 }
2187 
2188 /*----------------------------------------------------------------------------
2189 | Returns the sign bit of the extended double-precision floating-point value
2190 | `a'.
2191 *----------------------------------------------------------------------------*/
2192 
2193 static inline flag extractFloatx80Sign( floatx80 a )
2194 {
2195 
2196     return a.high>>15;
2197 
2198 }
2199 
2200 /*----------------------------------------------------------------------------
2201 | Normalizes the subnormal extended double-precision floating-point value
2202 | represented by the denormalized significand `aSig'.  The normalized exponent
2203 | and significand are stored at the locations pointed to by `zExpPtr' and
2204 | `zSigPtr', respectively.
2205 *----------------------------------------------------------------------------*/
2206 
static void
 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr )
{
    /* Shift that brings the leading set bit up to bit 63. */
    const int8_t lead = countLeadingZeros64(aSig);

    *zSigPtr = aSig << lead;
    /* The exponent is lowered by one for every bit shifted in. */
    *zExpPtr = 1 - lead;
}
2217 
2218 /*----------------------------------------------------------------------------
2219 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
2220 | extended double-precision floating-point value, returning the result.
2221 *----------------------------------------------------------------------------*/
2222 
2223 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
2224 {
2225     floatx80 z;
2226 
2227     z.low = zSig;
2228     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
2229     return z;
2230 
2231 }
2232 
2233 /*----------------------------------------------------------------------------
2234 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2235 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2236 | and returns the proper extended double-precision floating-point value
2237 | corresponding to the abstract input.  Ordinarily, the abstract value is
2238 | rounded and packed into the extended double-precision format, with the
2239 | inexact exception raised if the abstract input cannot be represented
2240 | exactly.  However, if the abstract value is too large, the overflow and
2241 | inexact exceptions are raised and an infinity or maximal finite value is
2242 | returned.  If the abstract value is too small, the input value is rounded to
2243 | a subnormal number, and the underflow and inexact exceptions are raised if
2244 | the abstract input cannot be represented exactly as a subnormal extended
2245 | double-precision floating-point number.
2246 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
2247 | number of bits as single or double precision, respectively.  Otherwise, the
2248 | result is rounded to the full precision of the extended double-precision
2249 | format.
2250 |     The input significand must be normalized or smaller.  If the input
2251 | significand is not normalized, `zExp' must be 0; in that case, the result
2252 | returned is a subnormal number, and it must not require rounding.  The
2253 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2254 | Floating-Point Arithmetic.
2255 *----------------------------------------------------------------------------*/
2256 
2257 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2258                                      int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2259                                      float_status *status)
2260 {
2261     int8_t roundingMode;
2262     flag roundNearestEven, increment, isTiny;
2263     int64_t roundIncrement, roundMask, roundBits;
2264 
2265     roundingMode = status->float_rounding_mode;
2266     roundNearestEven = ( roundingMode == float_round_nearest_even );
2267     if ( roundingPrecision == 80 ) goto precision80;
2268     if ( roundingPrecision == 64 ) {
2269         roundIncrement = LIT64( 0x0000000000000400 );
2270         roundMask = LIT64( 0x00000000000007FF );
2271     }
2272     else if ( roundingPrecision == 32 ) {
2273         roundIncrement = LIT64( 0x0000008000000000 );
2274         roundMask = LIT64( 0x000000FFFFFFFFFF );
2275     }
2276     else {
2277         goto precision80;
2278     }
2279     zSig0 |= ( zSig1 != 0 );
2280     switch (roundingMode) {
2281     case float_round_nearest_even:
2282     case float_round_ties_away:
2283         break;
2284     case float_round_to_zero:
2285         roundIncrement = 0;
2286         break;
2287     case float_round_up:
2288         roundIncrement = zSign ? 0 : roundMask;
2289         break;
2290     case float_round_down:
2291         roundIncrement = zSign ? roundMask : 0;
2292         break;
2293     default:
2294         abort();
2295     }
2296     roundBits = zSig0 & roundMask;
2297     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2298         if (    ( 0x7FFE < zExp )
2299              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2300            ) {
2301             goto overflow;
2302         }
2303         if ( zExp <= 0 ) {
2304             if (status->flush_to_zero) {
2305                 float_raise(float_flag_output_denormal, status);
2306                 return packFloatx80(zSign, 0, 0);
2307             }
2308             isTiny =
2309                    (status->float_detect_tininess
2310                     == float_tininess_before_rounding)
2311                 || ( zExp < 0 )
2312                 || ( zSig0 <= zSig0 + roundIncrement );
2313             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2314             zExp = 0;
2315             roundBits = zSig0 & roundMask;
2316             if (isTiny && roundBits) {
2317                 float_raise(float_flag_underflow, status);
2318             }
2319             if (roundBits) {
2320                 status->float_exception_flags |= float_flag_inexact;
2321             }
2322             zSig0 += roundIncrement;
2323             if ( (int64_t) zSig0 < 0 ) zExp = 1;
2324             roundIncrement = roundMask + 1;
2325             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2326                 roundMask |= roundIncrement;
2327             }
2328             zSig0 &= ~ roundMask;
2329             return packFloatx80( zSign, zExp, zSig0 );
2330         }
2331     }
2332     if (roundBits) {
2333         status->float_exception_flags |= float_flag_inexact;
2334     }
2335     zSig0 += roundIncrement;
2336     if ( zSig0 < roundIncrement ) {
2337         ++zExp;
2338         zSig0 = LIT64( 0x8000000000000000 );
2339     }
2340     roundIncrement = roundMask + 1;
2341     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2342         roundMask |= roundIncrement;
2343     }
2344     zSig0 &= ~ roundMask;
2345     if ( zSig0 == 0 ) zExp = 0;
2346     return packFloatx80( zSign, zExp, zSig0 );
2347  precision80:
2348     switch (roundingMode) {
2349     case float_round_nearest_even:
2350     case float_round_ties_away:
2351         increment = ((int64_t)zSig1 < 0);
2352         break;
2353     case float_round_to_zero:
2354         increment = 0;
2355         break;
2356     case float_round_up:
2357         increment = !zSign && zSig1;
2358         break;
2359     case float_round_down:
2360         increment = zSign && zSig1;
2361         break;
2362     default:
2363         abort();
2364     }
2365     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2366         if (    ( 0x7FFE < zExp )
2367              || (    ( zExp == 0x7FFE )
2368                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2369                   && increment
2370                 )
2371            ) {
2372             roundMask = 0;
2373  overflow:
2374             float_raise(float_flag_overflow | float_flag_inexact, status);
2375             if (    ( roundingMode == float_round_to_zero )
2376                  || ( zSign && ( roundingMode == float_round_up ) )
2377                  || ( ! zSign && ( roundingMode == float_round_down ) )
2378                ) {
2379                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2380             }
2381             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2382         }
2383         if ( zExp <= 0 ) {
2384             isTiny =
2385                    (status->float_detect_tininess
2386                     == float_tininess_before_rounding)
2387                 || ( zExp < 0 )
2388                 || ! increment
2389                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2390             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2391             zExp = 0;
2392             if (isTiny && zSig1) {
2393                 float_raise(float_flag_underflow, status);
2394             }
2395             if (zSig1) {
2396                 status->float_exception_flags |= float_flag_inexact;
2397             }
2398             switch (roundingMode) {
2399             case float_round_nearest_even:
2400             case float_round_ties_away:
2401                 increment = ((int64_t)zSig1 < 0);
2402                 break;
2403             case float_round_to_zero:
2404                 increment = 0;
2405                 break;
2406             case float_round_up:
2407                 increment = !zSign && zSig1;
2408                 break;
2409             case float_round_down:
2410                 increment = zSign && zSig1;
2411                 break;
2412             default:
2413                 abort();
2414             }
2415             if ( increment ) {
2416                 ++zSig0;
2417                 zSig0 &=
2418                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2419                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2420             }
2421             return packFloatx80( zSign, zExp, zSig0 );
2422         }
2423     }
2424     if (zSig1) {
2425         status->float_exception_flags |= float_flag_inexact;
2426     }
2427     if ( increment ) {
2428         ++zSig0;
2429         if ( zSig0 == 0 ) {
2430             ++zExp;
2431             zSig0 = LIT64( 0x8000000000000000 );
2432         }
2433         else {
2434             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2435         }
2436     }
2437     else {
2438         if ( zSig0 == 0 ) zExp = 0;
2439     }
2440     return packFloatx80( zSign, zExp, zSig0 );
2441 
2442 }
2443 
2444 /*----------------------------------------------------------------------------
2445 | Takes an abstract floating-point value having sign `zSign', exponent
2446 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2447 | and returns the proper extended double-precision floating-point value
2448 | corresponding to the abstract input.  This routine is just like
2449 | `roundAndPackFloatx80' except that the input significand does not have to be
2450 | normalized.
2451 *----------------------------------------------------------------------------*/
2452 
2453 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2454                                               flag zSign, int32_t zExp,
2455                                               uint64_t zSig0, uint64_t zSig1,
2456                                               float_status *status)
2457 {
2458     int8_t shiftCount;
2459 
2460     if ( zSig0 == 0 ) {
2461         zSig0 = zSig1;
2462         zSig1 = 0;
2463         zExp -= 64;
2464     }
2465     shiftCount = countLeadingZeros64( zSig0 );
2466     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2467     zExp -= shiftCount;
2468     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2469                                 zSig0, zSig1, status);
2470 
2471 }
2472 
2473 /*----------------------------------------------------------------------------
2474 | Returns the least-significant 64 fraction bits of the quadruple-precision
2475 | floating-point value `a'.
2476 *----------------------------------------------------------------------------*/
2477 
2478 static inline uint64_t extractFloat128Frac1( float128 a )
2479 {
2480 
2481     return a.low;
2482 
2483 }
2484 
2485 /*----------------------------------------------------------------------------
2486 | Returns the most-significant 48 fraction bits of the quadruple-precision
2487 | floating-point value `a'.
2488 *----------------------------------------------------------------------------*/
2489 
2490 static inline uint64_t extractFloat128Frac0( float128 a )
2491 {
2492 
2493     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2494 
2495 }
2496 
2497 /*----------------------------------------------------------------------------
2498 | Returns the exponent bits of the quadruple-precision floating-point value
2499 | `a'.
2500 *----------------------------------------------------------------------------*/
2501 
2502 static inline int32_t extractFloat128Exp( float128 a )
2503 {
2504 
2505     return ( a.high>>48 ) & 0x7FFF;
2506 
2507 }
2508 
2509 /*----------------------------------------------------------------------------
2510 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2511 *----------------------------------------------------------------------------*/
2512 
2513 static inline flag extractFloat128Sign( float128 a )
2514 {
2515 
2516     return a.high>>63;
2517 
2518 }
2519 
2520 /*----------------------------------------------------------------------------
2521 | Normalizes the subnormal quadruple-precision floating-point value
2522 | represented by the denormalized significand formed by the concatenation of
2523 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
2524 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
2525 | significand are stored at the location pointed to by `zSig0Ptr', and the
2526 | least significant 64 bits of the normalized significand are stored at the
2527 | location pointed to by `zSig1Ptr'.
2528 *----------------------------------------------------------------------------*/
2529 
static void
normalizeFloat128Subnormal(uint64_t aSig0,
                           uint64_t aSig1,
                           int32_t *zExpPtr,
                           uint64_t *zSig0Ptr,
                           uint64_t *zSig1Ptr)
{
    int8_t shift;

    /*
     * Common case: the high word is non-zero, so a plain 128-bit left
     * shift brings its leading one to bit 48 (15 zero bits remain above it).
     */
    if (aSig0 != 0) {
        shift = countLeadingZeros64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shift;
        return;
    }

    /* Only the low word holds significand bits. */
    shift = countLeadingZeros64(aSig1) - 15;
    if (shift >= 0) {
        /* Everything fits in the high output word. */
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    } else {
        /*
         * The leading one is above bit 48 of the low word: split the
         * value across both output words.
         */
        *zSig0Ptr = aSig1 >> (-shift);
        *zSig1Ptr = aSig1 << (shift & 63);
    }
    *zExpPtr = -shift - 63;
}
2560 
/*----------------------------------------------------------------------------
| Packs the sign `zSign', the exponent `zExp', and the significand formed
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
| floating-point value, returning the result.  After being shifted into the
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
| added together to form the most significant 64 bits of the result.  This
| means that any integer portion of `zSig0' will be added into the exponent.
| Since a properly normalized significand will have an integer portion equal
| to 1, the `zExp' input should be 1 less than the desired result exponent
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
| significand.
*----------------------------------------------------------------------------*/
2573 
2574 static inline float128
2575  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
2576 {
2577     float128 z;
2578 
2579     z.low = zSig1;
2580     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
2581     return z;
2582 
2583 }
2584 
2585 /*----------------------------------------------------------------------------
2586 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2587 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2588 | and `zSig2', and returns the proper quadruple-precision floating-point value
2589 | corresponding to the abstract input.  Ordinarily, the abstract value is
2590 | simply rounded and packed into the quadruple-precision format, with the
2591 | inexact exception raised if the abstract input cannot be represented
2592 | exactly.  However, if the abstract value is too large, the overflow and
2593 | inexact exceptions are raised and an infinity or maximal finite value is
2594 | returned.  If the abstract value is too small, the input value is rounded to
2595 | a subnormal number, and the underflow and inexact exceptions are raised if
2596 | the abstract input cannot be represented exactly as a subnormal quadruple-
2597 | precision floating-point number.
2598 |     The input significand must be normalized or smaller.  If the input
2599 | significand is not normalized, `zExp' must be 0; in that case, the result
2600 | returned is a subnormal number, and it must not require rounding.  In the
2601 | usual case that the input significand is normalized, `zExp' must be 1 less
2602 | than the ``true'' floating-point exponent.  The handling of underflow and
2603 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2604 *----------------------------------------------------------------------------*/
2605 
2606 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
2607                                      uint64_t zSig0, uint64_t zSig1,
2608                                      uint64_t zSig2, float_status *status)
2609 {
2610     int8_t roundingMode;
2611     flag roundNearestEven, increment, isTiny;
2612 
2613     roundingMode = status->float_rounding_mode;
2614     roundNearestEven = ( roundingMode == float_round_nearest_even );
2615     switch (roundingMode) {
2616     case float_round_nearest_even:
2617     case float_round_ties_away:
2618         increment = ((int64_t)zSig2 < 0);
2619         break;
2620     case float_round_to_zero:
2621         increment = 0;
2622         break;
2623     case float_round_up:
2624         increment = !zSign && zSig2;
2625         break;
2626     case float_round_down:
2627         increment = zSign && zSig2;
2628         break;
2629     case float_round_to_odd:
2630         increment = !(zSig1 & 0x1) && zSig2;
2631         break;
2632     default:
2633         abort();
2634     }
2635     if ( 0x7FFD <= (uint32_t) zExp ) {
2636         if (    ( 0x7FFD < zExp )
2637              || (    ( zExp == 0x7FFD )
2638                   && eq128(
2639                          LIT64( 0x0001FFFFFFFFFFFF ),
2640                          LIT64( 0xFFFFFFFFFFFFFFFF ),
2641                          zSig0,
2642                          zSig1
2643                      )
2644                   && increment
2645                 )
2646            ) {
2647             float_raise(float_flag_overflow | float_flag_inexact, status);
2648             if (    ( roundingMode == float_round_to_zero )
2649                  || ( zSign && ( roundingMode == float_round_up ) )
2650                  || ( ! zSign && ( roundingMode == float_round_down ) )
2651                  || (roundingMode == float_round_to_odd)
2652                ) {
2653                 return
2654                     packFloat128(
2655                         zSign,
2656                         0x7FFE,
2657                         LIT64( 0x0000FFFFFFFFFFFF ),
2658                         LIT64( 0xFFFFFFFFFFFFFFFF )
2659                     );
2660             }
2661             return packFloat128( zSign, 0x7FFF, 0, 0 );
2662         }
2663         if ( zExp < 0 ) {
2664             if (status->flush_to_zero) {
2665                 float_raise(float_flag_output_denormal, status);
2666                 return packFloat128(zSign, 0, 0, 0);
2667             }
2668             isTiny =
2669                    (status->float_detect_tininess
2670                     == float_tininess_before_rounding)
2671                 || ( zExp < -1 )
2672                 || ! increment
2673                 || lt128(
2674                        zSig0,
2675                        zSig1,
2676                        LIT64( 0x0001FFFFFFFFFFFF ),
2677                        LIT64( 0xFFFFFFFFFFFFFFFF )
2678                    );
2679             shift128ExtraRightJamming(
2680                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
2681             zExp = 0;
2682             if (isTiny && zSig2) {
2683                 float_raise(float_flag_underflow, status);
2684             }
2685             switch (roundingMode) {
2686             case float_round_nearest_even:
2687             case float_round_ties_away:
2688                 increment = ((int64_t)zSig2 < 0);
2689                 break;
2690             case float_round_to_zero:
2691                 increment = 0;
2692                 break;
2693             case float_round_up:
2694                 increment = !zSign && zSig2;
2695                 break;
2696             case float_round_down:
2697                 increment = zSign && zSig2;
2698                 break;
2699             case float_round_to_odd:
2700                 increment = !(zSig1 & 0x1) && zSig2;
2701                 break;
2702             default:
2703                 abort();
2704             }
2705         }
2706     }
2707     if (zSig2) {
2708         status->float_exception_flags |= float_flag_inexact;
2709     }
2710     if ( increment ) {
2711         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
2712         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
2713     }
2714     else {
2715         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
2716     }
2717     return packFloat128( zSign, zExp, zSig0, zSig1 );
2718 
2719 }
2720 
2721 /*----------------------------------------------------------------------------
2722 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2723 | and significand formed by the concatenation of `zSig0' and `zSig1', and
2724 | returns the proper quadruple-precision floating-point value corresponding
2725 | to the abstract input.  This routine is just like `roundAndPackFloat128'
2726 | except that the input significand has fewer bits and does not have to be
2727 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
2728 | point exponent.
2729 *----------------------------------------------------------------------------*/
2730 
2731 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
2732                                               uint64_t zSig0, uint64_t zSig1,
2733                                               float_status *status)
2734 {
2735     int8_t shiftCount;
2736     uint64_t zSig2;
2737 
2738     if ( zSig0 == 0 ) {
2739         zSig0 = zSig1;
2740         zSig1 = 0;
2741         zExp -= 64;
2742     }
2743     shiftCount = countLeadingZeros64( zSig0 ) - 15;
2744     if ( 0 <= shiftCount ) {
2745         zSig2 = 0;
2746         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2747     }
2748     else {
2749         shift128ExtraRightJamming(
2750             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
2751     }
2752     zExp -= shiftCount;
2753     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
2754 
2755 }
2756 
2757 
2758 /*----------------------------------------------------------------------------
2759 | Returns the result of converting the 32-bit two's complement integer `a'
2760 | to the extended double-precision floating-point format.  The conversion
2761 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2762 | Arithmetic.
2763 *----------------------------------------------------------------------------*/
2764 
2765 floatx80 int32_to_floatx80(int32_t a, float_status *status)
2766 {
2767     flag zSign;
2768     uint32_t absA;
2769     int8_t shiftCount;
2770     uint64_t zSig;
2771 
2772     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
2773     zSign = ( a < 0 );
2774     absA = zSign ? - a : a;
2775     shiftCount = countLeadingZeros32( absA ) + 32;
2776     zSig = absA;
2777     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
2778 
2779 }
2780 
2781 /*----------------------------------------------------------------------------
2782 | Returns the result of converting the 32-bit two's complement integer `a' to
2783 | the quadruple-precision floating-point format.  The conversion is performed
2784 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2785 *----------------------------------------------------------------------------*/
2786 
2787 float128 int32_to_float128(int32_t a, float_status *status)
2788 {
2789     flag zSign;
2790     uint32_t absA;
2791     int8_t shiftCount;
2792     uint64_t zSig0;
2793 
2794     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
2795     zSign = ( a < 0 );
2796     absA = zSign ? - a : a;
2797     shiftCount = countLeadingZeros32( absA ) + 17;
2798     zSig0 = absA;
2799     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
2800 
2801 }
2802 
2803 /*----------------------------------------------------------------------------
2804 | Returns the result of converting the 64-bit two's complement integer `a'
2805 | to the extended double-precision floating-point format.  The conversion
2806 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2807 | Arithmetic.
2808 *----------------------------------------------------------------------------*/
2809 
2810 floatx80 int64_to_floatx80(int64_t a, float_status *status)
2811 {
2812     flag zSign;
2813     uint64_t absA;
2814     int8_t shiftCount;
2815 
2816     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
2817     zSign = ( a < 0 );
2818     absA = zSign ? - a : a;
2819     shiftCount = countLeadingZeros64( absA );
2820     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
2821 
2822 }
2823 
2824 /*----------------------------------------------------------------------------
2825 | Returns the result of converting the 64-bit two's complement integer `a' to
2826 | the quadruple-precision floating-point format.  The conversion is performed
2827 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2828 *----------------------------------------------------------------------------*/
2829 
2830 float128 int64_to_float128(int64_t a, float_status *status)
2831 {
2832     flag zSign;
2833     uint64_t absA;
2834     int8_t shiftCount;
2835     int32_t zExp;
2836     uint64_t zSig0, zSig1;
2837 
2838     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
2839     zSign = ( a < 0 );
2840     absA = zSign ? - a : a;
2841     shiftCount = countLeadingZeros64( absA ) + 49;
2842     zExp = 0x406E - shiftCount;
2843     if ( 64 <= shiftCount ) {
2844         zSig1 = 0;
2845         zSig0 = absA;
2846         shiftCount -= 64;
2847     }
2848     else {
2849         zSig1 = absA;
2850         zSig0 = 0;
2851     }
2852     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2853     return packFloat128( zSign, zExp, zSig0, zSig1 );
2854 
2855 }
2856 
2857 /*----------------------------------------------------------------------------
2858 | Returns the result of converting the 64-bit unsigned integer `a'
2859 | to the quadruple-precision floating-point format.  The conversion is performed
2860 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2861 *----------------------------------------------------------------------------*/
2862 
2863 float128 uint64_to_float128(uint64_t a, float_status *status)
2864 {
2865     if (a == 0) {
2866         return float128_zero;
2867     }
2868     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
2869 }
2870 
2871 
2872 
2873 
2874 /*----------------------------------------------------------------------------
2875 | Returns the result of converting the single-precision floating-point value
2876 | `a' to the double-precision floating-point format.  The conversion is
2877 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2878 | Arithmetic.
2879 *----------------------------------------------------------------------------*/
2880 
2881 float64 float32_to_float64(float32 a, float_status *status)
2882 {
2883     flag aSign;
2884     int aExp;
2885     uint32_t aSig;
2886     a = float32_squash_input_denormal(a, status);
2887 
2888     aSig = extractFloat32Frac( a );
2889     aExp = extractFloat32Exp( a );
2890     aSign = extractFloat32Sign( a );
2891     if ( aExp == 0xFF ) {
2892         if (aSig) {
2893             return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
2894         }
2895         return packFloat64( aSign, 0x7FF, 0 );
2896     }
2897     if ( aExp == 0 ) {
2898         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
2899         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2900         --aExp;
2901     }
2902     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
2903 
2904 }
2905 
2906 /*----------------------------------------------------------------------------
2907 | Returns the result of converting the single-precision floating-point value
2908 | `a' to the extended double-precision floating-point format.  The conversion
2909 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2910 | Arithmetic.
2911 *----------------------------------------------------------------------------*/
2912 
2913 floatx80 float32_to_floatx80(float32 a, float_status *status)
2914 {
2915     flag aSign;
2916     int aExp;
2917     uint32_t aSig;
2918 
2919     a = float32_squash_input_denormal(a, status);
2920     aSig = extractFloat32Frac( a );
2921     aExp = extractFloat32Exp( a );
2922     aSign = extractFloat32Sign( a );
2923     if ( aExp == 0xFF ) {
2924         if (aSig) {
2925             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
2926         }
2927         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2928     }
2929     if ( aExp == 0 ) {
2930         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2931         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2932     }
2933     aSig |= 0x00800000;
2934     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
2935 
2936 }
2937 
/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the quadruple-precision floating-point format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/
2944 
2945 float128 float32_to_float128(float32 a, float_status *status)
2946 {
2947     flag aSign;
2948     int aExp;
2949     uint32_t aSig;
2950 
2951     a = float32_squash_input_denormal(a, status);
2952     aSig = extractFloat32Frac( a );
2953     aExp = extractFloat32Exp( a );
2954     aSign = extractFloat32Sign( a );
2955     if ( aExp == 0xFF ) {
2956         if (aSig) {
2957             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
2958         }
2959         return packFloat128( aSign, 0x7FFF, 0, 0 );
2960     }
2961     if ( aExp == 0 ) {
2962         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2963         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2964         --aExp;
2965     }
2966     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
2967 
2968 }
2969 
2970 /*----------------------------------------------------------------------------
2971 | Returns the remainder of the single-precision floating-point value `a'
2972 | with respect to the corresponding value `b'.  The operation is performed
2973 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2974 *----------------------------------------------------------------------------*/
2975 
float32 float32_rem(float32 a, float32 b, float_status *status)
{
    /*
     * Computes the remainder of `a' with respect to `b' on the raw
     * significands.  The result is packed with b's exponent and with
     * sign aSign ^ zSign.
     */
    flag aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    /* `a' is infinity or NaN: propagate a NaN operand, otherwise invalid. */
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    /* `b' is NaN (propagate) or infinity (result is `a' unchanged). */
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        return a;
    }
    /* `b' is zero (invalid) or subnormal (normalize it). */
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    /* `a' is zero (returned as is) or subnormal (normalize it). */
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the implicit leading one explicit in both significands. */
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent gap: the quotient fits in 32 bits. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* `a' is less than half of `b' in magnitude: `a' is the result. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /*
         * Large exponent gap: reduce 62 exponent steps per iteration
         * using an estimated 64-bit quotient.
         */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            /* Lower the estimate by 2 (clamped at 0) before using it. */
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /*
     * Subtract bSig until the remainder becomes negative, remembering the
     * last non-negative value in alternateASig.
     */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    /*
     * Choose between the negative remainder (aSig) and the last
     * non-negative one (alternateASig): take the one smaller in magnitude;
     * on a tie keep alternateASig only when q is odd.
     */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* Convert the signed remainder to sign + magnitude. */
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}
3071 
3072 
3073 /*----------------------------------------------------------------------------
3074 | Returns the square root of the single-precision floating-point value `a'.
3075 | The operation is performed according to the IEC/IEEE Standard for Binary
3076 | Floating-Point Arithmetic.
3077 *----------------------------------------------------------------------------*/
3078 
3079 float32 float32_sqrt(float32 a, float_status *status)
3080 {
3081     flag aSign;
3082     int aExp, zExp;
3083     uint32_t aSig, zSig;
3084     uint64_t rem, term;
3085     a = float32_squash_input_denormal(a, status);
3086 
3087     aSig = extractFloat32Frac( a );
3088     aExp = extractFloat32Exp( a );
3089     aSign = extractFloat32Sign( a );
3090     if ( aExp == 0xFF ) {
3091         if (aSig) {
3092             return propagateFloat32NaN(a, float32_zero, status);
3093         }
3094         if ( ! aSign ) return a;
3095         float_raise(float_flag_invalid, status);
3096         return float32_default_nan(status);
3097     }
3098     if ( aSign ) {
3099         if ( ( aExp | aSig ) == 0 ) return a;
3100         float_raise(float_flag_invalid, status);
3101         return float32_default_nan(status);
3102     }
3103     if ( aExp == 0 ) {
3104         if ( aSig == 0 ) return float32_zero;
3105         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3106     }
3107     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
3108     aSig = ( aSig | 0x00800000 )<<8;
3109     zSig = estimateSqrt32( aExp, aSig ) + 2;
3110     if ( ( zSig & 0x7F ) <= 5 ) {
3111         if ( zSig < 2 ) {
3112             zSig = 0x7FFFFFFF;
3113             goto roundAndPack;
3114         }
3115         aSig >>= aExp & 1;
3116         term = ( (uint64_t) zSig ) * zSig;
3117         rem = ( ( (uint64_t) aSig )<<32 ) - term;
3118         while ( (int64_t) rem < 0 ) {
3119             --zSig;
3120             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
3121         }
3122         zSig |= ( rem != 0 );
3123     }
3124     shift32RightJamming( zSig, 1, &zSig );
3125  roundAndPack:
3126     return roundAndPackFloat32(0, zExp, zSig, status);
3127 
3128 }
3129 
3130 /*----------------------------------------------------------------------------
3131 | Returns the binary exponential of the single-precision floating-point value
3132 | `a'. The operation is performed according to the IEC/IEEE Standard for
3133 | Binary Floating-Point Arithmetic.
3134 |
3135 | Uses the following identities:
3136 |
3137 | 1. -------------------------------------------------------------------------
3138 |      x    x*ln(2)
3139 |     2  = e
3140 |
3141 | 2. -------------------------------------------------------------------------
3142 |                      2     3     4     5           n
3143 |      x        x     x     x     x     x           x
3144 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3145 |               1!    2!    3!    4!    5!          n!
3146 *----------------------------------------------------------------------------*/
3147 
3148 static const float64 float32_exp2_coefficients[15] =
3149 {
3150     const_float64( 0x3ff0000000000000ll ), /*  1 */
3151     const_float64( 0x3fe0000000000000ll ), /*  2 */
3152     const_float64( 0x3fc5555555555555ll ), /*  3 */
3153     const_float64( 0x3fa5555555555555ll ), /*  4 */
3154     const_float64( 0x3f81111111111111ll ), /*  5 */
3155     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
3156     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
3157     const_float64( 0x3efa01a01a01a01all ), /*  8 */
3158     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
3159     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3160     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3161     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3162     const_float64( 0x3de6124613a86d09ll ), /* 13 */
3163     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3164     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3165 };
3166 
3167 float32 float32_exp2(float32 a, float_status *status)
3168 {
3169     flag aSign;
3170     int aExp;
3171     uint32_t aSig;
3172     float64 r, x, xn;
3173     int i;
3174     a = float32_squash_input_denormal(a, status);
3175 
3176     aSig = extractFloat32Frac( a );
3177     aExp = extractFloat32Exp( a );
3178     aSign = extractFloat32Sign( a );
3179 
3180     if ( aExp == 0xFF) {
3181         if (aSig) {
3182             return propagateFloat32NaN(a, float32_zero, status);
3183         }
3184         return (aSign) ? float32_zero : a;
3185     }
3186     if (aExp == 0) {
3187         if (aSig == 0) return float32_one;
3188     }
3189 
3190     float_raise(float_flag_inexact, status);
3191 
3192     /* ******************************* */
3193     /* using float64 for approximation */
3194     /* ******************************* */
3195     x = float32_to_float64(a, status);
3196     x = float64_mul(x, float64_ln2, status);
3197 
3198     xn = x;
3199     r = float64_one;
3200     for (i = 0 ; i < 15 ; i++) {
3201         float64 f;
3202 
3203         f = float64_mul(xn, float32_exp2_coefficients[i], status);
3204         r = float64_add(r, f, status);
3205 
3206         xn = float64_mul(xn, x, status);
3207     }
3208 
3209     return float64_to_float32(r, status);
3210 }
3211 
3212 /*----------------------------------------------------------------------------
3213 | Returns the binary log of the single-precision floating-point value `a'.
3214 | The operation is performed according to the IEC/IEEE Standard for Binary
3215 | Floating-Point Arithmetic.
3216 *----------------------------------------------------------------------------*/
3217 float32 float32_log2(float32 a, float_status *status)
3218 {
3219     flag aSign, zSign;
3220     int aExp;
3221     uint32_t aSig, zSig, i;
3222 
3223     a = float32_squash_input_denormal(a, status);
3224     aSig = extractFloat32Frac( a );
3225     aExp = extractFloat32Exp( a );
3226     aSign = extractFloat32Sign( a );
3227 
3228     if ( aExp == 0 ) {
3229         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3230         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3231     }
3232     if ( aSign ) {
3233         float_raise(float_flag_invalid, status);
3234         return float32_default_nan(status);
3235     }
3236     if ( aExp == 0xFF ) {
3237         if (aSig) {
3238             return propagateFloat32NaN(a, float32_zero, status);
3239         }
3240         return a;
3241     }
3242 
3243     aExp -= 0x7F;
3244     aSig |= 0x00800000;
3245     zSign = aExp < 0;
3246     zSig = aExp << 23;
3247 
3248     for (i = 1 << 22; i > 0; i >>= 1) {
3249         aSig = ( (uint64_t)aSig * aSig ) >> 23;
3250         if ( aSig & 0x01000000 ) {
3251             aSig >>= 1;
3252             zSig |= i;
3253         }
3254     }
3255 
3256     if ( zSign )
3257         zSig = -zSig;
3258 
3259     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3260 }
3261 
3262 /*----------------------------------------------------------------------------
3263 | Returns 1 if the single-precision floating-point value `a' is equal to
3264 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3265 | raised if either operand is a NaN.  Otherwise, the comparison is performed
3266 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3267 *----------------------------------------------------------------------------*/
3268 
3269 int float32_eq(float32 a, float32 b, float_status *status)
3270 {
3271     uint32_t av, bv;
3272     a = float32_squash_input_denormal(a, status);
3273     b = float32_squash_input_denormal(b, status);
3274 
3275     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3276          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3277        ) {
3278         float_raise(float_flag_invalid, status);
3279         return 0;
3280     }
3281     av = float32_val(a);
3282     bv = float32_val(b);
3283     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3284 }
3285 
3286 /*----------------------------------------------------------------------------
3287 | Returns 1 if the single-precision floating-point value `a' is less than
3288 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
3289 | exception is raised if either operand is a NaN.  The comparison is performed
3290 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3291 *----------------------------------------------------------------------------*/
3292 
3293 int float32_le(float32 a, float32 b, float_status *status)
3294 {
3295     flag aSign, bSign;
3296     uint32_t av, bv;
3297     a = float32_squash_input_denormal(a, status);
3298     b = float32_squash_input_denormal(b, status);
3299 
3300     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3301          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3302        ) {
3303         float_raise(float_flag_invalid, status);
3304         return 0;
3305     }
3306     aSign = extractFloat32Sign( a );
3307     bSign = extractFloat32Sign( b );
3308     av = float32_val(a);
3309     bv = float32_val(b);
3310     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3311     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3312 
3313 }
3314 
3315 /*----------------------------------------------------------------------------
3316 | Returns 1 if the single-precision floating-point value `a' is less than
3317 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3318 | raised if either operand is a NaN.  The comparison is performed according
3319 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3320 *----------------------------------------------------------------------------*/
3321 
3322 int float32_lt(float32 a, float32 b, float_status *status)
3323 {
3324     flag aSign, bSign;
3325     uint32_t av, bv;
3326     a = float32_squash_input_denormal(a, status);
3327     b = float32_squash_input_denormal(b, status);
3328 
3329     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3330          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3331        ) {
3332         float_raise(float_flag_invalid, status);
3333         return 0;
3334     }
3335     aSign = extractFloat32Sign( a );
3336     bSign = extractFloat32Sign( b );
3337     av = float32_val(a);
3338     bv = float32_val(b);
3339     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3340     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3341 
3342 }
3343 
3344 /*----------------------------------------------------------------------------
3345 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3346 | be compared, and 0 otherwise.  The invalid exception is raised if either
3347 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
3348 | Standard for Binary Floating-Point Arithmetic.
3349 *----------------------------------------------------------------------------*/
3350 
3351 int float32_unordered(float32 a, float32 b, float_status *status)
3352 {
3353     a = float32_squash_input_denormal(a, status);
3354     b = float32_squash_input_denormal(b, status);
3355 
3356     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3357          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3358        ) {
3359         float_raise(float_flag_invalid, status);
3360         return 1;
3361     }
3362     return 0;
3363 }
3364 
3365 /*----------------------------------------------------------------------------
3366 | Returns 1 if the single-precision floating-point value `a' is equal to
3367 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3368 | exception.  The comparison is performed according to the IEC/IEEE Standard
3369 | for Binary Floating-Point Arithmetic.
3370 *----------------------------------------------------------------------------*/
3371 
3372 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3373 {
3374     a = float32_squash_input_denormal(a, status);
3375     b = float32_squash_input_denormal(b, status);
3376 
3377     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3378          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3379        ) {
3380         if (float32_is_signaling_nan(a, status)
3381          || float32_is_signaling_nan(b, status)) {
3382             float_raise(float_flag_invalid, status);
3383         }
3384         return 0;
3385     }
3386     return ( float32_val(a) == float32_val(b) ) ||
3387             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3388 }
3389 
3390 /*----------------------------------------------------------------------------
3391 | Returns 1 if the single-precision floating-point value `a' is less than or
3392 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3393 | cause an exception.  Otherwise, the comparison is performed according to the
3394 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3395 *----------------------------------------------------------------------------*/
3396 
3397 int float32_le_quiet(float32 a, float32 b, float_status *status)
3398 {
3399     flag aSign, bSign;
3400     uint32_t av, bv;
3401     a = float32_squash_input_denormal(a, status);
3402     b = float32_squash_input_denormal(b, status);
3403 
3404     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3405          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3406        ) {
3407         if (float32_is_signaling_nan(a, status)
3408          || float32_is_signaling_nan(b, status)) {
3409             float_raise(float_flag_invalid, status);
3410         }
3411         return 0;
3412     }
3413     aSign = extractFloat32Sign( a );
3414     bSign = extractFloat32Sign( b );
3415     av = float32_val(a);
3416     bv = float32_val(b);
3417     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3418     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3419 
3420 }
3421 
3422 /*----------------------------------------------------------------------------
3423 | Returns 1 if the single-precision floating-point value `a' is less than
3424 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3425 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3426 | Standard for Binary Floating-Point Arithmetic.
3427 *----------------------------------------------------------------------------*/
3428 
3429 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3430 {
3431     flag aSign, bSign;
3432     uint32_t av, bv;
3433     a = float32_squash_input_denormal(a, status);
3434     b = float32_squash_input_denormal(b, status);
3435 
3436     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3437          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3438        ) {
3439         if (float32_is_signaling_nan(a, status)
3440          || float32_is_signaling_nan(b, status)) {
3441             float_raise(float_flag_invalid, status);
3442         }
3443         return 0;
3444     }
3445     aSign = extractFloat32Sign( a );
3446     bSign = extractFloat32Sign( b );
3447     av = float32_val(a);
3448     bv = float32_val(b);
3449     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3450     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3451 
3452 }
3453 
3454 /*----------------------------------------------------------------------------
3455 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3456 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3457 | comparison is performed according to the IEC/IEEE Standard for Binary
3458 | Floating-Point Arithmetic.
3459 *----------------------------------------------------------------------------*/
3460 
3461 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3462 {
3463     a = float32_squash_input_denormal(a, status);
3464     b = float32_squash_input_denormal(b, status);
3465 
3466     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3467          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3468        ) {
3469         if (float32_is_signaling_nan(a, status)
3470          || float32_is_signaling_nan(b, status)) {
3471             float_raise(float_flag_invalid, status);
3472         }
3473         return 1;
3474     }
3475     return 0;
3476 }
3477 
3478 
3479 /*----------------------------------------------------------------------------
3480 | Returns the result of converting the double-precision floating-point value
3481 | `a' to the single-precision floating-point format.  The conversion is
3482 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3483 | Arithmetic.
3484 *----------------------------------------------------------------------------*/
3485 
3486 float32 float64_to_float32(float64 a, float_status *status)
3487 {
3488     flag aSign;
3489     int aExp;
3490     uint64_t aSig;
3491     uint32_t zSig;
3492     a = float64_squash_input_denormal(a, status);
3493 
3494     aSig = extractFloat64Frac( a );
3495     aExp = extractFloat64Exp( a );
3496     aSign = extractFloat64Sign( a );
3497     if ( aExp == 0x7FF ) {
3498         if (aSig) {
3499             return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3500         }
3501         return packFloat32( aSign, 0xFF, 0 );
3502     }
3503     shift64RightJamming( aSig, 22, &aSig );
3504     zSig = aSig;
3505     if ( aExp || zSig ) {
3506         zSig |= 0x40000000;
3507         aExp -= 0x381;
3508     }
3509     return roundAndPackFloat32(aSign, aExp, zSig, status);
3510 
3511 }
3512 
3513 
3514 /*----------------------------------------------------------------------------
3515 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3516 | half-precision floating-point value, returning the result.  After being
3517 | shifted into the proper positions, the three fields are simply added
3518 | together to form the result.  This means that any integer portion of `zSig'
3519 | will be added into the exponent.  Since a properly normalized significand
3520 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3521 | than the desired result exponent whenever `zSig' is a complete, normalized
3522 | significand.
3523 *----------------------------------------------------------------------------*/
3524 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3525 {
3526     return make_float16(
3527         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3528 }
3529 
3530 /*----------------------------------------------------------------------------
3531 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3532 | and significand `zSig', and returns the proper half-precision floating-
3533 | point value corresponding to the abstract input.  Ordinarily, the abstract
3534 | value is simply rounded and packed into the half-precision format, with
3535 | the inexact exception raised if the abstract input cannot be represented
3536 | exactly.  However, if the abstract value is too large, the overflow and
3537 | inexact exceptions are raised and an infinity or maximal finite value is
3538 | returned.  If the abstract value is too small, the input value is rounded to
3539 | a subnormal number, and the underflow and inexact exceptions are raised if
3540 | the abstract input cannot be represented exactly as a subnormal half-
3541 | precision floating-point number.
3542 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3543 | ARM-style "alternative representation", which omits the NaN and Inf
3544 | encodings in order to raise the maximum representable exponent by one.
3545 |     The input significand `zSig' has its binary point between bits 22
3546 | and 23, which is 13 bits to the left of the usual location.  This shifted
3547 | significand must be normalized or smaller.  If `zSig' is not normalized,
3548 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3549 | and it must not require rounding.  In the usual case that `zSig' is
3550 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3551 | Note the slightly odd position of the binary point in zSig compared with the
3552 | other roundAndPackFloat functions. This should probably be fixed if we
3553 | need to implement more float16 routines than just conversion.
3554 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3555 | Binary Floating-Point Arithmetic.
3556 *----------------------------------------------------------------------------*/
3557 
3558 static float16 roundAndPackFloat16(flag zSign, int zExp,
3559                                    uint32_t zSig, flag ieee,
3560                                    float_status *status)
3561 {
3562     int maxexp = ieee ? 29 : 30;
3563     uint32_t mask;
3564     uint32_t increment;
3565     bool rounding_bumps_exp;
3566     bool is_tiny = false;
3567 
3568     /* Calculate the mask of bits of the mantissa which are not
3569      * representable in half-precision and will be lost.
3570      */
3571     if (zExp < 1) {
3572         /* Will be denormal in halfprec */
3573         mask = 0x00ffffff;
3574         if (zExp >= -11) {
3575             mask >>= 11 + zExp;
3576         }
3577     } else {
3578         /* Normal number in halfprec */
3579         mask = 0x00001fff;
3580     }
3581 
3582     switch (status->float_rounding_mode) {
3583     case float_round_nearest_even:
3584         increment = (mask + 1) >> 1;
3585         if ((zSig & mask) == increment) {
3586             increment = zSig & (increment << 1);
3587         }
3588         break;
3589     case float_round_ties_away:
3590         increment = (mask + 1) >> 1;
3591         break;
3592     case float_round_up:
3593         increment = zSign ? 0 : mask;
3594         break;
3595     case float_round_down:
3596         increment = zSign ? mask : 0;
3597         break;
3598     default: /* round_to_zero */
3599         increment = 0;
3600         break;
3601     }
3602 
3603     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3604 
3605     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3606         if (ieee) {
3607             float_raise(float_flag_overflow | float_flag_inexact, status);
3608             return packFloat16(zSign, 0x1f, 0);
3609         } else {
3610             float_raise(float_flag_invalid, status);
3611             return packFloat16(zSign, 0x1f, 0x3ff);
3612         }
3613     }
3614 
3615     if (zExp < 0) {
3616         /* Note that flush-to-zero does not affect half-precision results */
3617         is_tiny =
3618             (status->float_detect_tininess == float_tininess_before_rounding)
3619             || (zExp < -1)
3620             || (!rounding_bumps_exp);
3621     }
3622     if (zSig & mask) {
3623         float_raise(float_flag_inexact, status);
3624         if (is_tiny) {
3625             float_raise(float_flag_underflow, status);
3626         }
3627     }
3628 
3629     zSig += increment;
3630     if (rounding_bumps_exp) {
3631         zSig >>= 1;
3632         zExp++;
3633     }
3634 
3635     if (zExp < -10) {
3636         return packFloat16(zSign, 0, 0);
3637     }
3638     if (zExp < 0) {
3639         zSig >>= -zExp;
3640         zExp = 0;
3641     }
3642     return packFloat16(zSign, zExp, zSig >> 13);
3643 }
3644 
3645 /*----------------------------------------------------------------------------
3646 | If `a' is denormal and we are in flush-to-zero mode then set the
3647 | input-denormal exception and return zero. Otherwise just return the value.
3648 *----------------------------------------------------------------------------*/
3649 float16 float16_squash_input_denormal(float16 a, float_status *status)
3650 {
3651     if (status->flush_inputs_to_zero) {
3652         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3653             float_raise(float_flag_input_denormal, status);
3654             return make_float16(float16_val(a) & 0x8000);
3655         }
3656     }
3657     return a;
3658 }
3659 
static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
                                      uint32_t *zSigPtr)
{
    /* Number of positions needed to bring the leading one up to bit 10. */
    int8_t shift = countLeadingZeros32(aSig) - 21;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
3667 
3668 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3669    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3670 
3671 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3672 {
3673     flag aSign;
3674     int aExp;
3675     uint32_t aSig;
3676 
3677     aSign = extractFloat16Sign(a);
3678     aExp = extractFloat16Exp(a);
3679     aSig = extractFloat16Frac(a);
3680 
3681     if (aExp == 0x1f && ieee) {
3682         if (aSig) {
3683             return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3684         }
3685         return packFloat32(aSign, 0xff, 0);
3686     }
3687     if (aExp == 0) {
3688         if (aSig == 0) {
3689             return packFloat32(aSign, 0, 0);
3690         }
3691 
3692         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3693         aExp--;
3694     }
3695     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3696 }
3697 
3698 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3699 {
3700     flag aSign;
3701     int aExp;
3702     uint32_t aSig;
3703 
3704     a = float32_squash_input_denormal(a, status);
3705 
3706     aSig = extractFloat32Frac( a );
3707     aExp = extractFloat32Exp( a );
3708     aSign = extractFloat32Sign( a );
3709     if ( aExp == 0xFF ) {
3710         if (aSig) {
3711             /* Input is a NaN */
3712             if (!ieee) {
3713                 float_raise(float_flag_invalid, status);
3714                 return packFloat16(aSign, 0, 0);
3715             }
3716             return commonNaNToFloat16(
3717                 float32ToCommonNaN(a, status), status);
3718         }
3719         /* Infinity */
3720         if (!ieee) {
3721             float_raise(float_flag_invalid, status);
3722             return packFloat16(aSign, 0x1f, 0x3ff);
3723         }
3724         return packFloat16(aSign, 0x1f, 0);
3725     }
3726     if (aExp == 0 && aSig == 0) {
3727         return packFloat16(aSign, 0, 0);
3728     }
3729     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3730      * even if the input is denormal; however this is harmless because
3731      * the largest possible single-precision denormal is still smaller
3732      * than the smallest representable half-precision denormal, and so we
3733      * will end up ignoring aSig and returning via the "always return zero"
3734      * codepath.
3735      */
3736     aSig |= 0x00800000;
3737     aExp -= 0x71;
3738 
3739     return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3740 }
3741 
3742 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3743 {
3744     flag aSign;
3745     int aExp;
3746     uint32_t aSig;
3747 
3748     aSign = extractFloat16Sign(a);
3749     aExp = extractFloat16Exp(a);
3750     aSig = extractFloat16Frac(a);
3751 
3752     if (aExp == 0x1f && ieee) {
3753         if (aSig) {
3754             return commonNaNToFloat64(
3755                 float16ToCommonNaN(a, status), status);
3756         }
3757         return packFloat64(aSign, 0x7ff, 0);
3758     }
3759     if (aExp == 0) {
3760         if (aSig == 0) {
3761             return packFloat64(aSign, 0, 0);
3762         }
3763 
3764         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3765         aExp--;
3766     }
3767     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3768 }
3769 
3770 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3771 {
3772     flag aSign;
3773     int aExp;
3774     uint64_t aSig;
3775     uint32_t zSig;
3776 
3777     a = float64_squash_input_denormal(a, status);
3778 
3779     aSig = extractFloat64Frac(a);
3780     aExp = extractFloat64Exp(a);
3781     aSign = extractFloat64Sign(a);
3782     if (aExp == 0x7FF) {
3783         if (aSig) {
3784             /* Input is a NaN */
3785             if (!ieee) {
3786                 float_raise(float_flag_invalid, status);
3787                 return packFloat16(aSign, 0, 0);
3788             }
3789             return commonNaNToFloat16(
3790                 float64ToCommonNaN(a, status), status);
3791         }
3792         /* Infinity */
3793         if (!ieee) {
3794             float_raise(float_flag_invalid, status);
3795             return packFloat16(aSign, 0x1f, 0x3ff);
3796         }
3797         return packFloat16(aSign, 0x1f, 0);
3798     }
3799     shift64RightJamming(aSig, 29, &aSig);
3800     zSig = aSig;
3801     if (aExp == 0 && zSig == 0) {
3802         return packFloat16(aSign, 0, 0);
3803     }
3804     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3805      * even if the input is denormal; however this is harmless because
3806      * the largest possible single-precision denormal is still smaller
3807      * than the smallest representable half-precision denormal, and so we
3808      * will end up ignoring aSig and returning via the "always return zero"
3809      * codepath.
3810      */
3811     zSig |= 0x00800000;
3812     aExp -= 0x3F1;
3813 
3814     return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
3815 }
3816 
3817 /*----------------------------------------------------------------------------
3818 | Returns the result of converting the double-precision floating-point value
3819 | `a' to the extended double-precision floating-point format.  The conversion
3820 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3821 | Arithmetic.
3822 *----------------------------------------------------------------------------*/
3823 
3824 floatx80 float64_to_floatx80(float64 a, float_status *status)
3825 {
3826     flag aSign;
3827     int aExp;
3828     uint64_t aSig;
3829 
3830     a = float64_squash_input_denormal(a, status);
3831     aSig = extractFloat64Frac( a );
3832     aExp = extractFloat64Exp( a );
3833     aSign = extractFloat64Sign( a );
3834     if ( aExp == 0x7FF ) {
3835         if (aSig) {
3836             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3837         }
3838         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3839     }
3840     if ( aExp == 0 ) {
3841         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3842         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3843     }
3844     return
3845         packFloatx80(
3846             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3847 
3848 }
3849 
3850 /*----------------------------------------------------------------------------
3851 | Returns the result of converting the double-precision floating-point value
3852 | `a' to the quadruple-precision floating-point format.  The conversion is
3853 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3854 | Arithmetic.
3855 *----------------------------------------------------------------------------*/
3856 
3857 float128 float64_to_float128(float64 a, float_status *status)
3858 {
3859     flag aSign;
3860     int aExp;
3861     uint64_t aSig, zSig0, zSig1;
3862 
3863     a = float64_squash_input_denormal(a, status);
3864     aSig = extractFloat64Frac( a );
3865     aExp = extractFloat64Exp( a );
3866     aSign = extractFloat64Sign( a );
3867     if ( aExp == 0x7FF ) {
3868         if (aSig) {
3869             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3870         }
3871         return packFloat128( aSign, 0x7FFF, 0, 0 );
3872     }
3873     if ( aExp == 0 ) {
3874         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3875         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3876         --aExp;
3877     }
3878     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3879     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3880 
3881 }
3882 
3883 
3884 /*----------------------------------------------------------------------------
3885 | Returns the remainder of the double-precision floating-point value `a'
3886 | with respect to the corresponding value `b'.  The operation is performed
3887 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3888 *----------------------------------------------------------------------------*/
3889 
3890 float64 float64_rem(float64 a, float64 b, float_status *status)
3891 {
3892     flag aSign, zSign;
3893     int aExp, bExp, expDiff;
3894     uint64_t aSig, bSig;
3895     uint64_t q, alternateASig;
3896     int64_t sigMean;
3897 
3898     a = float64_squash_input_denormal(a, status);
3899     b = float64_squash_input_denormal(b, status);
3900     aSig = extractFloat64Frac( a );
3901     aExp = extractFloat64Exp( a );
3902     aSign = extractFloat64Sign( a );
3903     bSig = extractFloat64Frac( b );
3904     bExp = extractFloat64Exp( b );
3905     if ( aExp == 0x7FF ) {
3906         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3907             return propagateFloat64NaN(a, b, status);
3908         }
3909         float_raise(float_flag_invalid, status);
3910         return float64_default_nan(status);
3911     }
3912     if ( bExp == 0x7FF ) {
3913         if (bSig) {
3914             return propagateFloat64NaN(a, b, status);
3915         }
3916         return a;
3917     }
3918     if ( bExp == 0 ) {
3919         if ( bSig == 0 ) {
3920             float_raise(float_flag_invalid, status);
3921             return float64_default_nan(status);
3922         }
3923         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3924     }
3925     if ( aExp == 0 ) {
3926         if ( aSig == 0 ) return a;
3927         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3928     }
3929     expDiff = aExp - bExp;
3930     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3931     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3932     if ( expDiff < 0 ) {
3933         if ( expDiff < -1 ) return a;
3934         aSig >>= 1;
3935     }
3936     q = ( bSig <= aSig );
3937     if ( q ) aSig -= bSig;
3938     expDiff -= 64;
3939     while ( 0 < expDiff ) {
3940         q = estimateDiv128To64( aSig, 0, bSig );
3941         q = ( 2 < q ) ? q - 2 : 0;
3942         aSig = - ( ( bSig>>2 ) * q );
3943         expDiff -= 62;
3944     }
3945     expDiff += 64;
3946     if ( 0 < expDiff ) {
3947         q = estimateDiv128To64( aSig, 0, bSig );
3948         q = ( 2 < q ) ? q - 2 : 0;
3949         q >>= 64 - expDiff;
3950         bSig >>= 2;
3951         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3952     }
3953     else {
3954         aSig >>= 2;
3955         bSig >>= 2;
3956     }
3957     do {
3958         alternateASig = aSig;
3959         ++q;
3960         aSig -= bSig;
3961     } while ( 0 <= (int64_t) aSig );
3962     sigMean = aSig + alternateASig;
3963     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3964         aSig = alternateASig;
3965     }
3966     zSign = ( (int64_t) aSig < 0 );
3967     if ( zSign ) aSig = - aSig;
3968     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
3969 
3970 }
3971 
3972 
3973 /*----------------------------------------------------------------------------
3974 | Returns the square root of the double-precision floating-point value `a'.
3975 | The operation is performed according to the IEC/IEEE Standard for Binary
3976 | Floating-Point Arithmetic.
3977 *----------------------------------------------------------------------------*/
3978 
3979 float64 float64_sqrt(float64 a, float_status *status)
3980 {
3981     flag aSign;
3982     int aExp, zExp;
3983     uint64_t aSig, zSig, doubleZSig;
3984     uint64_t rem0, rem1, term0, term1;
3985     a = float64_squash_input_denormal(a, status);
3986 
3987     aSig = extractFloat64Frac( a );
3988     aExp = extractFloat64Exp( a );
3989     aSign = extractFloat64Sign( a );
3990     if ( aExp == 0x7FF ) {
3991         if (aSig) {
3992             return propagateFloat64NaN(a, a, status);
3993         }
3994         if ( ! aSign ) return a;
3995         float_raise(float_flag_invalid, status);
3996         return float64_default_nan(status);
3997     }
3998     if ( aSign ) {
3999         if ( ( aExp | aSig ) == 0 ) return a;
4000         float_raise(float_flag_invalid, status);
4001         return float64_default_nan(status);
4002     }
4003     if ( aExp == 0 ) {
4004         if ( aSig == 0 ) return float64_zero;
4005         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4006     }
4007     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4008     aSig |= LIT64( 0x0010000000000000 );
4009     zSig = estimateSqrt32( aExp, aSig>>21 );
4010     aSig <<= 9 - ( aExp & 1 );
4011     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4012     if ( ( zSig & 0x1FF ) <= 5 ) {
4013         doubleZSig = zSig<<1;
4014         mul64To128( zSig, zSig, &term0, &term1 );
4015         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4016         while ( (int64_t) rem0 < 0 ) {
4017             --zSig;
4018             doubleZSig -= 2;
4019             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4020         }
4021         zSig |= ( ( rem0 | rem1 ) != 0 );
4022     }
4023     return roundAndPackFloat64(0, zExp, zSig, status);
4024 
4025 }
4026 
4027 /*----------------------------------------------------------------------------
4028 | Returns the binary log of the double-precision floating-point value `a'.
4029 | The operation is performed according to the IEC/IEEE Standard for Binary
4030 | Floating-Point Arithmetic.
4031 *----------------------------------------------------------------------------*/
4032 float64 float64_log2(float64 a, float_status *status)
4033 {
4034     flag aSign, zSign;
4035     int aExp;
4036     uint64_t aSig, aSig0, aSig1, zSig, i;
4037     a = float64_squash_input_denormal(a, status);
4038 
4039     aSig = extractFloat64Frac( a );
4040     aExp = extractFloat64Exp( a );
4041     aSign = extractFloat64Sign( a );
4042 
4043     if ( aExp == 0 ) {
4044         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4045         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4046     }
4047     if ( aSign ) {
4048         float_raise(float_flag_invalid, status);
4049         return float64_default_nan(status);
4050     }
4051     if ( aExp == 0x7FF ) {
4052         if (aSig) {
4053             return propagateFloat64NaN(a, float64_zero, status);
4054         }
4055         return a;
4056     }
4057 
4058     aExp -= 0x3FF;
4059     aSig |= LIT64( 0x0010000000000000 );
4060     zSign = aExp < 0;
4061     zSig = (uint64_t)aExp << 52;
4062     for (i = 1LL << 51; i > 0; i >>= 1) {
4063         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4064         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4065         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4066             aSig >>= 1;
4067             zSig |= i;
4068         }
4069     }
4070 
4071     if ( zSign )
4072         zSig = -zSig;
4073     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4074 }
4075 
4076 /*----------------------------------------------------------------------------
4077 | Returns 1 if the double-precision floating-point value `a' is equal to the
4078 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4079 | if either operand is a NaN.  Otherwise, the comparison is performed
4080 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4081 *----------------------------------------------------------------------------*/
4082 
4083 int float64_eq(float64 a, float64 b, float_status *status)
4084 {
4085     uint64_t av, bv;
4086     a = float64_squash_input_denormal(a, status);
4087     b = float64_squash_input_denormal(b, status);
4088 
4089     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4090          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4091        ) {
4092         float_raise(float_flag_invalid, status);
4093         return 0;
4094     }
4095     av = float64_val(a);
4096     bv = float64_val(b);
4097     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4098 
4099 }
4100 
4101 /*----------------------------------------------------------------------------
4102 | Returns 1 if the double-precision floating-point value `a' is less than or
4103 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4104 | exception is raised if either operand is a NaN.  The comparison is performed
4105 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4106 *----------------------------------------------------------------------------*/
4107 
4108 int float64_le(float64 a, float64 b, float_status *status)
4109 {
4110     flag aSign, bSign;
4111     uint64_t av, bv;
4112     a = float64_squash_input_denormal(a, status);
4113     b = float64_squash_input_denormal(b, status);
4114 
4115     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4116          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4117        ) {
4118         float_raise(float_flag_invalid, status);
4119         return 0;
4120     }
4121     aSign = extractFloat64Sign( a );
4122     bSign = extractFloat64Sign( b );
4123     av = float64_val(a);
4124     bv = float64_val(b);
4125     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4126     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4127 
4128 }
4129 
4130 /*----------------------------------------------------------------------------
4131 | Returns 1 if the double-precision floating-point value `a' is less than
4132 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4133 | raised if either operand is a NaN.  The comparison is performed according
4134 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4135 *----------------------------------------------------------------------------*/
4136 
4137 int float64_lt(float64 a, float64 b, float_status *status)
4138 {
4139     flag aSign, bSign;
4140     uint64_t av, bv;
4141 
4142     a = float64_squash_input_denormal(a, status);
4143     b = float64_squash_input_denormal(b, status);
4144     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4145          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4146        ) {
4147         float_raise(float_flag_invalid, status);
4148         return 0;
4149     }
4150     aSign = extractFloat64Sign( a );
4151     bSign = extractFloat64Sign( b );
4152     av = float64_val(a);
4153     bv = float64_val(b);
4154     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4155     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4156 
4157 }
4158 
4159 /*----------------------------------------------------------------------------
4160 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4161 | be compared, and 0 otherwise.  The invalid exception is raised if either
4162 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4163 | Standard for Binary Floating-Point Arithmetic.
4164 *----------------------------------------------------------------------------*/
4165 
4166 int float64_unordered(float64 a, float64 b, float_status *status)
4167 {
4168     a = float64_squash_input_denormal(a, status);
4169     b = float64_squash_input_denormal(b, status);
4170 
4171     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4172          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4173        ) {
4174         float_raise(float_flag_invalid, status);
4175         return 1;
4176     }
4177     return 0;
4178 }
4179 
4180 /*----------------------------------------------------------------------------
4181 | Returns 1 if the double-precision floating-point value `a' is equal to the
4182 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4183 | exception.The comparison is performed according to the IEC/IEEE Standard
4184 | for Binary Floating-Point Arithmetic.
4185 *----------------------------------------------------------------------------*/
4186 
4187 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4188 {
4189     uint64_t av, bv;
4190     a = float64_squash_input_denormal(a, status);
4191     b = float64_squash_input_denormal(b, status);
4192 
4193     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4194          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4195        ) {
4196         if (float64_is_signaling_nan(a, status)
4197          || float64_is_signaling_nan(b, status)) {
4198             float_raise(float_flag_invalid, status);
4199         }
4200         return 0;
4201     }
4202     av = float64_val(a);
4203     bv = float64_val(b);
4204     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4205 
4206 }
4207 
4208 /*----------------------------------------------------------------------------
4209 | Returns 1 if the double-precision floating-point value `a' is less than or
4210 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4211 | cause an exception.  Otherwise, the comparison is performed according to the
4212 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4213 *----------------------------------------------------------------------------*/
4214 
4215 int float64_le_quiet(float64 a, float64 b, float_status *status)
4216 {
4217     flag aSign, bSign;
4218     uint64_t av, bv;
4219     a = float64_squash_input_denormal(a, status);
4220     b = float64_squash_input_denormal(b, status);
4221 
4222     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4223          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4224        ) {
4225         if (float64_is_signaling_nan(a, status)
4226          || float64_is_signaling_nan(b, status)) {
4227             float_raise(float_flag_invalid, status);
4228         }
4229         return 0;
4230     }
4231     aSign = extractFloat64Sign( a );
4232     bSign = extractFloat64Sign( b );
4233     av = float64_val(a);
4234     bv = float64_val(b);
4235     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4236     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4237 
4238 }
4239 
4240 /*----------------------------------------------------------------------------
4241 | Returns 1 if the double-precision floating-point value `a' is less than
4242 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4243 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4244 | Standard for Binary Floating-Point Arithmetic.
4245 *----------------------------------------------------------------------------*/
4246 
4247 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4248 {
4249     flag aSign, bSign;
4250     uint64_t av, bv;
4251     a = float64_squash_input_denormal(a, status);
4252     b = float64_squash_input_denormal(b, status);
4253 
4254     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4255          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4256        ) {
4257         if (float64_is_signaling_nan(a, status)
4258          || float64_is_signaling_nan(b, status)) {
4259             float_raise(float_flag_invalid, status);
4260         }
4261         return 0;
4262     }
4263     aSign = extractFloat64Sign( a );
4264     bSign = extractFloat64Sign( b );
4265     av = float64_val(a);
4266     bv = float64_val(b);
4267     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4268     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4269 
4270 }
4271 
4272 /*----------------------------------------------------------------------------
4273 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4274 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4275 | comparison is performed according to the IEC/IEEE Standard for Binary
4276 | Floating-Point Arithmetic.
4277 *----------------------------------------------------------------------------*/
4278 
4279 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4280 {
4281     a = float64_squash_input_denormal(a, status);
4282     b = float64_squash_input_denormal(b, status);
4283 
4284     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4285          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4286        ) {
4287         if (float64_is_signaling_nan(a, status)
4288          || float64_is_signaling_nan(b, status)) {
4289             float_raise(float_flag_invalid, status);
4290         }
4291         return 1;
4292     }
4293     return 0;
4294 }
4295 
4296 /*----------------------------------------------------------------------------
4297 | Returns the result of converting the extended double-precision floating-
4298 | point value `a' to the 32-bit two's complement integer format.  The
4299 | conversion is performed according to the IEC/IEEE Standard for Binary
4300 | Floating-Point Arithmetic---which means in particular that the conversion
4301 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4302 | largest positive integer is returned.  Otherwise, if the conversion
4303 | overflows, the largest integer with the same sign as `a' is returned.
4304 *----------------------------------------------------------------------------*/
4305 
4306 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4307 {
4308     flag aSign;
4309     int32_t aExp, shiftCount;
4310     uint64_t aSig;
4311 
4312     if (floatx80_invalid_encoding(a)) {
4313         float_raise(float_flag_invalid, status);
4314         return 1 << 31;
4315     }
4316     aSig = extractFloatx80Frac( a );
4317     aExp = extractFloatx80Exp( a );
4318     aSign = extractFloatx80Sign( a );
4319     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4320     shiftCount = 0x4037 - aExp;
4321     if ( shiftCount <= 0 ) shiftCount = 1;
4322     shift64RightJamming( aSig, shiftCount, &aSig );
4323     return roundAndPackInt32(aSign, aSig, status);
4324 
4325 }
4326 
4327 /*----------------------------------------------------------------------------
4328 | Returns the result of converting the extended double-precision floating-
4329 | point value `a' to the 32-bit two's complement integer format.  The
4330 | conversion is performed according to the IEC/IEEE Standard for Binary
4331 | Floating-Point Arithmetic, except that the conversion is always rounded
4332 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4333 | Otherwise, if the conversion overflows, the largest integer with the same
4334 | sign as `a' is returned.
4335 *----------------------------------------------------------------------------*/
4336 
4337 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4338 {
4339     flag aSign;
4340     int32_t aExp, shiftCount;
4341     uint64_t aSig, savedASig;
4342     int32_t z;
4343 
4344     if (floatx80_invalid_encoding(a)) {
4345         float_raise(float_flag_invalid, status);
4346         return 1 << 31;
4347     }
4348     aSig = extractFloatx80Frac( a );
4349     aExp = extractFloatx80Exp( a );
4350     aSign = extractFloatx80Sign( a );
4351     if ( 0x401E < aExp ) {
4352         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4353         goto invalid;
4354     }
4355     else if ( aExp < 0x3FFF ) {
4356         if (aExp || aSig) {
4357             status->float_exception_flags |= float_flag_inexact;
4358         }
4359         return 0;
4360     }
4361     shiftCount = 0x403E - aExp;
4362     savedASig = aSig;
4363     aSig >>= shiftCount;
4364     z = aSig;
4365     if ( aSign ) z = - z;
4366     if ( ( z < 0 ) ^ aSign ) {
4367  invalid:
4368         float_raise(float_flag_invalid, status);
4369         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4370     }
4371     if ( ( aSig<<shiftCount ) != savedASig ) {
4372         status->float_exception_flags |= float_flag_inexact;
4373     }
4374     return z;
4375 
4376 }
4377 
4378 /*----------------------------------------------------------------------------
4379 | Returns the result of converting the extended double-precision floating-
4380 | point value `a' to the 64-bit two's complement integer format.  The
4381 | conversion is performed according to the IEC/IEEE Standard for Binary
4382 | Floating-Point Arithmetic---which means in particular that the conversion
4383 | is rounded according to the current rounding mode.  If `a' is a NaN,
4384 | the largest positive integer is returned.  Otherwise, if the conversion
4385 | overflows, the largest integer with the same sign as `a' is returned.
4386 *----------------------------------------------------------------------------*/
4387 
4388 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4389 {
4390     flag aSign;
4391     int32_t aExp, shiftCount;
4392     uint64_t aSig, aSigExtra;
4393 
4394     if (floatx80_invalid_encoding(a)) {
4395         float_raise(float_flag_invalid, status);
4396         return 1ULL << 63;
4397     }
4398     aSig = extractFloatx80Frac( a );
4399     aExp = extractFloatx80Exp( a );
4400     aSign = extractFloatx80Sign( a );
4401     shiftCount = 0x403E - aExp;
4402     if ( shiftCount <= 0 ) {
4403         if ( shiftCount ) {
4404             float_raise(float_flag_invalid, status);
4405             if (    ! aSign
4406                  || (    ( aExp == 0x7FFF )
4407                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
4408                ) {
4409                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4410             }
4411             return (int64_t) LIT64( 0x8000000000000000 );
4412         }
4413         aSigExtra = 0;
4414     }
4415     else {
4416         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4417     }
4418     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4419 
4420 }
4421 
4422 /*----------------------------------------------------------------------------
4423 | Returns the result of converting the extended double-precision floating-
4424 | point value `a' to the 64-bit two's complement integer format.  The
4425 | conversion is performed according to the IEC/IEEE Standard for Binary
4426 | Floating-Point Arithmetic, except that the conversion is always rounded
4427 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4428 | Otherwise, if the conversion overflows, the largest integer with the same
4429 | sign as `a' is returned.
4430 *----------------------------------------------------------------------------*/
4431 
4432 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4433 {
4434     flag aSign;
4435     int32_t aExp, shiftCount;
4436     uint64_t aSig;
4437     int64_t z;
4438 
4439     if (floatx80_invalid_encoding(a)) {
4440         float_raise(float_flag_invalid, status);
4441         return 1ULL << 63;
4442     }
4443     aSig = extractFloatx80Frac( a );
4444     aExp = extractFloatx80Exp( a );
4445     aSign = extractFloatx80Sign( a );
4446     shiftCount = aExp - 0x403E;
4447     if ( 0 <= shiftCount ) {
4448         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4449         if ( ( a.high != 0xC03E ) || aSig ) {
4450             float_raise(float_flag_invalid, status);
4451             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4452                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4453             }
4454         }
4455         return (int64_t) LIT64( 0x8000000000000000 );
4456     }
4457     else if ( aExp < 0x3FFF ) {
4458         if (aExp | aSig) {
4459             status->float_exception_flags |= float_flag_inexact;
4460         }
4461         return 0;
4462     }
4463     z = aSig>>( - shiftCount );
4464     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4465         status->float_exception_flags |= float_flag_inexact;
4466     }
4467     if ( aSign ) z = - z;
4468     return z;
4469 
4470 }
4471 
4472 /*----------------------------------------------------------------------------
4473 | Returns the result of converting the extended double-precision floating-
4474 | point value `a' to the single-precision floating-point format.  The
4475 | conversion is performed according to the IEC/IEEE Standard for Binary
4476 | Floating-Point Arithmetic.
4477 *----------------------------------------------------------------------------*/
4478 
4479 float32 floatx80_to_float32(floatx80 a, float_status *status)
4480 {
4481     flag aSign;
4482     int32_t aExp;
4483     uint64_t aSig;
4484 
4485     if (floatx80_invalid_encoding(a)) {
4486         float_raise(float_flag_invalid, status);
4487         return float32_default_nan(status);
4488     }
4489     aSig = extractFloatx80Frac( a );
4490     aExp = extractFloatx80Exp( a );
4491     aSign = extractFloatx80Sign( a );
4492     if ( aExp == 0x7FFF ) {
4493         if ( (uint64_t) ( aSig<<1 ) ) {
4494             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4495         }
4496         return packFloat32( aSign, 0xFF, 0 );
4497     }
4498     shift64RightJamming( aSig, 33, &aSig );
4499     if ( aExp || aSig ) aExp -= 0x3F81;
4500     return roundAndPackFloat32(aSign, aExp, aSig, status);
4501 
4502 }
4503 
4504 /*----------------------------------------------------------------------------
4505 | Returns the result of converting the extended double-precision floating-
4506 | point value `a' to the double-precision floating-point format.  The
4507 | conversion is performed according to the IEC/IEEE Standard for Binary
4508 | Floating-Point Arithmetic.
4509 *----------------------------------------------------------------------------*/
4510 
4511 float64 floatx80_to_float64(floatx80 a, float_status *status)
4512 {
4513     flag aSign;
4514     int32_t aExp;
4515     uint64_t aSig, zSig;
4516 
4517     if (floatx80_invalid_encoding(a)) {
4518         float_raise(float_flag_invalid, status);
4519         return float64_default_nan(status);
4520     }
4521     aSig = extractFloatx80Frac( a );
4522     aExp = extractFloatx80Exp( a );
4523     aSign = extractFloatx80Sign( a );
4524     if ( aExp == 0x7FFF ) {
4525         if ( (uint64_t) ( aSig<<1 ) ) {
4526             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4527         }
4528         return packFloat64( aSign, 0x7FF, 0 );
4529     }
4530     shift64RightJamming( aSig, 1, &zSig );
4531     if ( aExp || aSig ) aExp -= 0x3C01;
4532     return roundAndPackFloat64(aSign, aExp, zSig, status);
4533 
4534 }
4535 
4536 /*----------------------------------------------------------------------------
4537 | Returns the result of converting the extended double-precision floating-
4538 | point value `a' to the quadruple-precision floating-point format.  The
4539 | conversion is performed according to the IEC/IEEE Standard for Binary
4540 | Floating-Point Arithmetic.
4541 *----------------------------------------------------------------------------*/
4542 
4543 float128 floatx80_to_float128(floatx80 a, float_status *status)
4544 {
4545     flag aSign;
4546     int aExp;
4547     uint64_t aSig, zSig0, zSig1;
4548 
4549     if (floatx80_invalid_encoding(a)) {
4550         float_raise(float_flag_invalid, status);
4551         return float128_default_nan(status);
4552     }
4553     aSig = extractFloatx80Frac( a );
4554     aExp = extractFloatx80Exp( a );
4555     aSign = extractFloatx80Sign( a );
4556     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4557         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4558     }
4559     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4560     return packFloat128( aSign, aExp, zSig0, zSig1 );
4561 
4562 }
4563 
4564 /*----------------------------------------------------------------------------
4565 | Rounds the extended double-precision floating-point value `a'
4566 | to the precision provided by floatx80_rounding_precision and returns the
4567 | result as an extended double-precision floating-point value.
4568 | The operation is performed according to the IEC/IEEE Standard for Binary
4569 | Floating-Point Arithmetic.
4570 *----------------------------------------------------------------------------*/
4571 
4572 floatx80 floatx80_round(floatx80 a, float_status *status)
4573 {
4574     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4575                                 extractFloatx80Sign(a),
4576                                 extractFloatx80Exp(a),
4577                                 extractFloatx80Frac(a), 0, status);
4578 }
4579 
4580 /*----------------------------------------------------------------------------
4581 | Rounds the extended double-precision floating-point value `a' to an integer,
4582 | and returns the result as an extended quadruple-precision floating-point
4583 | value.  The operation is performed according to the IEC/IEEE Standard for
4584 | Binary Floating-Point Arithmetic.
4585 *----------------------------------------------------------------------------*/
4586 
4587 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4588 {
4589     flag aSign;
4590     int32_t aExp;
4591     uint64_t lastBitMask, roundBitsMask;
4592     floatx80 z;
4593 
4594     if (floatx80_invalid_encoding(a)) {
4595         float_raise(float_flag_invalid, status);
4596         return floatx80_default_nan(status);
4597     }
4598     aExp = extractFloatx80Exp( a );
4599     if ( 0x403E <= aExp ) {
4600         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4601             return propagateFloatx80NaN(a, a, status);
4602         }
4603         return a;
4604     }
4605     if ( aExp < 0x3FFF ) {
4606         if (    ( aExp == 0 )
4607              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4608             return a;
4609         }
4610         status->float_exception_flags |= float_flag_inexact;
4611         aSign = extractFloatx80Sign( a );
4612         switch (status->float_rounding_mode) {
4613          case float_round_nearest_even:
4614             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4615                ) {
4616                 return
4617                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4618             }
4619             break;
4620         case float_round_ties_away:
4621             if (aExp == 0x3FFE) {
4622                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4623             }
4624             break;
4625          case float_round_down:
4626             return
4627                   aSign ?
4628                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4629                 : packFloatx80( 0, 0, 0 );
4630          case float_round_up:
4631             return
4632                   aSign ? packFloatx80( 1, 0, 0 )
4633                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4634         }
4635         return packFloatx80( aSign, 0, 0 );
4636     }
4637     lastBitMask = 1;
4638     lastBitMask <<= 0x403E - aExp;
4639     roundBitsMask = lastBitMask - 1;
4640     z = a;
4641     switch (status->float_rounding_mode) {
4642     case float_round_nearest_even:
4643         z.low += lastBitMask>>1;
4644         if ((z.low & roundBitsMask) == 0) {
4645             z.low &= ~lastBitMask;
4646         }
4647         break;
4648     case float_round_ties_away:
4649         z.low += lastBitMask >> 1;
4650         break;
4651     case float_round_to_zero:
4652         break;
4653     case float_round_up:
4654         if (!extractFloatx80Sign(z)) {
4655             z.low += roundBitsMask;
4656         }
4657         break;
4658     case float_round_down:
4659         if (extractFloatx80Sign(z)) {
4660             z.low += roundBitsMask;
4661         }
4662         break;
4663     default:
4664         abort();
4665     }
4666     z.low &= ~ roundBitsMask;
4667     if ( z.low == 0 ) {
4668         ++z.high;
4669         z.low = LIT64( 0x8000000000000000 );
4670     }
4671     if (z.low != a.low) {
4672         status->float_exception_flags |= float_flag_inexact;
4673     }
4674     return z;
4675 
4676 }
4677 
4678 /*----------------------------------------------------------------------------
4679 | Returns the result of adding the absolute values of the extended double-
4680 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4681 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4682 | The addition is performed according to the IEC/IEEE Standard for Binary
4683 | Floating-Point Arithmetic.
4684 *----------------------------------------------------------------------------*/
4685 
4686 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4687                                 float_status *status)
4688 {
4689     int32_t aExp, bExp, zExp;
4690     uint64_t aSig, bSig, zSig0, zSig1;
4691     int32_t expDiff;
4692 
4693     aSig = extractFloatx80Frac( a );
4694     aExp = extractFloatx80Exp( a );
4695     bSig = extractFloatx80Frac( b );
4696     bExp = extractFloatx80Exp( b );
4697     expDiff = aExp - bExp;
4698     if ( 0 < expDiff ) {
4699         if ( aExp == 0x7FFF ) {
4700             if ((uint64_t)(aSig << 1)) {
4701                 return propagateFloatx80NaN(a, b, status);
4702             }
4703             return a;
4704         }
4705         if ( bExp == 0 ) --expDiff;
4706         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4707         zExp = aExp;
4708     }
4709     else if ( expDiff < 0 ) {
4710         if ( bExp == 0x7FFF ) {
4711             if ((uint64_t)(bSig << 1)) {
4712                 return propagateFloatx80NaN(a, b, status);
4713             }
4714             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4715         }
4716         if ( aExp == 0 ) ++expDiff;
4717         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4718         zExp = bExp;
4719     }
4720     else {
4721         if ( aExp == 0x7FFF ) {
4722             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4723                 return propagateFloatx80NaN(a, b, status);
4724             }
4725             return a;
4726         }
4727         zSig1 = 0;
4728         zSig0 = aSig + bSig;
4729         if ( aExp == 0 ) {
4730             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4731             goto roundAndPack;
4732         }
4733         zExp = aExp;
4734         goto shiftRight1;
4735     }
4736     zSig0 = aSig + bSig;
4737     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4738  shiftRight1:
4739     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4740     zSig0 |= LIT64( 0x8000000000000000 );
4741     ++zExp;
4742  roundAndPack:
4743     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4744                                 zSign, zExp, zSig0, zSig1, status);
4745 }
4746 
4747 /*----------------------------------------------------------------------------
4748 | Returns the result of subtracting the absolute values of the extended
4749 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4750 | difference is negated before being returned.  `zSign' is ignored if the
4751 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4752 | Standard for Binary Floating-Point Arithmetic.
4753 *----------------------------------------------------------------------------*/
4754 
4755 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4756                                 float_status *status)
4757 {
4758     int32_t aExp, bExp, zExp;
4759     uint64_t aSig, bSig, zSig0, zSig1;
4760     int32_t expDiff;
4761 
4762     aSig = extractFloatx80Frac( a );
4763     aExp = extractFloatx80Exp( a );
4764     bSig = extractFloatx80Frac( b );
4765     bExp = extractFloatx80Exp( b );
4766     expDiff = aExp - bExp;
4767     if ( 0 < expDiff ) goto aExpBigger;
4768     if ( expDiff < 0 ) goto bExpBigger;
4769     if ( aExp == 0x7FFF ) {
4770         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4771             return propagateFloatx80NaN(a, b, status);
4772         }
4773         float_raise(float_flag_invalid, status);
4774         return floatx80_default_nan(status);
4775     }
4776     if ( aExp == 0 ) {
4777         aExp = 1;
4778         bExp = 1;
4779     }
4780     zSig1 = 0;
4781     if ( bSig < aSig ) goto aBigger;
4782     if ( aSig < bSig ) goto bBigger;
4783     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
4784  bExpBigger:
4785     if ( bExp == 0x7FFF ) {
4786         if ((uint64_t)(bSig << 1)) {
4787             return propagateFloatx80NaN(a, b, status);
4788         }
4789         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4790     }
4791     if ( aExp == 0 ) ++expDiff;
4792     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4793  bBigger:
4794     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4795     zExp = bExp;
4796     zSign ^= 1;
4797     goto normalizeRoundAndPack;
4798  aExpBigger:
4799     if ( aExp == 0x7FFF ) {
4800         if ((uint64_t)(aSig << 1)) {
4801             return propagateFloatx80NaN(a, b, status);
4802         }
4803         return a;
4804     }
4805     if ( bExp == 0 ) --expDiff;
4806     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4807  aBigger:
4808     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4809     zExp = aExp;
4810  normalizeRoundAndPack:
4811     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
4812                                          zSign, zExp, zSig0, zSig1, status);
4813 }
4814 
4815 /*----------------------------------------------------------------------------
4816 | Returns the result of adding the extended double-precision floating-point
4817 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4818 | Standard for Binary Floating-Point Arithmetic.
4819 *----------------------------------------------------------------------------*/
4820 
4821 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
4822 {
4823     flag aSign, bSign;
4824 
4825     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4826         float_raise(float_flag_invalid, status);
4827         return floatx80_default_nan(status);
4828     }
4829     aSign = extractFloatx80Sign( a );
4830     bSign = extractFloatx80Sign( b );
4831     if ( aSign == bSign ) {
4832         return addFloatx80Sigs(a, b, aSign, status);
4833     }
4834     else {
4835         return subFloatx80Sigs(a, b, aSign, status);
4836     }
4837 
4838 }
4839 
4840 /*----------------------------------------------------------------------------
4841 | Returns the result of subtracting the extended double-precision floating-
4842 | point values `a' and `b'.  The operation is performed according to the
4843 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4844 *----------------------------------------------------------------------------*/
4845 
4846 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
4847 {
4848     flag aSign, bSign;
4849 
4850     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4851         float_raise(float_flag_invalid, status);
4852         return floatx80_default_nan(status);
4853     }
4854     aSign = extractFloatx80Sign( a );
4855     bSign = extractFloatx80Sign( b );
4856     if ( aSign == bSign ) {
4857         return subFloatx80Sigs(a, b, aSign, status);
4858     }
4859     else {
4860         return addFloatx80Sigs(a, b, aSign, status);
4861     }
4862 
4863 }
4864 
4865 /*----------------------------------------------------------------------------
4866 | Returns the result of multiplying the extended double-precision floating-
4867 | point values `a' and `b'.  The operation is performed according to the
4868 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4869 *----------------------------------------------------------------------------*/
4870 
4871 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
4872 {
4873     flag aSign, bSign, zSign;
4874     int32_t aExp, bExp, zExp;
4875     uint64_t aSig, bSig, zSig0, zSig1;
4876 
4877     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4878         float_raise(float_flag_invalid, status);
4879         return floatx80_default_nan(status);
4880     }
4881     aSig = extractFloatx80Frac( a );
4882     aExp = extractFloatx80Exp( a );
4883     aSign = extractFloatx80Sign( a );
4884     bSig = extractFloatx80Frac( b );
4885     bExp = extractFloatx80Exp( b );
4886     bSign = extractFloatx80Sign( b );
4887     zSign = aSign ^ bSign;
4888     if ( aExp == 0x7FFF ) {
4889         if (    (uint64_t) ( aSig<<1 )
4890              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4891             return propagateFloatx80NaN(a, b, status);
4892         }
4893         if ( ( bExp | bSig ) == 0 ) goto invalid;
4894         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4895     }
4896     if ( bExp == 0x7FFF ) {
4897         if ((uint64_t)(bSig << 1)) {
4898             return propagateFloatx80NaN(a, b, status);
4899         }
4900         if ( ( aExp | aSig ) == 0 ) {
4901  invalid:
4902             float_raise(float_flag_invalid, status);
4903             return floatx80_default_nan(status);
4904         }
4905         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4906     }
4907     if ( aExp == 0 ) {
4908         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4909         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4910     }
4911     if ( bExp == 0 ) {
4912         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4913         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4914     }
4915     zExp = aExp + bExp - 0x3FFE;
4916     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4917     if ( 0 < (int64_t) zSig0 ) {
4918         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4919         --zExp;
4920     }
4921     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4922                                 zSign, zExp, zSig0, zSig1, status);
4923 }
4924 
4925 /*----------------------------------------------------------------------------
4926 | Returns the result of dividing the extended double-precision floating-point
4927 | value `a' by the corresponding value `b'.  The operation is performed
4928 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4929 *----------------------------------------------------------------------------*/
4930 
4931 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
4932 {
4933     flag aSign, bSign, zSign;
4934     int32_t aExp, bExp, zExp;
4935     uint64_t aSig, bSig, zSig0, zSig1;
4936     uint64_t rem0, rem1, rem2, term0, term1, term2;
4937 
4938     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4939         float_raise(float_flag_invalid, status);
4940         return floatx80_default_nan(status);
4941     }
4942     aSig = extractFloatx80Frac( a );
4943     aExp = extractFloatx80Exp( a );
4944     aSign = extractFloatx80Sign( a );
4945     bSig = extractFloatx80Frac( b );
4946     bExp = extractFloatx80Exp( b );
4947     bSign = extractFloatx80Sign( b );
4948     zSign = aSign ^ bSign;
4949     if ( aExp == 0x7FFF ) {
4950         if ((uint64_t)(aSig << 1)) {
4951             return propagateFloatx80NaN(a, b, status);
4952         }
4953         if ( bExp == 0x7FFF ) {
4954             if ((uint64_t)(bSig << 1)) {
4955                 return propagateFloatx80NaN(a, b, status);
4956             }
4957             goto invalid;
4958         }
4959         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4960     }
4961     if ( bExp == 0x7FFF ) {
4962         if ((uint64_t)(bSig << 1)) {
4963             return propagateFloatx80NaN(a, b, status);
4964         }
4965         return packFloatx80( zSign, 0, 0 );
4966     }
4967     if ( bExp == 0 ) {
4968         if ( bSig == 0 ) {
4969             if ( ( aExp | aSig ) == 0 ) {
4970  invalid:
4971                 float_raise(float_flag_invalid, status);
4972                 return floatx80_default_nan(status);
4973             }
4974             float_raise(float_flag_divbyzero, status);
4975             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4976         }
4977         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4978     }
4979     if ( aExp == 0 ) {
4980         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4981         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4982     }
4983     zExp = aExp - bExp + 0x3FFE;
4984     rem1 = 0;
4985     if ( bSig <= aSig ) {
4986         shift128Right( aSig, 0, 1, &aSig, &rem1 );
4987         ++zExp;
4988     }
4989     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4990     mul64To128( bSig, zSig0, &term0, &term1 );
4991     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4992     while ( (int64_t) rem0 < 0 ) {
4993         --zSig0;
4994         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4995     }
4996     zSig1 = estimateDiv128To64( rem1, 0, bSig );
4997     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4998         mul64To128( bSig, zSig1, &term1, &term2 );
4999         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5000         while ( (int64_t) rem1 < 0 ) {
5001             --zSig1;
5002             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5003         }
5004         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5005     }
5006     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5007                                 zSign, zExp, zSig0, zSig1, status);
5008 }
5009 
5010 /*----------------------------------------------------------------------------
5011 | Returns the remainder of the extended double-precision floating-point value
5012 | `a' with respect to the corresponding value `b'.  The operation is performed
5013 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5014 *----------------------------------------------------------------------------*/
5015 
5016 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5017 {
5018     flag aSign, zSign;
5019     int32_t aExp, bExp, expDiff;
5020     uint64_t aSig0, aSig1, bSig;
5021     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5022 
5023     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5024         float_raise(float_flag_invalid, status);
5025         return floatx80_default_nan(status);
5026     }
5027     aSig0 = extractFloatx80Frac( a );
5028     aExp = extractFloatx80Exp( a );
5029     aSign = extractFloatx80Sign( a );
5030     bSig = extractFloatx80Frac( b );
5031     bExp = extractFloatx80Exp( b );
5032     if ( aExp == 0x7FFF ) {
5033         if (    (uint64_t) ( aSig0<<1 )
5034              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5035             return propagateFloatx80NaN(a, b, status);
5036         }
5037         goto invalid;
5038     }
5039     if ( bExp == 0x7FFF ) {
5040         if ((uint64_t)(bSig << 1)) {
5041             return propagateFloatx80NaN(a, b, status);
5042         }
5043         return a;
5044     }
5045     if ( bExp == 0 ) {
5046         if ( bSig == 0 ) {
5047  invalid:
5048             float_raise(float_flag_invalid, status);
5049             return floatx80_default_nan(status);
5050         }
5051         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5052     }
5053     if ( aExp == 0 ) {
5054         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5055         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5056     }
5057     bSig |= LIT64( 0x8000000000000000 );
5058     zSign = aSign;
5059     expDiff = aExp - bExp;
5060     aSig1 = 0;
5061     if ( expDiff < 0 ) {
5062         if ( expDiff < -1 ) return a;
5063         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5064         expDiff = 0;
5065     }
5066     q = ( bSig <= aSig0 );
5067     if ( q ) aSig0 -= bSig;
5068     expDiff -= 64;
5069     while ( 0 < expDiff ) {
5070         q = estimateDiv128To64( aSig0, aSig1, bSig );
5071         q = ( 2 < q ) ? q - 2 : 0;
5072         mul64To128( bSig, q, &term0, &term1 );
5073         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5074         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5075         expDiff -= 62;
5076     }
5077     expDiff += 64;
5078     if ( 0 < expDiff ) {
5079         q = estimateDiv128To64( aSig0, aSig1, bSig );
5080         q = ( 2 < q ) ? q - 2 : 0;
5081         q >>= 64 - expDiff;
5082         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5083         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5084         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5085         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5086             ++q;
5087             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5088         }
5089     }
5090     else {
5091         term1 = 0;
5092         term0 = bSig;
5093     }
5094     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5095     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5096          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5097               && ( q & 1 ) )
5098        ) {
5099         aSig0 = alternateASig0;
5100         aSig1 = alternateASig1;
5101         zSign = ! zSign;
5102     }
5103     return
5104         normalizeRoundAndPackFloatx80(
5105             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5106 
5107 }
5108 
5109 /*----------------------------------------------------------------------------
5110 | Returns the square root of the extended double-precision floating-point
5111 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5112 | for Binary Floating-Point Arithmetic.
5113 *----------------------------------------------------------------------------*/
5114 
5115 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5116 {
5117     flag aSign;
5118     int32_t aExp, zExp;
5119     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5120     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5121 
5122     if (floatx80_invalid_encoding(a)) {
5123         float_raise(float_flag_invalid, status);
5124         return floatx80_default_nan(status);
5125     }
5126     aSig0 = extractFloatx80Frac( a );
5127     aExp = extractFloatx80Exp( a );
5128     aSign = extractFloatx80Sign( a );
5129     if ( aExp == 0x7FFF ) {
5130         if ((uint64_t)(aSig0 << 1)) {
5131             return propagateFloatx80NaN(a, a, status);
5132         }
5133         if ( ! aSign ) return a;
5134         goto invalid;
5135     }
5136     if ( aSign ) {
5137         if ( ( aExp | aSig0 ) == 0 ) return a;
5138  invalid:
5139         float_raise(float_flag_invalid, status);
5140         return floatx80_default_nan(status);
5141     }
5142     if ( aExp == 0 ) {
5143         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5144         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5145     }
5146     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5147     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5148     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5149     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5150     doubleZSig0 = zSig0<<1;
5151     mul64To128( zSig0, zSig0, &term0, &term1 );
5152     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5153     while ( (int64_t) rem0 < 0 ) {
5154         --zSig0;
5155         doubleZSig0 -= 2;
5156         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5157     }
5158     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5159     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5160         if ( zSig1 == 0 ) zSig1 = 1;
5161         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5162         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5163         mul64To128( zSig1, zSig1, &term2, &term3 );
5164         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5165         while ( (int64_t) rem1 < 0 ) {
5166             --zSig1;
5167             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5168             term3 |= 1;
5169             term2 |= doubleZSig0;
5170             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5171         }
5172         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5173     }
5174     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5175     zSig0 |= doubleZSig0;
5176     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5177                                 0, zExp, zSig0, zSig1, status);
5178 }
5179 
5180 /*----------------------------------------------------------------------------
5181 | Returns 1 if the extended double-precision floating-point value `a' is equal
5182 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5183 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5184 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5185 *----------------------------------------------------------------------------*/
5186 
5187 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5188 {
5189 
5190     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5191         || (extractFloatx80Exp(a) == 0x7FFF
5192             && (uint64_t) (extractFloatx80Frac(a) << 1))
5193         || (extractFloatx80Exp(b) == 0x7FFF
5194             && (uint64_t) (extractFloatx80Frac(b) << 1))
5195        ) {
5196         float_raise(float_flag_invalid, status);
5197         return 0;
5198     }
5199     return
5200            ( a.low == b.low )
5201         && (    ( a.high == b.high )
5202              || (    ( a.low == 0 )
5203                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5204            );
5205 
5206 }
5207 
5208 /*----------------------------------------------------------------------------
5209 | Returns 1 if the extended double-precision floating-point value `a' is
5210 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5211 | invalid exception is raised if either operand is a NaN.  The comparison is
5212 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5213 | Arithmetic.
5214 *----------------------------------------------------------------------------*/
5215 
5216 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5217 {
5218     flag aSign, bSign;
5219 
5220     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5221         || (extractFloatx80Exp(a) == 0x7FFF
5222             && (uint64_t) (extractFloatx80Frac(a) << 1))
5223         || (extractFloatx80Exp(b) == 0x7FFF
5224             && (uint64_t) (extractFloatx80Frac(b) << 1))
5225        ) {
5226         float_raise(float_flag_invalid, status);
5227         return 0;
5228     }
5229     aSign = extractFloatx80Sign( a );
5230     bSign = extractFloatx80Sign( b );
5231     if ( aSign != bSign ) {
5232         return
5233                aSign
5234             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5235                  == 0 );
5236     }
5237     return
5238           aSign ? le128( b.high, b.low, a.high, a.low )
5239         : le128( a.high, a.low, b.high, b.low );
5240 
5241 }
5242 
5243 /*----------------------------------------------------------------------------
5244 | Returns 1 if the extended double-precision floating-point value `a' is
5245 | less than the corresponding value `b', and 0 otherwise.  The invalid
5246 | exception is raised if either operand is a NaN.  The comparison is performed
5247 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5248 *----------------------------------------------------------------------------*/
5249 
5250 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5251 {
5252     flag aSign, bSign;
5253 
5254     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5255         || (extractFloatx80Exp(a) == 0x7FFF
5256             && (uint64_t) (extractFloatx80Frac(a) << 1))
5257         || (extractFloatx80Exp(b) == 0x7FFF
5258             && (uint64_t) (extractFloatx80Frac(b) << 1))
5259        ) {
5260         float_raise(float_flag_invalid, status);
5261         return 0;
5262     }
5263     aSign = extractFloatx80Sign( a );
5264     bSign = extractFloatx80Sign( b );
5265     if ( aSign != bSign ) {
5266         return
5267                aSign
5268             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5269                  != 0 );
5270     }
5271     return
5272           aSign ? lt128( b.high, b.low, a.high, a.low )
5273         : lt128( a.high, a.low, b.high, b.low );
5274 
5275 }
5276 
5277 /*----------------------------------------------------------------------------
5278 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5279 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5280 | either operand is a NaN.   The comparison is performed according to the
5281 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5282 *----------------------------------------------------------------------------*/
5283 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5284 {
5285     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5286         || (extractFloatx80Exp(a) == 0x7FFF
5287             && (uint64_t) (extractFloatx80Frac(a) << 1))
5288         || (extractFloatx80Exp(b) == 0x7FFF
5289             && (uint64_t) (extractFloatx80Frac(b) << 1))
5290        ) {
5291         float_raise(float_flag_invalid, status);
5292         return 1;
5293     }
5294     return 0;
5295 }
5296 
5297 /*----------------------------------------------------------------------------
5298 | Returns 1 if the extended double-precision floating-point value `a' is
5299 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5300 | cause an exception.  The comparison is performed according to the IEC/IEEE
5301 | Standard for Binary Floating-Point Arithmetic.
5302 *----------------------------------------------------------------------------*/
5303 
5304 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5305 {
5306 
5307     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5308         float_raise(float_flag_invalid, status);
5309         return 0;
5310     }
5311     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5312               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5313          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5314               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5315        ) {
5316         if (floatx80_is_signaling_nan(a, status)
5317          || floatx80_is_signaling_nan(b, status)) {
5318             float_raise(float_flag_invalid, status);
5319         }
5320         return 0;
5321     }
5322     return
5323            ( a.low == b.low )
5324         && (    ( a.high == b.high )
5325              || (    ( a.low == 0 )
5326                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5327            );
5328 
5329 }
5330 
5331 /*----------------------------------------------------------------------------
5332 | Returns 1 if the extended double-precision floating-point value `a' is less
5333 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5334 | do not cause an exception.  Otherwise, the comparison is performed according
5335 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5336 *----------------------------------------------------------------------------*/
5337 
5338 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5339 {
5340     flag aSign, bSign;
5341 
5342     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5343         float_raise(float_flag_invalid, status);
5344         return 0;
5345     }
5346     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5347               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5348          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5349               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5350        ) {
5351         if (floatx80_is_signaling_nan(a, status)
5352          || floatx80_is_signaling_nan(b, status)) {
5353             float_raise(float_flag_invalid, status);
5354         }
5355         return 0;
5356     }
5357     aSign = extractFloatx80Sign( a );
5358     bSign = extractFloatx80Sign( b );
5359     if ( aSign != bSign ) {
5360         return
5361                aSign
5362             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5363                  == 0 );
5364     }
5365     return
5366           aSign ? le128( b.high, b.low, a.high, a.low )
5367         : le128( a.high, a.low, b.high, b.low );
5368 
5369 }
5370 
5371 /*----------------------------------------------------------------------------
5372 | Returns 1 if the extended double-precision floating-point value `a' is less
5373 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5374 | an exception.  Otherwise, the comparison is performed according to the
5375 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5376 *----------------------------------------------------------------------------*/
5377 
5378 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5379 {
5380     flag aSign, bSign;
5381 
5382     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5383         float_raise(float_flag_invalid, status);
5384         return 0;
5385     }
5386     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5387               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5388          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5389               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5390        ) {
5391         if (floatx80_is_signaling_nan(a, status)
5392          || floatx80_is_signaling_nan(b, status)) {
5393             float_raise(float_flag_invalid, status);
5394         }
5395         return 0;
5396     }
5397     aSign = extractFloatx80Sign( a );
5398     bSign = extractFloatx80Sign( b );
5399     if ( aSign != bSign ) {
5400         return
5401                aSign
5402             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5403                  != 0 );
5404     }
5405     return
5406           aSign ? lt128( b.high, b.low, a.high, a.low )
5407         : lt128( a.high, a.low, b.high, b.low );
5408 
5409 }
5410 
5411 /*----------------------------------------------------------------------------
5412 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5413 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5414 | The comparison is performed according to the IEC/IEEE Standard for Binary
5415 | Floating-Point Arithmetic.
5416 *----------------------------------------------------------------------------*/
5417 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5418 {
5419     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5420         float_raise(float_flag_invalid, status);
5421         return 1;
5422     }
5423     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5424               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5425          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5426               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5427        ) {
5428         if (floatx80_is_signaling_nan(a, status)
5429          || floatx80_is_signaling_nan(b, status)) {
5430             float_raise(float_flag_invalid, status);
5431         }
5432         return 1;
5433     }
5434     return 0;
5435 }
5436 
5437 /*----------------------------------------------------------------------------
5438 | Returns the result of converting the quadruple-precision floating-point
5439 | value `a' to the 32-bit two's complement integer format.  The conversion
5440 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5441 | Arithmetic---which means in particular that the conversion is rounded
5442 | according to the current rounding mode.  If `a' is a NaN, the largest
5443 | positive integer is returned.  Otherwise, if the conversion overflows, the
5444 | largest integer with the same sign as `a' is returned.
5445 *----------------------------------------------------------------------------*/
5446 
5447 int32_t float128_to_int32(float128 a, float_status *status)
5448 {
5449     flag aSign;
5450     int32_t aExp, shiftCount;
5451     uint64_t aSig0, aSig1;
5452 
5453     aSig1 = extractFloat128Frac1( a );
5454     aSig0 = extractFloat128Frac0( a );
5455     aExp = extractFloat128Exp( a );
5456     aSign = extractFloat128Sign( a );
5457     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5458     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5459     aSig0 |= ( aSig1 != 0 );
5460     shiftCount = 0x4028 - aExp;
5461     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5462     return roundAndPackInt32(aSign, aSig0, status);
5463 
5464 }
5465 
5466 /*----------------------------------------------------------------------------
5467 | Returns the result of converting the quadruple-precision floating-point
5468 | value `a' to the 32-bit two's complement integer format.  The conversion
5469 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5470 | Arithmetic, except that the conversion is always rounded toward zero.  If
5471 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5472 | conversion overflows, the largest integer with the same sign as `a' is
5473 | returned.
5474 *----------------------------------------------------------------------------*/
5475 
5476 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5477 {
5478     flag aSign;
5479     int32_t aExp, shiftCount;
5480     uint64_t aSig0, aSig1, savedASig;
5481     int32_t z;
5482 
5483     aSig1 = extractFloat128Frac1( a );
5484     aSig0 = extractFloat128Frac0( a );
5485     aExp = extractFloat128Exp( a );
5486     aSign = extractFloat128Sign( a );
5487     aSig0 |= ( aSig1 != 0 );
5488     if ( 0x401E < aExp ) {
5489         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5490         goto invalid;
5491     }
5492     else if ( aExp < 0x3FFF ) {
5493         if (aExp || aSig0) {
5494             status->float_exception_flags |= float_flag_inexact;
5495         }
5496         return 0;
5497     }
5498     aSig0 |= LIT64( 0x0001000000000000 );
5499     shiftCount = 0x402F - aExp;
5500     savedASig = aSig0;
5501     aSig0 >>= shiftCount;
5502     z = aSig0;
5503     if ( aSign ) z = - z;
5504     if ( ( z < 0 ) ^ aSign ) {
5505  invalid:
5506         float_raise(float_flag_invalid, status);
5507         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5508     }
5509     if ( ( aSig0<<shiftCount ) != savedASig ) {
5510         status->float_exception_flags |= float_flag_inexact;
5511     }
5512     return z;
5513 
5514 }
5515 
5516 /*----------------------------------------------------------------------------
5517 | Returns the result of converting the quadruple-precision floating-point
5518 | value `a' to the 64-bit two's complement integer format.  The conversion
5519 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5520 | Arithmetic---which means in particular that the conversion is rounded
5521 | according to the current rounding mode.  If `a' is a NaN, the largest
5522 | positive integer is returned.  Otherwise, if the conversion overflows, the
5523 | largest integer with the same sign as `a' is returned.
5524 *----------------------------------------------------------------------------*/
5525 
5526 int64_t float128_to_int64(float128 a, float_status *status)
5527 {
5528     flag aSign;
5529     int32_t aExp, shiftCount;
5530     uint64_t aSig0, aSig1;
5531 
5532     aSig1 = extractFloat128Frac1( a );
5533     aSig0 = extractFloat128Frac0( a );
5534     aExp = extractFloat128Exp( a );
5535     aSign = extractFloat128Sign( a );
5536     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5537     shiftCount = 0x402F - aExp;
5538     if ( shiftCount <= 0 ) {
5539         if ( 0x403E < aExp ) {
5540             float_raise(float_flag_invalid, status);
5541             if (    ! aSign
5542                  || (    ( aExp == 0x7FFF )
5543                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5544                     )
5545                ) {
5546                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5547             }
5548             return (int64_t) LIT64( 0x8000000000000000 );
5549         }
5550         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5551     }
5552     else {
5553         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5554     }
5555     return roundAndPackInt64(aSign, aSig0, aSig1, status);
5556 
5557 }
5558 
5559 /*----------------------------------------------------------------------------
5560 | Returns the result of converting the quadruple-precision floating-point
5561 | value `a' to the 64-bit two's complement integer format.  The conversion
5562 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5563 | Arithmetic, except that the conversion is always rounded toward zero.
5564 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5565 | the conversion overflows, the largest integer with the same sign as `a' is
5566 | returned.
5567 *----------------------------------------------------------------------------*/
5568 
5569 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5570 {
5571     flag aSign;
5572     int32_t aExp, shiftCount;
5573     uint64_t aSig0, aSig1;
5574     int64_t z;
5575 
5576     aSig1 = extractFloat128Frac1( a );
5577     aSig0 = extractFloat128Frac0( a );
5578     aExp = extractFloat128Exp( a );
5579     aSign = extractFloat128Sign( a );
5580     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5581     shiftCount = aExp - 0x402F;
5582     if ( 0 < shiftCount ) {
5583         if ( 0x403E <= aExp ) {
5584             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5585             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5586                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5587                 if (aSig1) {
5588                     status->float_exception_flags |= float_flag_inexact;
5589                 }
5590             }
5591             else {
5592                 float_raise(float_flag_invalid, status);
5593                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5594                     return LIT64( 0x7FFFFFFFFFFFFFFF );
5595                 }
5596             }
5597             return (int64_t) LIT64( 0x8000000000000000 );
5598         }
5599         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5600         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5601             status->float_exception_flags |= float_flag_inexact;
5602         }
5603     }
5604     else {
5605         if ( aExp < 0x3FFF ) {
5606             if ( aExp | aSig0 | aSig1 ) {
5607                 status->float_exception_flags |= float_flag_inexact;
5608             }
5609             return 0;
5610         }
5611         z = aSig0>>( - shiftCount );
5612         if (    aSig1
5613              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5614             status->float_exception_flags |= float_flag_inexact;
5615         }
5616     }
5617     if ( aSign ) z = - z;
5618     return z;
5619 
5620 }
5621 
5622 /*----------------------------------------------------------------------------
5623 | Returns the result of converting the quadruple-precision floating-point value
5624 | `a' to the 64-bit unsigned integer format.  The conversion is
5625 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5626 | Arithmetic---which means in particular that the conversion is rounded
5627 | according to the current rounding mode.  If `a' is a NaN, the largest
5628 | positive integer is returned.  If the conversion overflows, the
5629 | largest unsigned integer is returned.  If 'a' is negative, the value is
5630 | rounded and zero is returned; negative values that do not round to zero
5631 | will raise the inexact exception.
5632 *----------------------------------------------------------------------------*/
5633 
5634 uint64_t float128_to_uint64(float128 a, float_status *status)
5635 {
5636     flag aSign;
5637     int aExp;
5638     int shiftCount;
5639     uint64_t aSig0, aSig1;
5640 
5641     aSig0 = extractFloat128Frac0(a);
5642     aSig1 = extractFloat128Frac1(a);
5643     aExp = extractFloat128Exp(a);
5644     aSign = extractFloat128Sign(a);
5645     if (aSign && (aExp > 0x3FFE)) {
5646         float_raise(float_flag_invalid, status);
5647         if (float128_is_any_nan(a)) {
5648             return LIT64(0xFFFFFFFFFFFFFFFF);
5649         } else {
5650             return 0;
5651         }
5652     }
5653     if (aExp) {
5654         aSig0 |= LIT64(0x0001000000000000);
5655     }
5656     shiftCount = 0x402F - aExp;
5657     if (shiftCount <= 0) {
5658         if (0x403E < aExp) {
5659             float_raise(float_flag_invalid, status);
5660             return LIT64(0xFFFFFFFFFFFFFFFF);
5661         }
5662         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5663     } else {
5664         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5665     }
5666     return roundAndPackUint64(aSign, aSig0, aSig1, status);
5667 }
5668 
5669 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5670 {
5671     uint64_t v;
5672     signed char current_rounding_mode = status->float_rounding_mode;
5673 
5674     set_float_rounding_mode(float_round_to_zero, status);
5675     v = float128_to_uint64(a, status);
5676     set_float_rounding_mode(current_rounding_mode, status);
5677 
5678     return v;
5679 }
5680 
5681 /*----------------------------------------------------------------------------
5682 | Returns the result of converting the quadruple-precision floating-point
5683 | value `a' to the 32-bit unsigned integer format.  The conversion
5684 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5685 | Arithmetic except that the conversion is always rounded toward zero.
5686 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
5687 | if the conversion overflows, the largest unsigned integer is returned.
5688 | If 'a' is negative, the value is rounded and zero is returned; negative
5689 | values that do not round to zero will raise the inexact exception.
5690 *----------------------------------------------------------------------------*/
5691 
5692 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5693 {
5694     uint64_t v;
5695     uint32_t res;
5696     int old_exc_flags = get_float_exception_flags(status);
5697 
5698     v = float128_to_uint64_round_to_zero(a, status);
5699     if (v > 0xffffffff) {
5700         res = 0xffffffff;
5701     } else {
5702         return v;
5703     }
5704     set_float_exception_flags(old_exc_flags, status);
5705     float_raise(float_flag_invalid, status);
5706     return res;
5707 }
5708 
5709 /*----------------------------------------------------------------------------
5710 | Returns the result of converting the quadruple-precision floating-point
5711 | value `a' to the single-precision floating-point format.  The conversion
5712 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5713 | Arithmetic.
5714 *----------------------------------------------------------------------------*/
5715 
5716 float32 float128_to_float32(float128 a, float_status *status)
5717 {
5718     flag aSign;
5719     int32_t aExp;
5720     uint64_t aSig0, aSig1;
5721     uint32_t zSig;
5722 
5723     aSig1 = extractFloat128Frac1( a );
5724     aSig0 = extractFloat128Frac0( a );
5725     aExp = extractFloat128Exp( a );
5726     aSign = extractFloat128Sign( a );
5727     if ( aExp == 0x7FFF ) {
5728         if ( aSig0 | aSig1 ) {
5729             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5730         }
5731         return packFloat32( aSign, 0xFF, 0 );
5732     }
5733     aSig0 |= ( aSig1 != 0 );
5734     shift64RightJamming( aSig0, 18, &aSig0 );
5735     zSig = aSig0;
5736     if ( aExp || zSig ) {
5737         zSig |= 0x40000000;
5738         aExp -= 0x3F81;
5739     }
5740     return roundAndPackFloat32(aSign, aExp, zSig, status);
5741 
5742 }
5743 
5744 /*----------------------------------------------------------------------------
5745 | Returns the result of converting the quadruple-precision floating-point
5746 | value `a' to the double-precision floating-point format.  The conversion
5747 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5748 | Arithmetic.
5749 *----------------------------------------------------------------------------*/
5750 
5751 float64 float128_to_float64(float128 a, float_status *status)
5752 {
5753     flag aSign;
5754     int32_t aExp;
5755     uint64_t aSig0, aSig1;
5756 
5757     aSig1 = extractFloat128Frac1( a );
5758     aSig0 = extractFloat128Frac0( a );
5759     aExp = extractFloat128Exp( a );
5760     aSign = extractFloat128Sign( a );
5761     if ( aExp == 0x7FFF ) {
5762         if ( aSig0 | aSig1 ) {
5763             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5764         }
5765         return packFloat64( aSign, 0x7FF, 0 );
5766     }
5767     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5768     aSig0 |= ( aSig1 != 0 );
5769     if ( aExp || aSig0 ) {
5770         aSig0 |= LIT64( 0x4000000000000000 );
5771         aExp -= 0x3C01;
5772     }
5773     return roundAndPackFloat64(aSign, aExp, aSig0, status);
5774 
5775 }
5776 
5777 /*----------------------------------------------------------------------------
5778 | Returns the result of converting the quadruple-precision floating-point
5779 | value `a' to the extended double-precision floating-point format.  The
5780 | conversion is performed according to the IEC/IEEE Standard for Binary
5781 | Floating-Point Arithmetic.
5782 *----------------------------------------------------------------------------*/
5783 
5784 floatx80 float128_to_floatx80(float128 a, float_status *status)
5785 {
5786     flag aSign;
5787     int32_t aExp;
5788     uint64_t aSig0, aSig1;
5789 
5790     aSig1 = extractFloat128Frac1( a );
5791     aSig0 = extractFloat128Frac0( a );
5792     aExp = extractFloat128Exp( a );
5793     aSign = extractFloat128Sign( a );
5794     if ( aExp == 0x7FFF ) {
5795         if ( aSig0 | aSig1 ) {
5796             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
5797         }
5798         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5799     }
5800     if ( aExp == 0 ) {
5801         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5802         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5803     }
5804     else {
5805         aSig0 |= LIT64( 0x0001000000000000 );
5806     }
5807     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5808     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
5809 
5810 }
5811 
5812 /*----------------------------------------------------------------------------
5813 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5814 | returns the result as a quadruple-precision floating-point value.  The
5815 | operation is performed according to the IEC/IEEE Standard for Binary
5816 | Floating-Point Arithmetic.
5817 *----------------------------------------------------------------------------*/
5818 
5819 float128 float128_round_to_int(float128 a, float_status *status)
5820 {
5821     flag aSign;
5822     int32_t aExp;
5823     uint64_t lastBitMask, roundBitsMask;
5824     float128 z;
5825 
5826     aExp = extractFloat128Exp( a );
5827     if ( 0x402F <= aExp ) {
5828         if ( 0x406F <= aExp ) {
5829             if (    ( aExp == 0x7FFF )
5830                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5831                ) {
5832                 return propagateFloat128NaN(a, a, status);
5833             }
5834             return a;
5835         }
5836         lastBitMask = 1;
5837         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5838         roundBitsMask = lastBitMask - 1;
5839         z = a;
5840         switch (status->float_rounding_mode) {
5841         case float_round_nearest_even:
5842             if ( lastBitMask ) {
5843                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5844                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5845             }
5846             else {
5847                 if ( (int64_t) z.low < 0 ) {
5848                     ++z.high;
5849                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5850                 }
5851             }
5852             break;
5853         case float_round_ties_away:
5854             if (lastBitMask) {
5855                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
5856             } else {
5857                 if ((int64_t) z.low < 0) {
5858                     ++z.high;
5859                 }
5860             }
5861             break;
5862         case float_round_to_zero:
5863             break;
5864         case float_round_up:
5865             if (!extractFloat128Sign(z)) {
5866                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5867             }
5868             break;
5869         case float_round_down:
5870             if (extractFloat128Sign(z)) {
5871                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5872             }
5873             break;
5874         default:
5875             abort();
5876         }
5877         z.low &= ~ roundBitsMask;
5878     }
5879     else {
5880         if ( aExp < 0x3FFF ) {
5881             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5882             status->float_exception_flags |= float_flag_inexact;
5883             aSign = extractFloat128Sign( a );
5884             switch (status->float_rounding_mode) {
5885              case float_round_nearest_even:
5886                 if (    ( aExp == 0x3FFE )
5887                      && (   extractFloat128Frac0( a )
5888                           | extractFloat128Frac1( a ) )
5889                    ) {
5890                     return packFloat128( aSign, 0x3FFF, 0, 0 );
5891                 }
5892                 break;
5893             case float_round_ties_away:
5894                 if (aExp == 0x3FFE) {
5895                     return packFloat128(aSign, 0x3FFF, 0, 0);
5896                 }
5897                 break;
5898              case float_round_down:
5899                 return
5900                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5901                     : packFloat128( 0, 0, 0, 0 );
5902              case float_round_up:
5903                 return
5904                       aSign ? packFloat128( 1, 0, 0, 0 )
5905                     : packFloat128( 0, 0x3FFF, 0, 0 );
5906             }
5907             return packFloat128( aSign, 0, 0, 0 );
5908         }
5909         lastBitMask = 1;
5910         lastBitMask <<= 0x402F - aExp;
5911         roundBitsMask = lastBitMask - 1;
5912         z.low = 0;
5913         z.high = a.high;
5914         switch (status->float_rounding_mode) {
5915         case float_round_nearest_even:
5916             z.high += lastBitMask>>1;
5917             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5918                 z.high &= ~ lastBitMask;
5919             }
5920             break;
5921         case float_round_ties_away:
5922             z.high += lastBitMask>>1;
5923             break;
5924         case float_round_to_zero:
5925             break;
5926         case float_round_up:
5927             if (!extractFloat128Sign(z)) {
5928                 z.high |= ( a.low != 0 );
5929                 z.high += roundBitsMask;
5930             }
5931             break;
5932         case float_round_down:
5933             if (extractFloat128Sign(z)) {
5934                 z.high |= (a.low != 0);
5935                 z.high += roundBitsMask;
5936             }
5937             break;
5938         default:
5939             abort();
5940         }
5941         z.high &= ~ roundBitsMask;
5942     }
5943     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5944         status->float_exception_flags |= float_flag_inexact;
5945     }
5946     return z;
5947 
5948 }
5949 
5950 /*----------------------------------------------------------------------------
5951 | Returns the result of adding the absolute values of the quadruple-precision
5952 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5953 | before being returned.  `zSign' is ignored if the result is a NaN.
5954 | The addition is performed according to the IEC/IEEE Standard for Binary
5955 | Floating-Point Arithmetic.
5956 *----------------------------------------------------------------------------*/
5957 
5958 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
5959                                 float_status *status)
5960 {
5961     int32_t aExp, bExp, zExp;
5962     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5963     int32_t expDiff;
5964 
5965     aSig1 = extractFloat128Frac1( a );
5966     aSig0 = extractFloat128Frac0( a );
5967     aExp = extractFloat128Exp( a );
5968     bSig1 = extractFloat128Frac1( b );
5969     bSig0 = extractFloat128Frac0( b );
5970     bExp = extractFloat128Exp( b );
5971     expDiff = aExp - bExp;
5972     if ( 0 < expDiff ) {
5973         if ( aExp == 0x7FFF ) {
5974             if (aSig0 | aSig1) {
5975                 return propagateFloat128NaN(a, b, status);
5976             }
5977             return a;
5978         }
5979         if ( bExp == 0 ) {
5980             --expDiff;
5981         }
5982         else {
5983             bSig0 |= LIT64( 0x0001000000000000 );
5984         }
5985         shift128ExtraRightJamming(
5986             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5987         zExp = aExp;
5988     }
5989     else if ( expDiff < 0 ) {
5990         if ( bExp == 0x7FFF ) {
5991             if (bSig0 | bSig1) {
5992                 return propagateFloat128NaN(a, b, status);
5993             }
5994             return packFloat128( zSign, 0x7FFF, 0, 0 );
5995         }
5996         if ( aExp == 0 ) {
5997             ++expDiff;
5998         }
5999         else {
6000             aSig0 |= LIT64( 0x0001000000000000 );
6001         }
6002         shift128ExtraRightJamming(
6003             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6004         zExp = bExp;
6005     }
6006     else {
6007         if ( aExp == 0x7FFF ) {
6008             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6009                 return propagateFloat128NaN(a, b, status);
6010             }
6011             return a;
6012         }
6013         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6014         if ( aExp == 0 ) {
6015             if (status->flush_to_zero) {
6016                 if (zSig0 | zSig1) {
6017                     float_raise(float_flag_output_denormal, status);
6018                 }
6019                 return packFloat128(zSign, 0, 0, 0);
6020             }
6021             return packFloat128( zSign, 0, zSig0, zSig1 );
6022         }
6023         zSig2 = 0;
6024         zSig0 |= LIT64( 0x0002000000000000 );
6025         zExp = aExp;
6026         goto shiftRight1;
6027     }
6028     aSig0 |= LIT64( 0x0001000000000000 );
6029     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6030     --zExp;
6031     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6032     ++zExp;
6033  shiftRight1:
6034     shift128ExtraRightJamming(
6035         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6036  roundAndPack:
6037     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6038 
6039 }
6040 
6041 /*----------------------------------------------------------------------------
6042 | Returns the result of subtracting the absolute values of the quadruple-
6043 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6044 | difference is negated before being returned.  `zSign' is ignored if the
6045 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6046 | Standard for Binary Floating-Point Arithmetic.
6047 *----------------------------------------------------------------------------*/
6048 
6049 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6050                                 float_status *status)
6051 {
6052     int32_t aExp, bExp, zExp;
6053     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6054     int32_t expDiff;
6055 
6056     aSig1 = extractFloat128Frac1( a );
6057     aSig0 = extractFloat128Frac0( a );
6058     aExp = extractFloat128Exp( a );
6059     bSig1 = extractFloat128Frac1( b );
6060     bSig0 = extractFloat128Frac0( b );
6061     bExp = extractFloat128Exp( b );
6062     expDiff = aExp - bExp;
6063     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6064     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6065     if ( 0 < expDiff ) goto aExpBigger;
6066     if ( expDiff < 0 ) goto bExpBigger;
6067     if ( aExp == 0x7FFF ) {
6068         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6069             return propagateFloat128NaN(a, b, status);
6070         }
6071         float_raise(float_flag_invalid, status);
6072         return float128_default_nan(status);
6073     }
6074     if ( aExp == 0 ) {
6075         aExp = 1;
6076         bExp = 1;
6077     }
6078     if ( bSig0 < aSig0 ) goto aBigger;
6079     if ( aSig0 < bSig0 ) goto bBigger;
6080     if ( bSig1 < aSig1 ) goto aBigger;
6081     if ( aSig1 < bSig1 ) goto bBigger;
6082     return packFloat128(status->float_rounding_mode == float_round_down,
6083                         0, 0, 0);
6084  bExpBigger:
6085     if ( bExp == 0x7FFF ) {
6086         if (bSig0 | bSig1) {
6087             return propagateFloat128NaN(a, b, status);
6088         }
6089         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6090     }
6091     if ( aExp == 0 ) {
6092         ++expDiff;
6093     }
6094     else {
6095         aSig0 |= LIT64( 0x4000000000000000 );
6096     }
6097     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6098     bSig0 |= LIT64( 0x4000000000000000 );
6099  bBigger:
6100     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6101     zExp = bExp;
6102     zSign ^= 1;
6103     goto normalizeRoundAndPack;
6104  aExpBigger:
6105     if ( aExp == 0x7FFF ) {
6106         if (aSig0 | aSig1) {
6107             return propagateFloat128NaN(a, b, status);
6108         }
6109         return a;
6110     }
6111     if ( bExp == 0 ) {
6112         --expDiff;
6113     }
6114     else {
6115         bSig0 |= LIT64( 0x4000000000000000 );
6116     }
6117     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6118     aSig0 |= LIT64( 0x4000000000000000 );
6119  aBigger:
6120     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6121     zExp = aExp;
6122  normalizeRoundAndPack:
6123     --zExp;
6124     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6125                                          status);
6126 
6127 }
6128 
6129 /*----------------------------------------------------------------------------
6130 | Returns the result of adding the quadruple-precision floating-point values
6131 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6132 | for Binary Floating-Point Arithmetic.
6133 *----------------------------------------------------------------------------*/
6134 
6135 float128 float128_add(float128 a, float128 b, float_status *status)
6136 {
6137     flag aSign, bSign;
6138 
6139     aSign = extractFloat128Sign( a );
6140     bSign = extractFloat128Sign( b );
6141     if ( aSign == bSign ) {
6142         return addFloat128Sigs(a, b, aSign, status);
6143     }
6144     else {
6145         return subFloat128Sigs(a, b, aSign, status);
6146     }
6147 
6148 }
6149 
6150 /*----------------------------------------------------------------------------
6151 | Returns the result of subtracting the quadruple-precision floating-point
6152 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6153 | Standard for Binary Floating-Point Arithmetic.
6154 *----------------------------------------------------------------------------*/
6155 
6156 float128 float128_sub(float128 a, float128 b, float_status *status)
6157 {
6158     flag aSign, bSign;
6159 
6160     aSign = extractFloat128Sign( a );
6161     bSign = extractFloat128Sign( b );
6162     if ( aSign == bSign ) {
6163         return subFloat128Sigs(a, b, aSign, status);
6164     }
6165     else {
6166         return addFloat128Sigs(a, b, aSign, status);
6167     }
6168 
6169 }
6170 
6171 /*----------------------------------------------------------------------------
6172 | Returns the result of multiplying the quadruple-precision floating-point
6173 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6174 | Standard for Binary Floating-Point Arithmetic.
6175 *----------------------------------------------------------------------------*/
6176 
6177 float128 float128_mul(float128 a, float128 b, float_status *status)
6178 {
6179     flag aSign, bSign, zSign;
6180     int32_t aExp, bExp, zExp;
6181     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6182 
6183     aSig1 = extractFloat128Frac1( a );
6184     aSig0 = extractFloat128Frac0( a );
6185     aExp = extractFloat128Exp( a );
6186     aSign = extractFloat128Sign( a );
6187     bSig1 = extractFloat128Frac1( b );
6188     bSig0 = extractFloat128Frac0( b );
6189     bExp = extractFloat128Exp( b );
6190     bSign = extractFloat128Sign( b );
6191     zSign = aSign ^ bSign;
6192     if ( aExp == 0x7FFF ) {
6193         if (    ( aSig0 | aSig1 )
6194              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6195             return propagateFloat128NaN(a, b, status);
6196         }
6197         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6198         return packFloat128( zSign, 0x7FFF, 0, 0 );
6199     }
6200     if ( bExp == 0x7FFF ) {
6201         if (bSig0 | bSig1) {
6202             return propagateFloat128NaN(a, b, status);
6203         }
6204         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6205  invalid:
6206             float_raise(float_flag_invalid, status);
6207             return float128_default_nan(status);
6208         }
6209         return packFloat128( zSign, 0x7FFF, 0, 0 );
6210     }
6211     if ( aExp == 0 ) {
6212         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6213         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6214     }
6215     if ( bExp == 0 ) {
6216         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6217         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6218     }
6219     zExp = aExp + bExp - 0x4000;
6220     aSig0 |= LIT64( 0x0001000000000000 );
6221     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6222     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6223     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6224     zSig2 |= ( zSig3 != 0 );
6225     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6226         shift128ExtraRightJamming(
6227             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6228         ++zExp;
6229     }
6230     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6231 
6232 }
6233 
6234 /*----------------------------------------------------------------------------
6235 | Returns the result of dividing the quadruple-precision floating-point value
6236 | `a' by the corresponding value `b'.  The operation is performed according to
6237 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6238 *----------------------------------------------------------------------------*/
6239 
6240 float128 float128_div(float128 a, float128 b, float_status *status)
6241 {
6242     flag aSign, bSign, zSign;
6243     int32_t aExp, bExp, zExp;
6244     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6245     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6246 
6247     aSig1 = extractFloat128Frac1( a );
6248     aSig0 = extractFloat128Frac0( a );
6249     aExp = extractFloat128Exp( a );
6250     aSign = extractFloat128Sign( a );
6251     bSig1 = extractFloat128Frac1( b );
6252     bSig0 = extractFloat128Frac0( b );
6253     bExp = extractFloat128Exp( b );
6254     bSign = extractFloat128Sign( b );
6255     zSign = aSign ^ bSign;
6256     if ( aExp == 0x7FFF ) {
6257         if (aSig0 | aSig1) {
6258             return propagateFloat128NaN(a, b, status);
6259         }
6260         if ( bExp == 0x7FFF ) {
6261             if (bSig0 | bSig1) {
6262                 return propagateFloat128NaN(a, b, status);
6263             }
6264             goto invalid;
6265         }
6266         return packFloat128( zSign, 0x7FFF, 0, 0 );
6267     }
6268     if ( bExp == 0x7FFF ) {
6269         if (bSig0 | bSig1) {
6270             return propagateFloat128NaN(a, b, status);
6271         }
6272         return packFloat128( zSign, 0, 0, 0 );
6273     }
6274     if ( bExp == 0 ) {
6275         if ( ( bSig0 | bSig1 ) == 0 ) {
6276             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6277  invalid:
6278                 float_raise(float_flag_invalid, status);
6279                 return float128_default_nan(status);
6280             }
6281             float_raise(float_flag_divbyzero, status);
6282             return packFloat128( zSign, 0x7FFF, 0, 0 );
6283         }
6284         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6285     }
6286     if ( aExp == 0 ) {
6287         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6288         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6289     }
6290     zExp = aExp - bExp + 0x3FFD;
6291     shortShift128Left(
6292         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6293     shortShift128Left(
6294         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6295     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6296         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6297         ++zExp;
6298     }
6299     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6300     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6301     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6302     while ( (int64_t) rem0 < 0 ) {
6303         --zSig0;
6304         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6305     }
6306     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6307     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6308         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6309         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6310         while ( (int64_t) rem1 < 0 ) {
6311             --zSig1;
6312             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6313         }
6314         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6315     }
6316     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6317     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6318 
6319 }
6320 
6321 /*----------------------------------------------------------------------------
6322 | Returns the remainder of the quadruple-precision floating-point value `a'
6323 | with respect to the corresponding value `b'.  The operation is performed
6324 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6325 *----------------------------------------------------------------------------*/
6326 
6327 float128 float128_rem(float128 a, float128 b, float_status *status)
6328 {
6329     flag aSign, zSign;
6330     int32_t aExp, bExp, expDiff;
6331     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6332     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6333     int64_t sigMean0;
6334 
6335     aSig1 = extractFloat128Frac1( a );
6336     aSig0 = extractFloat128Frac0( a );
6337     aExp = extractFloat128Exp( a );
6338     aSign = extractFloat128Sign( a );
6339     bSig1 = extractFloat128Frac1( b );
6340     bSig0 = extractFloat128Frac0( b );
6341     bExp = extractFloat128Exp( b );
6342     if ( aExp == 0x7FFF ) {
6343         if (    ( aSig0 | aSig1 )
6344              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6345             return propagateFloat128NaN(a, b, status);
6346         }
6347         goto invalid;
6348     }
6349     if ( bExp == 0x7FFF ) {
6350         if (bSig0 | bSig1) {
6351             return propagateFloat128NaN(a, b, status);
6352         }
6353         return a;
6354     }
6355     if ( bExp == 0 ) {
6356         if ( ( bSig0 | bSig1 ) == 0 ) {
6357  invalid:
6358             float_raise(float_flag_invalid, status);
6359             return float128_default_nan(status);
6360         }
6361         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6362     }
6363     if ( aExp == 0 ) {
6364         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6365         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6366     }
6367     expDiff = aExp - bExp;
6368     if ( expDiff < -1 ) return a;
6369     shortShift128Left(
6370         aSig0 | LIT64( 0x0001000000000000 ),
6371         aSig1,
6372         15 - ( expDiff < 0 ),
6373         &aSig0,
6374         &aSig1
6375     );
6376     shortShift128Left(
6377         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6378     q = le128( bSig0, bSig1, aSig0, aSig1 );
6379     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6380     expDiff -= 64;
6381     while ( 0 < expDiff ) {
6382         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6383         q = ( 4 < q ) ? q - 4 : 0;
6384         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6385         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6386         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6387         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6388         expDiff -= 61;
6389     }
6390     if ( -64 < expDiff ) {
6391         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6392         q = ( 4 < q ) ? q - 4 : 0;
6393         q >>= - expDiff;
6394         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6395         expDiff += 52;
6396         if ( expDiff < 0 ) {
6397             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6398         }
6399         else {
6400             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6401         }
6402         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6403         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6404     }
6405     else {
6406         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6407         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6408     }
6409     do {
6410         alternateASig0 = aSig0;
6411         alternateASig1 = aSig1;
6412         ++q;
6413         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6414     } while ( 0 <= (int64_t) aSig0 );
6415     add128(
6416         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6417     if (    ( sigMean0 < 0 )
6418          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6419         aSig0 = alternateASig0;
6420         aSig1 = alternateASig1;
6421     }
6422     zSign = ( (int64_t) aSig0 < 0 );
6423     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6424     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6425                                          status);
6426 }
6427 
6428 /*----------------------------------------------------------------------------
6429 | Returns the square root of the quadruple-precision floating-point value `a'.
6430 | The operation is performed according to the IEC/IEEE Standard for Binary
6431 | Floating-Point Arithmetic.
6432 *----------------------------------------------------------------------------*/
6433 
6434 float128 float128_sqrt(float128 a, float_status *status)
6435 {
6436     flag aSign;
6437     int32_t aExp, zExp;
6438     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6439     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6440 
6441     aSig1 = extractFloat128Frac1( a );
6442     aSig0 = extractFloat128Frac0( a );
6443     aExp = extractFloat128Exp( a );
6444     aSign = extractFloat128Sign( a );
6445     if ( aExp == 0x7FFF ) {
6446         if (aSig0 | aSig1) {
6447             return propagateFloat128NaN(a, a, status);
6448         }
6449         if ( ! aSign ) return a;
6450         goto invalid;
6451     }
6452     if ( aSign ) {
6453         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6454  invalid:
6455         float_raise(float_flag_invalid, status);
6456         return float128_default_nan(status);
6457     }
6458     if ( aExp == 0 ) {
6459         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6460         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6461     }
6462     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6463     aSig0 |= LIT64( 0x0001000000000000 );
6464     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6465     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6466     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6467     doubleZSig0 = zSig0<<1;
6468     mul64To128( zSig0, zSig0, &term0, &term1 );
6469     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6470     while ( (int64_t) rem0 < 0 ) {
6471         --zSig0;
6472         doubleZSig0 -= 2;
6473         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6474     }
6475     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6476     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6477         if ( zSig1 == 0 ) zSig1 = 1;
6478         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6479         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6480         mul64To128( zSig1, zSig1, &term2, &term3 );
6481         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6482         while ( (int64_t) rem1 < 0 ) {
6483             --zSig1;
6484             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6485             term3 |= 1;
6486             term2 |= doubleZSig0;
6487             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6488         }
6489         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6490     }
6491     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6492     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6493 
6494 }
6495 
6496 /*----------------------------------------------------------------------------
6497 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6498 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6499 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6500 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6501 *----------------------------------------------------------------------------*/
6502 
6503 int float128_eq(float128 a, float128 b, float_status *status)
6504 {
6505 
6506     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6507               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6508          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6509               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6510        ) {
6511         float_raise(float_flag_invalid, status);
6512         return 0;
6513     }
6514     return
6515            ( a.low == b.low )
6516         && (    ( a.high == b.high )
6517              || (    ( a.low == 0 )
6518                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6519            );
6520 
6521 }
6522 
6523 /*----------------------------------------------------------------------------
6524 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6525 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6526 | exception is raised if either operand is a NaN.  The comparison is performed
6527 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6528 *----------------------------------------------------------------------------*/
6529 
6530 int float128_le(float128 a, float128 b, float_status *status)
6531 {
6532     flag aSign, bSign;
6533 
6534     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6535               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6536          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6537               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6538        ) {
6539         float_raise(float_flag_invalid, status);
6540         return 0;
6541     }
6542     aSign = extractFloat128Sign( a );
6543     bSign = extractFloat128Sign( b );
6544     if ( aSign != bSign ) {
6545         return
6546                aSign
6547             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6548                  == 0 );
6549     }
6550     return
6551           aSign ? le128( b.high, b.low, a.high, a.low )
6552         : le128( a.high, a.low, b.high, b.low );
6553 
6554 }
6555 
6556 /*----------------------------------------------------------------------------
6557 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6558 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6559 | raised if either operand is a NaN.  The comparison is performed according
6560 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6561 *----------------------------------------------------------------------------*/
6562 
6563 int float128_lt(float128 a, float128 b, float_status *status)
6564 {
6565     flag aSign, bSign;
6566 
6567     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6568               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6569          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6570               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6571        ) {
6572         float_raise(float_flag_invalid, status);
6573         return 0;
6574     }
6575     aSign = extractFloat128Sign( a );
6576     bSign = extractFloat128Sign( b );
6577     if ( aSign != bSign ) {
6578         return
6579                aSign
6580             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6581                  != 0 );
6582     }
6583     return
6584           aSign ? lt128( b.high, b.low, a.high, a.low )
6585         : lt128( a.high, a.low, b.high, b.low );
6586 
6587 }
6588 
6589 /*----------------------------------------------------------------------------
6590 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6591 | be compared, and 0 otherwise.  The invalid exception is raised if either
6592 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6593 | Standard for Binary Floating-Point Arithmetic.
6594 *----------------------------------------------------------------------------*/
6595 
6596 int float128_unordered(float128 a, float128 b, float_status *status)
6597 {
6598     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6599               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6600          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6601               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6602        ) {
6603         float_raise(float_flag_invalid, status);
6604         return 1;
6605     }
6606     return 0;
6607 }
6608 
6609 /*----------------------------------------------------------------------------
6610 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6611 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6612 | exception.  The comparison is performed according to the IEC/IEEE Standard
6613 | for Binary Floating-Point Arithmetic.
6614 *----------------------------------------------------------------------------*/
6615 
6616 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6617 {
6618 
6619     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6620               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6621          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6622               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6623        ) {
6624         if (float128_is_signaling_nan(a, status)
6625          || float128_is_signaling_nan(b, status)) {
6626             float_raise(float_flag_invalid, status);
6627         }
6628         return 0;
6629     }
6630     return
6631            ( a.low == b.low )
6632         && (    ( a.high == b.high )
6633              || (    ( a.low == 0 )
6634                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6635            );
6636 
6637 }
6638 
6639 /*----------------------------------------------------------------------------
6640 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6641 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6642 | cause an exception.  Otherwise, the comparison is performed according to the
6643 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6644 *----------------------------------------------------------------------------*/
6645 
6646 int float128_le_quiet(float128 a, float128 b, float_status *status)
6647 {
6648     flag aSign, bSign;
6649 
6650     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6651               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6652          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6653               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6654        ) {
6655         if (float128_is_signaling_nan(a, status)
6656          || float128_is_signaling_nan(b, status)) {
6657             float_raise(float_flag_invalid, status);
6658         }
6659         return 0;
6660     }
6661     aSign = extractFloat128Sign( a );
6662     bSign = extractFloat128Sign( b );
6663     if ( aSign != bSign ) {
6664         return
6665                aSign
6666             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6667                  == 0 );
6668     }
6669     return
6670           aSign ? le128( b.high, b.low, a.high, a.low )
6671         : le128( a.high, a.low, b.high, b.low );
6672 
6673 }
6674 
6675 /*----------------------------------------------------------------------------
6676 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6677 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6678 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6679 | Standard for Binary Floating-Point Arithmetic.
6680 *----------------------------------------------------------------------------*/
6681 
6682 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6683 {
6684     flag aSign, bSign;
6685 
6686     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6687               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6688          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6689               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6690        ) {
6691         if (float128_is_signaling_nan(a, status)
6692          || float128_is_signaling_nan(b, status)) {
6693             float_raise(float_flag_invalid, status);
6694         }
6695         return 0;
6696     }
6697     aSign = extractFloat128Sign( a );
6698     bSign = extractFloat128Sign( b );
6699     if ( aSign != bSign ) {
6700         return
6701                aSign
6702             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6703                  != 0 );
6704     }
6705     return
6706           aSign ? lt128( b.high, b.low, a.high, a.low )
6707         : lt128( a.high, a.low, b.high, b.low );
6708 
6709 }
6710 
6711 /*----------------------------------------------------------------------------
6712 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6713 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6714 | comparison is performed according to the IEC/IEEE Standard for Binary
6715 | Floating-Point Arithmetic.
6716 *----------------------------------------------------------------------------*/
6717 
6718 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6719 {
6720     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6721               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6722          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6723               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6724        ) {
6725         if (float128_is_signaling_nan(a, status)
6726          || float128_is_signaling_nan(b, status)) {
6727             float_raise(float_flag_invalid, status);
6728         }
6729         return 1;
6730     }
6731     return 0;
6732 }
6733 
6734 #define COMPARE(s, nan_exp)                                                  \
6735 static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
6736                                       int is_quiet, float_status *status)    \
6737 {                                                                            \
6738     flag aSign, bSign;                                                       \
6739     uint ## s ## _t av, bv;                                                  \
6740     a = float ## s ## _squash_input_denormal(a, status);                     \
6741     b = float ## s ## _squash_input_denormal(b, status);                     \
6742                                                                              \
6743     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
6744          extractFloat ## s ## Frac( a ) ) ||                                 \
6745         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
6746           extractFloat ## s ## Frac( b ) )) {                                \
6747         if (!is_quiet ||                                                     \
6748             float ## s ## _is_signaling_nan(a, status) ||                  \
6749             float ## s ## _is_signaling_nan(b, status)) {                 \
6750             float_raise(float_flag_invalid, status);                         \
6751         }                                                                    \
6752         return float_relation_unordered;                                     \
6753     }                                                                        \
6754     aSign = extractFloat ## s ## Sign( a );                                  \
6755     bSign = extractFloat ## s ## Sign( b );                                  \
6756     av = float ## s ## _val(a);                                              \
6757     bv = float ## s ## _val(b);                                              \
6758     if ( aSign != bSign ) {                                                  \
6759         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
6760             /* zero case */                                                  \
6761             return float_relation_equal;                                     \
6762         } else {                                                             \
6763             return 1 - (2 * aSign);                                          \
6764         }                                                                    \
6765     } else {                                                                 \
6766         if (av == bv) {                                                      \
6767             return float_relation_equal;                                     \
6768         } else {                                                             \
6769             return 1 - 2 * (aSign ^ ( av < bv ));                            \
6770         }                                                                    \
6771     }                                                                        \
6772 }                                                                            \
6773                                                                              \
6774 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
6775 {                                                                            \
6776     return float ## s ## _compare_internal(a, b, 0, status);                 \
6777 }                                                                            \
6778                                                                              \
6779 int float ## s ## _compare_quiet(float ## s a, float ## s b,                 \
6780                                  float_status *status)                       \
6781 {                                                                            \
6782     return float ## s ## _compare_internal(a, b, 1, status);                 \
6783 }
6784 
6785 COMPARE(32, 0xff)
6786 COMPARE(64, 0x7ff)
6787 
6788 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6789                                             int is_quiet, float_status *status)
6790 {
6791     flag aSign, bSign;
6792 
6793     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6794         float_raise(float_flag_invalid, status);
6795         return float_relation_unordered;
6796     }
6797     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6798           ( extractFloatx80Frac( a )<<1 ) ) ||
6799         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6800           ( extractFloatx80Frac( b )<<1 ) )) {
6801         if (!is_quiet ||
6802             floatx80_is_signaling_nan(a, status) ||
6803             floatx80_is_signaling_nan(b, status)) {
6804             float_raise(float_flag_invalid, status);
6805         }
6806         return float_relation_unordered;
6807     }
6808     aSign = extractFloatx80Sign( a );
6809     bSign = extractFloatx80Sign( b );
6810     if ( aSign != bSign ) {
6811 
6812         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6813              ( ( a.low | b.low ) == 0 ) ) {
6814             /* zero case */
6815             return float_relation_equal;
6816         } else {
6817             return 1 - (2 * aSign);
6818         }
6819     } else {
6820         if (a.low == b.low && a.high == b.high) {
6821             return float_relation_equal;
6822         } else {
6823             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6824         }
6825     }
6826 }
6827 
6828 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6829 {
6830     return floatx80_compare_internal(a, b, 0, status);
6831 }
6832 
6833 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6834 {
6835     return floatx80_compare_internal(a, b, 1, status);
6836 }
6837 
6838 static inline int float128_compare_internal(float128 a, float128 b,
6839                                             int is_quiet, float_status *status)
6840 {
6841     flag aSign, bSign;
6842 
6843     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6844           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6845         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6846           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6847         if (!is_quiet ||
6848             float128_is_signaling_nan(a, status) ||
6849             float128_is_signaling_nan(b, status)) {
6850             float_raise(float_flag_invalid, status);
6851         }
6852         return float_relation_unordered;
6853     }
6854     aSign = extractFloat128Sign( a );
6855     bSign = extractFloat128Sign( b );
6856     if ( aSign != bSign ) {
6857         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6858             /* zero case */
6859             return float_relation_equal;
6860         } else {
6861             return 1 - (2 * aSign);
6862         }
6863     } else {
6864         if (a.low == b.low && a.high == b.high) {
6865             return float_relation_equal;
6866         } else {
6867             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6868         }
6869     }
6870 }
6871 
6872 int float128_compare(float128 a, float128 b, float_status *status)
6873 {
6874     return float128_compare_internal(a, b, 0, status);
6875 }
6876 
6877 int float128_compare_quiet(float128 a, float128 b, float_status *status)
6878 {
6879     return float128_compare_internal(a, b, 1, status);
6880 }
6881 
6882 /* min() and max() functions. These can't be implemented as
6883  * 'compare and pick one input' because that would mishandle
6884  * NaNs and +0 vs -0.
6885  *
6886  * minnum() and maxnum() functions. These are similar to the min()
6887  * and max() functions but if one of the arguments is a QNaN and
6888  * the other is numerical then the numerical argument is returned.
6889  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
6890  * and maxNum() operations. min() and max() are the typical min/max
6891  * semantics provided by many CPUs which predate that specification.
6892  *
6893  * minnummag() and maxnummag() functions correspond to minNumMag()
6894  * and maxNumMag() from IEEE 754-2008.
6895  */
6896 #define MINMAX(s)                                                       \
6897 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
6898                                                int ismin, int isieee,   \
6899                                                int ismag,               \
6900                                                float_status *status)    \
6901 {                                                                       \
6902     flag aSign, bSign;                                                  \
6903     uint ## s ## _t av, bv, aav, abv;                                   \
6904     a = float ## s ## _squash_input_denormal(a, status);                \
6905     b = float ## s ## _squash_input_denormal(b, status);                \
6906     if (float ## s ## _is_any_nan(a) ||                                 \
6907         float ## s ## _is_any_nan(b)) {                                 \
6908         if (isieee) {                                                   \
6909             if (float ## s ## _is_quiet_nan(a, status) &&               \
6910                 !float ## s ##_is_any_nan(b)) {                         \
6911                 return b;                                               \
6912             } else if (float ## s ## _is_quiet_nan(b, status) &&        \
6913                        !float ## s ## _is_any_nan(a)) {                \
6914                 return a;                                               \
6915             }                                                           \
6916         }                                                               \
6917         return propagateFloat ## s ## NaN(a, b, status);                \
6918     }                                                                   \
6919     aSign = extractFloat ## s ## Sign(a);                               \
6920     bSign = extractFloat ## s ## Sign(b);                               \
6921     av = float ## s ## _val(a);                                         \
6922     bv = float ## s ## _val(b);                                         \
6923     if (ismag) {                                                        \
6924         aav = float ## s ## _abs(av);                                   \
6925         abv = float ## s ## _abs(bv);                                   \
6926         if (aav != abv) {                                               \
6927             if (ismin) {                                                \
6928                 return (aav < abv) ? a : b;                             \
6929             } else {                                                    \
6930                 return (aav < abv) ? b : a;                             \
6931             }                                                           \
6932         }                                                               \
6933     }                                                                   \
6934     if (aSign != bSign) {                                               \
6935         if (ismin) {                                                    \
6936             return aSign ? a : b;                                       \
6937         } else {                                                        \
6938             return aSign ? b : a;                                       \
6939         }                                                               \
6940     } else {                                                            \
6941         if (ismin) {                                                    \
6942             return (aSign ^ (av < bv)) ? a : b;                         \
6943         } else {                                                        \
6944             return (aSign ^ (av < bv)) ? b : a;                         \
6945         }                                                               \
6946     }                                                                   \
6947 }                                                                       \
6948                                                                         \
6949 float ## s float ## s ## _min(float ## s a, float ## s b,               \
6950                               float_status *status)                     \
6951 {                                                                       \
6952     return float ## s ## _minmax(a, b, 1, 0, 0, status);                \
6953 }                                                                       \
6954                                                                         \
6955 float ## s float ## s ## _max(float ## s a, float ## s b,               \
6956                               float_status *status)                     \
6957 {                                                                       \
6958     return float ## s ## _minmax(a, b, 0, 0, 0, status);                \
6959 }                                                                       \
6960                                                                         \
6961 float ## s float ## s ## _minnum(float ## s a, float ## s b,            \
6962                                  float_status *status)                  \
6963 {                                                                       \
6964     return float ## s ## _minmax(a, b, 1, 1, 0, status);                \
6965 }                                                                       \
6966                                                                         \
6967 float ## s float ## s ## _maxnum(float ## s a, float ## s b,            \
6968                                  float_status *status)                  \
6969 {                                                                       \
6970     return float ## s ## _minmax(a, b, 0, 1, 0, status);                \
6971 }                                                                       \
6972                                                                         \
6973 float ## s float ## s ## _minnummag(float ## s a, float ## s b,         \
6974                                     float_status *status)               \
6975 {                                                                       \
6976     return float ## s ## _minmax(a, b, 1, 1, 1, status);                \
6977 }                                                                       \
6978                                                                         \
6979 float ## s float ## s ## _maxnummag(float ## s a, float ## s b,         \
6980                                     float_status *status)               \
6981 {                                                                       \
6982     return float ## s ## _minmax(a, b, 0, 1, 1, status);                \
6983 }
6984 
6985 MINMAX(32)
6986 MINMAX(64)
6987 
6988 
6989 /* Multiply A by 2 raised to the power N.  */
6990 float32 float32_scalbn(float32 a, int n, float_status *status)
6991 {
6992     flag aSign;
6993     int16_t aExp;
6994     uint32_t aSig;
6995 
6996     a = float32_squash_input_denormal(a, status);
6997     aSig = extractFloat32Frac( a );
6998     aExp = extractFloat32Exp( a );
6999     aSign = extractFloat32Sign( a );
7000 
7001     if ( aExp == 0xFF ) {
7002         if ( aSig ) {
7003             return propagateFloat32NaN(a, a, status);
7004         }
7005         return a;
7006     }
7007     if (aExp != 0) {
7008         aSig |= 0x00800000;
7009     } else if (aSig == 0) {
7010         return a;
7011     } else {
7012         aExp++;
7013     }
7014 
7015     if (n > 0x200) {
7016         n = 0x200;
7017     } else if (n < -0x200) {
7018         n = -0x200;
7019     }
7020 
7021     aExp += n - 1;
7022     aSig <<= 7;
7023     return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
7024 }
7025 
7026 float64 float64_scalbn(float64 a, int n, float_status *status)
7027 {
7028     flag aSign;
7029     int16_t aExp;
7030     uint64_t aSig;
7031 
7032     a = float64_squash_input_denormal(a, status);
7033     aSig = extractFloat64Frac( a );
7034     aExp = extractFloat64Exp( a );
7035     aSign = extractFloat64Sign( a );
7036 
7037     if ( aExp == 0x7FF ) {
7038         if ( aSig ) {
7039             return propagateFloat64NaN(a, a, status);
7040         }
7041         return a;
7042     }
7043     if (aExp != 0) {
7044         aSig |= LIT64( 0x0010000000000000 );
7045     } else if (aSig == 0) {
7046         return a;
7047     } else {
7048         aExp++;
7049     }
7050 
7051     if (n > 0x1000) {
7052         n = 0x1000;
7053     } else if (n < -0x1000) {
7054         n = -0x1000;
7055     }
7056 
7057     aExp += n - 1;
7058     aSig <<= 10;
7059     return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
7060 }
7061 
7062 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7063 {
7064     flag aSign;
7065     int32_t aExp;
7066     uint64_t aSig;
7067 
7068     if (floatx80_invalid_encoding(a)) {
7069         float_raise(float_flag_invalid, status);
7070         return floatx80_default_nan(status);
7071     }
7072     aSig = extractFloatx80Frac( a );
7073     aExp = extractFloatx80Exp( a );
7074     aSign = extractFloatx80Sign( a );
7075 
7076     if ( aExp == 0x7FFF ) {
7077         if ( aSig<<1 ) {
7078             return propagateFloatx80NaN(a, a, status);
7079         }
7080         return a;
7081     }
7082 
7083     if (aExp == 0) {
7084         if (aSig == 0) {
7085             return a;
7086         }
7087         aExp++;
7088     }
7089 
7090     if (n > 0x10000) {
7091         n = 0x10000;
7092     } else if (n < -0x10000) {
7093         n = -0x10000;
7094     }
7095 
7096     aExp += n;
7097     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7098                                          aSign, aExp, aSig, 0, status);
7099 }
7100 
7101 float128 float128_scalbn(float128 a, int n, float_status *status)
7102 {
7103     flag aSign;
7104     int32_t aExp;
7105     uint64_t aSig0, aSig1;
7106 
7107     aSig1 = extractFloat128Frac1( a );
7108     aSig0 = extractFloat128Frac0( a );
7109     aExp = extractFloat128Exp( a );
7110     aSign = extractFloat128Sign( a );
7111     if ( aExp == 0x7FFF ) {
7112         if ( aSig0 | aSig1 ) {
7113             return propagateFloat128NaN(a, a, status);
7114         }
7115         return a;
7116     }
7117     if (aExp != 0) {
7118         aSig0 |= LIT64( 0x0001000000000000 );
7119     } else if (aSig0 == 0 && aSig1 == 0) {
7120         return a;
7121     } else {
7122         aExp++;
7123     }
7124 
7125     if (n > 0x10000) {
7126         n = 0x10000;
7127     } else if (n < -0x10000) {
7128         n = -0x10000;
7129     }
7130 
7131     aExp += n - 1;
7132     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7133                                          , status);
7134 
7135 }
7136