xref: /qemu/fpu/softfloat.c (revision 6fff216769cf7eaa3961c85dee7a72838696d365)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
88 
89 /* We only need stdlib for abort() */
90 
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations.  (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "softfloat-macros.h"
97 
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine:  (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output.  These details are target-
104 | specific.
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
107 
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
111 
112 static inline uint32_t extractFloat16Frac(float16 a)
113 {
114     return float16_val(a) & 0x3ff;
115 }
116 
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
120 
121 static inline int extractFloat16Exp(float16 a)
122 {
123     return (float16_val(a) >> 10) & 0x1f;
124 }
125 
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
129 
130 static inline flag extractFloat16Sign(float16 a)
131 {
132     return float16_val(a)>>15;
133 }
134 
135 /*----------------------------------------------------------------------------
136 | Returns the fraction bits of the single-precision floating-point value `a'.
137 *----------------------------------------------------------------------------*/
138 
139 static inline uint32_t extractFloat32Frac(float32 a)
140 {
141     return float32_val(a) & 0x007FFFFF;
142 }
143 
144 /*----------------------------------------------------------------------------
145 | Returns the exponent bits of the single-precision floating-point value `a'.
146 *----------------------------------------------------------------------------*/
147 
148 static inline int extractFloat32Exp(float32 a)
149 {
150     return (float32_val(a) >> 23) & 0xFF;
151 }
152 
153 /*----------------------------------------------------------------------------
154 | Returns the sign bit of the single-precision floating-point value `a'.
155 *----------------------------------------------------------------------------*/
156 
157 static inline flag extractFloat32Sign(float32 a)
158 {
159     return float32_val(a) >> 31;
160 }
161 
162 /*----------------------------------------------------------------------------
163 | Returns the fraction bits of the double-precision floating-point value `a'.
164 *----------------------------------------------------------------------------*/
165 
166 static inline uint64_t extractFloat64Frac(float64 a)
167 {
168     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
169 }
170 
171 /*----------------------------------------------------------------------------
172 | Returns the exponent bits of the double-precision floating-point value `a'.
173 *----------------------------------------------------------------------------*/
174 
175 static inline int extractFloat64Exp(float64 a)
176 {
177     return (float64_val(a) >> 52) & 0x7FF;
178 }
179 
180 /*----------------------------------------------------------------------------
181 | Returns the sign bit of the double-precision floating-point value `a'.
182 *----------------------------------------------------------------------------*/
183 
184 static inline flag extractFloat64Sign(float64 a)
185 {
186     return float64_val(a) >> 63;
187 }
188 
/*
 * Classification of a decomposed floating point number.
 *
 * The enumerators are ordered so that every NaN flavour sorts at or
 * after float_class_qnan; "cls >= float_class_qnan" therefore tests
 * for any kind of NaN.  The type is packed so that it occupies as
 * little space as possible inside FloatParts.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified = 0,
    float_class_zero         = 1,
    float_class_normal       = 2,
    float_class_inf          = 3,
    float_class_qnan         = 4,  /* all NaNs from here */
    float_class_snan         = 5,
    float_class_dnan         = 6,
    float_class_msnan        = 7,  /* maybe silenced */
} FloatClass;
204 
/*
 * Structure holding all of the decomposed parts of a float. The
 * exponent is unbiased and the fraction is normalized. All
 * calculations are done with a 64 bit fraction and then rounded as
 * appropriate for the final format.
 *
 * Thanks to the packed FloatClass a decent compiler should be able to
 * fit the whole structure into registers and avoid using the stack
 * for parameter passing.
 *
 * Note that unpack_raw() fills this structure with the raw field
 * values (cls == float_class_unclassified); the "unbiased/normalized"
 * description above only holds once canonicalize() has run.
 */

typedef struct {
    /* Fraction; for canonical normal numbers the implicit bit sits at
     * DECOMPOSED_BINARY_POINT. */
    uint64_t frac;
    /* Exponent: raw field value, or unbiased after canonicalize(). */
    int32_t  exp;
    /* Classification of the value (zero, normal, inf, NaN flavour). */
    FloatClass cls;
    /* Sign bit: true for negative. */
    bool sign;
} FloatParts;
222 
/* Bit position of the implicit (integer) bit in FloatParts.frac. */
#define DECOMPOSED_BINARY_POINT    62
/* Mask of the implicit bit of a canonical normal fraction. */
#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
/* Mask of the bit directly above the implicit bit (carry-out detection). */
#define DECOMPOSED_OVERFLOW_BIT    (1ull << (DECOMPOSED_BINARY_POINT + 1))
226 
/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    /* All bits below frac_lsb, i.e. the bits discarded by rounding. */
    uint64_t round_mask;
    /* round_mask plus the frac_lsb bit itself (used for ties-to-even). */
    uint64_t roundeven_mask;
} FloatFmt;
249 
/* Expand the FloatFmt fields from the size of the exponent field (E)
 * and the size of the fraction field (F), both in bits.
 *
 * The expansion is a list of designated initializers and is meant to be
 * used inside the braces of a FloatFmt initializer.  Both arguments are
 * fully parenthesized so that expression arguments (e.g. "5 + 5") yield
 * the same result as the equivalent literal.
 */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size       = (E),                                           \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                         \
    .exp_max        = (1 << (E)) - 1,                                \
    .frac_size      = (F),                                           \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                 \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),       \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1), \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1, \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1
261 
262 static const FloatFmt float16_params = {
263     FLOAT_PARAMS(5, 10)
264 };
265 
266 static const FloatFmt float32_params = {
267     FLOAT_PARAMS(8, 23)
268 };
269 
270 static const FloatFmt float64_params = {
271     FLOAT_PARAMS(11, 52)
272 };
273 
274 /* Unpack a float to parts, but do not canonicalize.  */
275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
276 {
277     const int sign_pos = fmt.frac_size + fmt.exp_size;
278 
279     return (FloatParts) {
280         .cls = float_class_unclassified,
281         .sign = extract64(raw, sign_pos, 1),
282         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
283         .frac = extract64(raw, 0, fmt.frac_size),
284     };
285 }
286 
287 static inline FloatParts float16_unpack_raw(float16 f)
288 {
289     return unpack_raw(float16_params, f);
290 }
291 
292 static inline FloatParts float32_unpack_raw(float32 f)
293 {
294     return unpack_raw(float32_params, f);
295 }
296 
297 static inline FloatParts float64_unpack_raw(float64 f)
298 {
299     return unpack_raw(float64_params, f);
300 }
301 
302 /* Pack a float from parts, but do not canonicalize.  */
303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
304 {
305     const int sign_pos = fmt.frac_size + fmt.exp_size;
306     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
307     return deposit64(ret, sign_pos, 1, p.sign);
308 }
309 
310 static inline float16 float16_pack_raw(FloatParts p)
311 {
312     return make_float16(pack_raw(float16_params, p));
313 }
314 
315 static inline float32 float32_pack_raw(FloatParts p)
316 {
317     return make_float32(pack_raw(float32_params, p));
318 }
319 
320 static inline float64 float64_pack_raw(FloatParts p)
321 {
322     return make_float64(pack_raw(float64_params, p));
323 }
324 
325 /* Canonicalize EXP and FRAC, setting CLS.  */
326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
327                                float_status *status)
328 {
329     if (part.exp == parm->exp_max) {
330         if (part.frac == 0) {
331             part.cls = float_class_inf;
332         } else {
333 #ifdef NO_SIGNALING_NANS
334             part.cls = float_class_qnan;
335 #else
336             int64_t msb = part.frac << (parm->frac_shift + 2);
337             if ((msb < 0) == status->snan_bit_is_one) {
338                 part.cls = float_class_snan;
339             } else {
340                 part.cls = float_class_qnan;
341             }
342 #endif
343         }
344     } else if (part.exp == 0) {
345         if (likely(part.frac == 0)) {
346             part.cls = float_class_zero;
347         } else if (status->flush_inputs_to_zero) {
348             float_raise(float_flag_input_denormal, status);
349             part.cls = float_class_zero;
350             part.frac = 0;
351         } else {
352             int shift = clz64(part.frac) - 1;
353             part.cls = float_class_normal;
354             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
355             part.frac <<= shift;
356         }
357     } else {
358         part.cls = float_class_normal;
359         part.exp -= parm->exp_bias;
360         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
361     }
362     return part;
363 }
364 
/* Round and uncanonicalize a floating-point number by parts. There
 * are FRAC_SHIFT bits that may require rounding at the bottom of the
 * fraction; these bits will be removed. The exponent will be biased
 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 *
 *   p:    decomposed value; p.cls selects the handling below.
 *   s:    supplies the rounding mode, flush_to_zero and tininess
 *         detection settings; the exception flags collected here are
 *         reported through float_raise().
 *   parm: description of the destination format.
 *
 * Returns p with exp and frac rewritten as the raw field values of the
 * destination format; p.cls may be changed to zero or inf on underflow
 * or overflow.  Classes other than normal, zero, inf, qnan and snan
 * are not expected here and trip g_assert_not_reached().
 */

static FloatParts round_canonical(FloatParts p, float_status *s,
                                  const FloatFmt *parm)
{
    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
    const uint64_t round_mask = parm->round_mask;
    const uint64_t roundeven_mask = parm->roundeven_mask;
    const int exp_max = parm->exp_max;
    const int frac_shift = parm->frac_shift;
    uint64_t frac, inc;
    int exp, flags = 0;
    bool overflow_norm;

    frac = p.frac;
    exp = p.exp;

    switch (p.cls) {
    case float_class_normal:
        /* Pick the rounding increment (inc) and decide whether an
         * overflow saturates to the largest finite value
         * (overflow_norm) instead of producing infinity. */
        switch (s->float_rounding_mode) {
        case float_round_nearest_even:
            overflow_norm = false;
            /* An exact tie with a clear result lsb adds nothing (stays
             * even); everything else adds half an lsb. */
            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
            break;
        case float_round_ties_away:
            overflow_norm = false;
            inc = frac_lsbm1;
            break;
        case float_round_to_zero:
            overflow_norm = true;
            inc = 0;
            break;
        case float_round_up:
            inc = p.sign ? 0 : round_mask;
            overflow_norm = p.sign;
            break;
        case float_round_down:
            inc = p.sign ? round_mask : 0;
            overflow_norm = !p.sign;
            break;
        default:
            g_assert_not_reached();
        }

        /* Re-apply the format's exponent bias. */
        exp += parm->exp_bias;
        if (likely(exp > 0)) {
            /* Biased exponent is positive: normal-range result. */
            if (frac & round_mask) {
                flags |= float_flag_inexact;
                frac += inc;
                /* Rounding carried out of the implicit bit: renormalize. */
                if (frac & DECOMPOSED_OVERFLOW_BIT) {
                    frac >>= 1;
                    exp++;
                }
            }
            /* Drop the bits that were rounded away. */
            frac >>= frac_shift;

            if (unlikely(exp >= exp_max)) {
                flags |= float_flag_overflow | float_flag_inexact;
                if (overflow_norm) {
                    /* Saturate to the largest finite exponent with an
                     * all-ones fraction.
                     * NOTE(review): frac = -1 also sets bits above the
                     * fraction field; this presumably relies on the
                     * pack step overwriting/truncating them - confirm. */
                    exp = exp_max - 1;
                    frac = -1;
                } else {
                    p.cls = float_class_inf;
                    goto do_inf;
                }
            }
        } else if (s->flush_to_zero) {
            /* Result would be denormal and the status asks for such
             * results to be flushed to zero. */
            flags |= float_flag_output_denormal;
            p.cls = float_class_zero;
            goto do_zero;
        } else {
            /* Denormal result.  Tininess is evaluated before the
             * fraction is shifted: always when detection is "before
             * rounding", otherwise when the biased exponent is negative
             * or adding inc does not carry into the overflow bit. */
            bool is_tiny = (s->float_detect_tininess
                            == float_tininess_before_rounding)
                        || (exp < 0)
                        || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);

            /* Shift right by 1 - exp bits (helper defined elsewhere;
             * its name suggests shifted-out bits are kept sticky). */
            shift64RightJamming(frac, 1 - exp, &frac);
            if (frac & round_mask) {
                /* Need to recompute round-to-even.  */
                if (s->float_rounding_mode == float_round_nearest_even) {
                    inc = ((frac & roundeven_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                }
                flags |= float_flag_inexact;
                frac += inc;
            }

            /* If rounding reached the implicit bit the exponent field
             * becomes 1, otherwise it stays 0. */
            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
            frac >>= frac_shift;

            if (is_tiny && (flags & float_flag_inexact)) {
                flags |= float_flag_underflow;
            }
            if (exp == 0 && frac == 0) {
                p.cls = float_class_zero;
            }
        }
        break;

    case float_class_zero:
    do_zero:
        exp = 0;
        frac = 0;
        break;

    case float_class_inf:
    do_inf:
        exp = exp_max;
        frac = 0;
        break;

    case float_class_qnan:
    case float_class_snan:
        /* NaNs only get the maximum exponent; frac is passed through. */
        exp = exp_max;
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    p.exp = exp;
    p.frac = frac;
    return p;
}
494 
495 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
496 {
497     return canonicalize(float16_unpack_raw(f), &float16_params, s);
498 }
499 
500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
501 {
502     switch (p.cls) {
503     case float_class_dnan:
504         return float16_default_nan(s);
505     case float_class_msnan:
506         return float16_maybe_silence_nan(float16_pack_raw(p), s);
507     default:
508         p = round_canonical(p, s, &float16_params);
509         return float16_pack_raw(p);
510     }
511 }
512 
513 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
514 {
515     return canonicalize(float32_unpack_raw(f), &float32_params, s);
516 }
517 
518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
519 {
520     switch (p.cls) {
521     case float_class_dnan:
522         return float32_default_nan(s);
523     case float_class_msnan:
524         return float32_maybe_silence_nan(float32_pack_raw(p), s);
525     default:
526         p = round_canonical(p, s, &float32_params);
527         return float32_pack_raw(p);
528     }
529 }
530 
531 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
532 {
533     return canonicalize(float64_unpack_raw(f), &float64_params, s);
534 }
535 
536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
537 {
538     switch (p.cls) {
539     case float_class_dnan:
540         return float64_default_nan(s);
541     case float_class_msnan:
542         return float64_maybe_silence_nan(float64_pack_raw(p), s);
543     default:
544         p = round_canonical(p, s, &float64_params);
545         return float64_pack_raw(p);
546     }
547 }
548 
549 /* Simple helpers for checking if what NaN we have */
550 static bool is_nan(FloatClass c)
551 {
552     return unlikely(c >= float_class_qnan);
553 }
554 static bool is_snan(FloatClass c)
555 {
556     return c == float_class_snan;
557 }
558 static bool is_qnan(FloatClass c)
559 {
560     return c == float_class_qnan;
561 }
562 
563 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
564 {
565     if (is_snan(a.cls) || is_snan(b.cls)) {
566         s->float_exception_flags |= float_flag_invalid;
567     }
568 
569     if (s->default_nan_mode) {
570         a.cls = float_class_dnan;
571     } else {
572         if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
573                     is_qnan(b.cls), is_snan(b.cls),
574                     a.frac > b.frac ||
575                     (a.frac == b.frac && a.sign < b.sign))) {
576             a = b;
577         }
578         a.cls = float_class_msnan;
579     }
580     return a;
581 }
582 
/*
 * Returns the result of adding or subtracting the values of the
 * floating-point values `a' and `b'. The operation is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic.
 *
 * Both operands are decomposed (canonical) parts; `subtract' flips the
 * effective sign of `b'.  The result is returned unrounded: callers
 * pass it on to one of the *_round_pack_canonical() helpers.  Every
 * valid class combination returns from inside the if/else ladder; the
 * trailing g_assert_not_reached() guards against unexpected classes.
 */

static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
                                float_status *s)
{
    bool a_sign = a.sign;
    bool b_sign = b.sign ^ subtract;

    if (a_sign != b_sign) {
        /* Subtraction */

        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Subtract the smaller magnitude from the larger one after
             * aligning the smaller operand's fraction. */
            if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
                a.frac = a.frac - b.frac;
            } else {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.frac = b.frac - a.frac;
                a.exp = b.exp;
                /* |b| was larger, so the result takes the opposite sign. */
                a_sign ^= 1;
            }

            if (a.frac == 0) {
                /* Exact cancellation: the zero is negative only when
                 * rounding down. */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
            } else {
                /* Renormalize so the top set bit is back at
                 * DECOMPOSED_BINARY_POINT. */
                int shift = clz64(a.frac) - 1;
                a.frac = a.frac << shift;
                a.exp = a.exp - shift;
                a.sign = a_sign;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf) {
            if (b.cls == float_class_inf) {
                /* inf - inf is invalid and yields the default NaN. */
                float_raise(float_flag_invalid, s);
                a.cls = float_class_dnan;
            }
            return a;
        }
        if (a.cls == float_class_zero && b.cls == float_class_zero) {
            a.sign = s->float_rounding_mode == float_round_down;
            return a;
        }
        if (a.cls == float_class_zero || b.cls == float_class_inf) {
            /* Result is b with its effective sign, which in this branch
             * is the opposite of a's sign. */
            b.sign = a_sign ^ 1;
            return b;
        }
        if (b.cls == float_class_zero) {
            return a;
        }
    } else {
        /* Addition */
        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Align the operand with the smaller exponent, then add. */
            if (a.exp > b.exp) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
            } else if (a.exp < b.exp) {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.exp = b.exp;
            }
            a.frac += b.frac;
            /* A carry out of the implicit bit needs one renormalizing
             * shift. */
            if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
                a.frac >>= 1;
                a.exp += 1;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf || b.cls == float_class_zero) {
            return a;
        }
        if (b.cls == float_class_inf || a.cls == float_class_zero) {
            b.sign = b_sign;
            return b;
        }
    }
    g_assert_not_reached();
}
671 
672 /*
673  * Returns the result of adding or subtracting the floating-point
674  * values `a' and `b'. The operation is performed according to the
675  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
676  */
677 
678 float16  __attribute__((flatten)) float16_add(float16 a, float16 b,
679                                               float_status *status)
680 {
681     FloatParts pa = float16_unpack_canonical(a, status);
682     FloatParts pb = float16_unpack_canonical(b, status);
683     FloatParts pr = addsub_floats(pa, pb, false, status);
684 
685     return float16_round_pack_canonical(pr, status);
686 }
687 
688 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
689                                              float_status *status)
690 {
691     FloatParts pa = float32_unpack_canonical(a, status);
692     FloatParts pb = float32_unpack_canonical(b, status);
693     FloatParts pr = addsub_floats(pa, pb, false, status);
694 
695     return float32_round_pack_canonical(pr, status);
696 }
697 
698 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
699                                              float_status *status)
700 {
701     FloatParts pa = float64_unpack_canonical(a, status);
702     FloatParts pb = float64_unpack_canonical(b, status);
703     FloatParts pr = addsub_floats(pa, pb, false, status);
704 
705     return float64_round_pack_canonical(pr, status);
706 }
707 
708 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
709                                              float_status *status)
710 {
711     FloatParts pa = float16_unpack_canonical(a, status);
712     FloatParts pb = float16_unpack_canonical(b, status);
713     FloatParts pr = addsub_floats(pa, pb, true, status);
714 
715     return float16_round_pack_canonical(pr, status);
716 }
717 
718 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
719                                              float_status *status)
720 {
721     FloatParts pa = float32_unpack_canonical(a, status);
722     FloatParts pb = float32_unpack_canonical(b, status);
723     FloatParts pr = addsub_floats(pa, pb, true, status);
724 
725     return float32_round_pack_canonical(pr, status);
726 }
727 
728 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
729                                              float_status *status)
730 {
731     FloatParts pa = float64_unpack_canonical(a, status);
732     FloatParts pb = float64_unpack_canonical(b, status);
733     FloatParts pr = addsub_floats(pa, pb, true, status);
734 
735     return float64_round_pack_canonical(pr, status);
736 }
737 
/*----------------------------------------------------------------------------
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
| and 7, and returns the properly rounded 32-bit integer corresponding to the
| input.  If `zSign' is 1, the input is negated before being converted to an
| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
| is simply rounded to an integer, with the inexact exception raised if the
| input cannot be represented exactly as an integer.  However, if the fixed-
| point input is too large, the invalid exception is raised and the largest
| positive or negative integer is returned.
| The rounding mode is taken from `status'; an unknown rounding mode aborts.
*----------------------------------------------------------------------------*/

static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven;
    int8_t roundIncrement, roundBits;
    int32_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Pick the value added to the 7 fraction bits before truncation:
       0x40 is one half, 0x7f rounds any non-zero fraction away from zero. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    default:
        abort();
    }
    /* Remember the discarded fraction bits for the tie and inexact checks. */
    roundBits = absZ & 0x7F;
    absZ = ( absZ + roundIncrement )>>7;
    /* Exact tie in nearest-even mode: clear the lsb to make the result even. */
    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    z = absZ;
    if ( zSign ) z = - z;
    /* Overflow if the magnitude needs more than 32 bits, or if the sign of
       the non-zero result disagrees with the requested sign. */
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
        float_raise(float_flag_invalid, status);
        return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return z;

}
790 
/*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit integer corresponding to the input.
| If `zSign' is 1, the input is negated before being converted to an integer.
| Ordinarily, the fixed-point input is simply rounded to an integer, with
| the inexact exception raised if the input cannot be represented exactly as
| an integer.  However, if the fixed-point input is too large, the invalid
| exception is raised and the largest positive or negative integer is
| returned.
| The rounding mode is taken from `status'; an unknown rounding mode aborts.
*----------------------------------------------------------------------------*/

static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
                               float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment;
    int64_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Decide whether the integer part (absZ0) must be bumped by one based on
       the fraction word (absZ1). */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Top fraction bit set, i.e. fraction is at least one half. */
        increment = ((int64_t) absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    default:
        abort();
    }
    if ( increment ) {
        ++absZ0;
        /* Wrapped around to zero: magnitude no longer fits in 64 bits. */
        if ( absZ0 == 0 ) goto overflow;
        /* Fraction was exactly one half in nearest-even mode: clear the lsb
           so the result is even. */
        absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
    }
    z = absZ0;
    if ( zSign ) z = - z;
    /* A non-zero result whose sign disagrees with zSign means overflow. */
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 overflow:
        float_raise(float_flag_invalid, status);
        return
              zSign ? (int64_t) LIT64( 0x8000000000000000 )
            : LIT64( 0x7FFFFFFFFFFFFFFF );
    }
    if (absZ1) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return z;

}
849 
850 /*----------------------------------------------------------------------------
851 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
852 | `absZ1', with binary point between bits 63 and 64 (between the input words),
853 | and returns the properly rounded 64-bit unsigned integer corresponding to the
854 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
855 | with the inexact exception raised if the input cannot be represented exactly
856 | as an integer.  However, if the fixed-point input is too large, the invalid
857 | exception is raised and the largest unsigned integer is returned.
858 *----------------------------------------------------------------------------*/
859 
860 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
861                                 uint64_t absZ1, float_status *status)
862 {
863     int8_t roundingMode;
864     flag roundNearestEven, increment;
865 
866     roundingMode = status->float_rounding_mode;
867     roundNearestEven = (roundingMode == float_round_nearest_even);
868     switch (roundingMode) {
869     case float_round_nearest_even:
870     case float_round_ties_away:
871         increment = ((int64_t)absZ1 < 0);
872         break;
873     case float_round_to_zero:
874         increment = 0;
875         break;
876     case float_round_up:
877         increment = !zSign && absZ1;
878         break;
879     case float_round_down:
880         increment = zSign && absZ1;
881         break;
882     default:
883         abort();
884     }
885     if (increment) {
886         ++absZ0;
887         if (absZ0 == 0) {
888             float_raise(float_flag_invalid, status);
889             return LIT64(0xFFFFFFFFFFFFFFFF);
890         }
891         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
892     }
893 
894     if (zSign && absZ0) {
895         float_raise(float_flag_invalid, status);
896         return 0;
897     }
898 
899     if (absZ1) {
900         status->float_exception_flags |= float_flag_inexact;
901     }
902     return absZ0;
903 }
904 
905 /*----------------------------------------------------------------------------
906 | If `a' is denormal and we are in flush-to-zero mode then set the
907 | input-denormal exception and return zero. Otherwise just return the value.
908 *----------------------------------------------------------------------------*/
909 float32 float32_squash_input_denormal(float32 a, float_status *status)
910 {
911     if (status->flush_inputs_to_zero) {
912         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
913             float_raise(float_flag_input_denormal, status);
914             return make_float32(float32_val(a) & 0x80000000);
915         }
916     }
917     return a;
918 }
919 
/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision significand `aSig'.  The
| significand is shifted left by (leading zero count - 8) bits and stored
| through `zSigPtr'; the matching exponent, 1 minus that shift, is stored
| through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    int8_t lshift = countLeadingZeros32(aSig) - 8;

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
937 
938 /*----------------------------------------------------------------------------
939 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
940 | single-precision floating-point value, returning the result.  After being
941 | shifted into the proper positions, the three fields are simply added
942 | together to form the result.  This means that any integer portion of `zSig'
943 | will be added into the exponent.  Since a properly normalized significand
944 | will have an integer portion equal to 1, the `zExp' input should be 1 less
945 | than the desired result exponent whenever `zSig' is a complete, normalized
946 | significand.
947 *----------------------------------------------------------------------------*/
948 
949 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
950 {
951 
952     return make_float32(
953           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
954 
955 }
956 
957 /*----------------------------------------------------------------------------
958 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
959 | and significand `zSig', and returns the proper single-precision floating-
960 | point value corresponding to the abstract input.  Ordinarily, the abstract
961 | value is simply rounded and packed into the single-precision format, with
962 | the inexact exception raised if the abstract input cannot be represented
963 | exactly.  However, if the abstract value is too large, the overflow and
964 | inexact exceptions are raised and an infinity or maximal finite value is
965 | returned.  If the abstract value is too small, the input value is rounded to
966 | a subnormal number, and the underflow and inexact exceptions are raised if
967 | the abstract input cannot be represented exactly as a subnormal single-
968 | precision floating-point number.
969 |     The input significand `zSig' has its binary point between bits 30
970 | and 29, which is 7 bits to the left of the usual location.  This shifted
971 | significand must be normalized or smaller.  If `zSig' is not normalized,
972 | `zExp' must be 0; in that case, the result returned is a subnormal number,
973 | and it must not require rounding.  In the usual case that `zSig' is
974 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
975 | The handling of underflow and overflow follows the IEC/IEEE Standard for
976 | Binary Floating-Point Arithmetic.
977 *----------------------------------------------------------------------------*/
978 
979 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
980                                    float_status *status)
981 {
982     int8_t roundingMode;
983     flag roundNearestEven;
984     int8_t roundIncrement, roundBits;
985     flag isTiny;
986 
987     roundingMode = status->float_rounding_mode;
988     roundNearestEven = ( roundingMode == float_round_nearest_even );
989     switch (roundingMode) {
990     case float_round_nearest_even:
991     case float_round_ties_away:
992         roundIncrement = 0x40;
993         break;
994     case float_round_to_zero:
995         roundIncrement = 0;
996         break;
997     case float_round_up:
998         roundIncrement = zSign ? 0 : 0x7f;
999         break;
1000     case float_round_down:
1001         roundIncrement = zSign ? 0x7f : 0;
1002         break;
1003     default:
1004         abort();
1005         break;
1006     }
1007     roundBits = zSig & 0x7F;
1008     if ( 0xFD <= (uint16_t) zExp ) {
1009         if (    ( 0xFD < zExp )
1010              || (    ( zExp == 0xFD )
1011                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
1012            ) {
1013             float_raise(float_flag_overflow | float_flag_inexact, status);
1014             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
1015         }
1016         if ( zExp < 0 ) {
1017             if (status->flush_to_zero) {
1018                 float_raise(float_flag_output_denormal, status);
1019                 return packFloat32(zSign, 0, 0);
1020             }
1021             isTiny =
1022                 (status->float_detect_tininess
1023                  == float_tininess_before_rounding)
1024                 || ( zExp < -1 )
1025                 || ( zSig + roundIncrement < 0x80000000 );
1026             shift32RightJamming( zSig, - zExp, &zSig );
1027             zExp = 0;
1028             roundBits = zSig & 0x7F;
1029             if (isTiny && roundBits) {
1030                 float_raise(float_flag_underflow, status);
1031             }
1032         }
1033     }
1034     if (roundBits) {
1035         status->float_exception_flags |= float_flag_inexact;
1036     }
1037     zSig = ( zSig + roundIncrement )>>7;
1038     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
1039     if ( zSig == 0 ) zExp = 0;
1040     return packFloat32( zSign, zExp, zSig );
1041 
1042 }
1043 
1044 /*----------------------------------------------------------------------------
1045 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1046 | and significand `zSig', and returns the proper single-precision floating-
1047 | point value corresponding to the abstract input.  This routine is just like
1048 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
1049 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
1050 | floating-point exponent.
1051 *----------------------------------------------------------------------------*/
1052 
1053 static float32
1054  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
1055                               float_status *status)
1056 {
1057     int8_t shiftCount;
1058 
1059     shiftCount = countLeadingZeros32( zSig ) - 1;
1060     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
1061                                status);
1062 
1063 }
1064 
1065 /*----------------------------------------------------------------------------
1066 | If `a' is denormal and we are in flush-to-zero mode then set the
1067 | input-denormal exception and return zero. Otherwise just return the value.
1068 *----------------------------------------------------------------------------*/
1069 float64 float64_squash_input_denormal(float64 a, float_status *status)
1070 {
1071     if (status->flush_inputs_to_zero) {
1072         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
1073             float_raise(float_flag_input_denormal, status);
1074             return make_float64(float64_val(a) & (1ULL << 63));
1075         }
1076     }
1077     return a;
1078 }
1079 
/*----------------------------------------------------------------------------
| Normalizes the subnormal double-precision significand `aSig'.  The
| significand is shifted left by (leading zero count - 11) bits and stored
| through `zSigPtr'; the matching exponent, 1 minus that shift, is stored
| through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    int8_t lshift = countLeadingZeros64(aSig) - 11;

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
1097 
1098 /*----------------------------------------------------------------------------
1099 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
1100 | double-precision floating-point value, returning the result.  After being
1101 | shifted into the proper positions, the three fields are simply added
1102 | together to form the result.  This means that any integer portion of `zSig'
1103 | will be added into the exponent.  Since a properly normalized significand
1104 | will have an integer portion equal to 1, the `zExp' input should be 1 less
1105 | than the desired result exponent whenever `zSig' is a complete, normalized
1106 | significand.
1107 *----------------------------------------------------------------------------*/
1108 
1109 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
1110 {
1111 
1112     return make_float64(
1113         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
1114 
1115 }
1116 
1117 /*----------------------------------------------------------------------------
1118 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1119 | and significand `zSig', and returns the proper double-precision floating-
1120 | point value corresponding to the abstract input.  Ordinarily, the abstract
1121 | value is simply rounded and packed into the double-precision format, with
1122 | the inexact exception raised if the abstract input cannot be represented
1123 | exactly.  However, if the abstract value is too large, the overflow and
1124 | inexact exceptions are raised and an infinity or maximal finite value is
1125 | returned.  If the abstract value is too small, the input value is rounded to
1126 | a subnormal number, and the underflow and inexact exceptions are raised if
1127 | the abstract input cannot be represented exactly as a subnormal double-
1128 | precision floating-point number.
1129 |     The input significand `zSig' has its binary point between bits 62
1130 | and 61, which is 10 bits to the left of the usual location.  This shifted
1131 | significand must be normalized or smaller.  If `zSig' is not normalized,
1132 | `zExp' must be 0; in that case, the result returned is a subnormal number,
1133 | and it must not require rounding.  In the usual case that `zSig' is
1134 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
1135 | The handling of underflow and overflow follows the IEC/IEEE Standard for
1136 | Binary Floating-Point Arithmetic.
1137 *----------------------------------------------------------------------------*/
1138 
1139 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
1140                                    float_status *status)
1141 {
1142     int8_t roundingMode;
1143     flag roundNearestEven;
1144     int roundIncrement, roundBits;
1145     flag isTiny;
1146 
1147     roundingMode = status->float_rounding_mode;
1148     roundNearestEven = ( roundingMode == float_round_nearest_even );
1149     switch (roundingMode) {
1150     case float_round_nearest_even:
1151     case float_round_ties_away:
1152         roundIncrement = 0x200;
1153         break;
1154     case float_round_to_zero:
1155         roundIncrement = 0;
1156         break;
1157     case float_round_up:
1158         roundIncrement = zSign ? 0 : 0x3ff;
1159         break;
1160     case float_round_down:
1161         roundIncrement = zSign ? 0x3ff : 0;
1162         break;
1163     case float_round_to_odd:
1164         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
1165         break;
1166     default:
1167         abort();
1168     }
1169     roundBits = zSig & 0x3FF;
1170     if ( 0x7FD <= (uint16_t) zExp ) {
1171         if (    ( 0x7FD < zExp )
1172              || (    ( zExp == 0x7FD )
1173                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
1174            ) {
1175             bool overflow_to_inf = roundingMode != float_round_to_odd &&
1176                                    roundIncrement != 0;
1177             float_raise(float_flag_overflow | float_flag_inexact, status);
1178             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
1179         }
1180         if ( zExp < 0 ) {
1181             if (status->flush_to_zero) {
1182                 float_raise(float_flag_output_denormal, status);
1183                 return packFloat64(zSign, 0, 0);
1184             }
1185             isTiny =
1186                    (status->float_detect_tininess
1187                     == float_tininess_before_rounding)
1188                 || ( zExp < -1 )
1189                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
1190             shift64RightJamming( zSig, - zExp, &zSig );
1191             zExp = 0;
1192             roundBits = zSig & 0x3FF;
1193             if (isTiny && roundBits) {
1194                 float_raise(float_flag_underflow, status);
1195             }
1196             if (roundingMode == float_round_to_odd) {
1197                 /*
1198                  * For round-to-odd case, the roundIncrement depends on
1199                  * zSig which just changed.
1200                  */
1201                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
1202             }
1203         }
1204     }
1205     if (roundBits) {
1206         status->float_exception_flags |= float_flag_inexact;
1207     }
1208     zSig = ( zSig + roundIncrement )>>10;
1209     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
1210     if ( zSig == 0 ) zExp = 0;
1211     return packFloat64( zSign, zExp, zSig );
1212 
1213 }
1214 
1215 /*----------------------------------------------------------------------------
1216 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1217 | and significand `zSig', and returns the proper double-precision floating-
1218 | point value corresponding to the abstract input.  This routine is just like
1219 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
1220 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
1221 | floating-point exponent.
1222 *----------------------------------------------------------------------------*/
1223 
1224 static float64
1225  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
1226                               float_status *status)
1227 {
1228     int8_t shiftCount;
1229 
1230     shiftCount = countLeadingZeros64( zSig ) - 1;
1231     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
1232                                status);
1233 
1234 }
1235 
1236 /*----------------------------------------------------------------------------
1237 | Returns the fraction bits of the extended double-precision floating-point
1238 | value `a'.
1239 *----------------------------------------------------------------------------*/
1240 
1241 static inline uint64_t extractFloatx80Frac( floatx80 a )
1242 {
1243 
1244     return a.low;
1245 
1246 }
1247 
1248 /*----------------------------------------------------------------------------
1249 | Returns the exponent bits of the extended double-precision floating-point
1250 | value `a'.
1251 *----------------------------------------------------------------------------*/
1252 
1253 static inline int32_t extractFloatx80Exp( floatx80 a )
1254 {
1255 
1256     return a.high & 0x7FFF;
1257 
1258 }
1259 
1260 /*----------------------------------------------------------------------------
1261 | Returns the sign bit of the extended double-precision floating-point value
1262 | `a'.
1263 *----------------------------------------------------------------------------*/
1264 
1265 static inline flag extractFloatx80Sign( floatx80 a )
1266 {
1267 
1268     return a.high>>15;
1269 
1270 }
1271 
/*----------------------------------------------------------------------------
| Normalizes the subnormal extended double-precision significand `aSig'.
| The significand is shifted left by its leading zero count and stored
| through `zSigPtr'; the matching exponent, 1 minus that shift, is stored
| through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr )
{
    int8_t lshift = countLeadingZeros64(aSig);

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
1289 
1290 /*----------------------------------------------------------------------------
1291 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
1292 | extended double-precision floating-point value, returning the result.
1293 *----------------------------------------------------------------------------*/
1294 
1295 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
1296 {
1297     floatx80 z;
1298 
1299     z.low = zSig;
1300     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
1301     return z;
1302 
1303 }
1304 
1305 /*----------------------------------------------------------------------------
1306 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1307 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
1308 | and returns the proper extended double-precision floating-point value
1309 | corresponding to the abstract input.  Ordinarily, the abstract value is
1310 | rounded and packed into the extended double-precision format, with the
1311 | inexact exception raised if the abstract input cannot be represented
1312 | exactly.  However, if the abstract value is too large, the overflow and
1313 | inexact exceptions are raised and an infinity or maximal finite value is
1314 | returned.  If the abstract value is too small, the input value is rounded to
1315 | a subnormal number, and the underflow and inexact exceptions are raised if
1316 | the abstract input cannot be represented exactly as a subnormal extended
1317 | double-precision floating-point number.
1318 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
1319 | number of bits as single or double precision, respectively.  Otherwise, the
1320 | result is rounded to the full precision of the extended double-precision
1321 | format.
1322 |     The input significand must be normalized or smaller.  If the input
1323 | significand is not normalized, `zExp' must be 0; in that case, the result
1324 | returned is a subnormal number, and it must not require rounding.  The
1325 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
1326 | Floating-Point Arithmetic.
1327 *----------------------------------------------------------------------------*/
1328 
1329 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
1330                                      int32_t zExp, uint64_t zSig0, uint64_t zSig1,
1331                                      float_status *status)
1332 {
1333     int8_t roundingMode;
1334     flag roundNearestEven, increment, isTiny;
1335     int64_t roundIncrement, roundMask, roundBits;
1336 
1337     roundingMode = status->float_rounding_mode;
1338     roundNearestEven = ( roundingMode == float_round_nearest_even );
1339     if ( roundingPrecision == 80 ) goto precision80;
1340     if ( roundingPrecision == 64 ) {
1341         roundIncrement = LIT64( 0x0000000000000400 );
1342         roundMask = LIT64( 0x00000000000007FF );
1343     }
1344     else if ( roundingPrecision == 32 ) {
1345         roundIncrement = LIT64( 0x0000008000000000 );
1346         roundMask = LIT64( 0x000000FFFFFFFFFF );
1347     }
1348     else {
1349         goto precision80;
1350     }
1351     zSig0 |= ( zSig1 != 0 );
1352     switch (roundingMode) {
1353     case float_round_nearest_even:
1354     case float_round_ties_away:
1355         break;
1356     case float_round_to_zero:
1357         roundIncrement = 0;
1358         break;
1359     case float_round_up:
1360         roundIncrement = zSign ? 0 : roundMask;
1361         break;
1362     case float_round_down:
1363         roundIncrement = zSign ? roundMask : 0;
1364         break;
1365     default:
1366         abort();
1367     }
1368     roundBits = zSig0 & roundMask;
1369     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
1370         if (    ( 0x7FFE < zExp )
1371              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
1372            ) {
1373             goto overflow;
1374         }
1375         if ( zExp <= 0 ) {
1376             if (status->flush_to_zero) {
1377                 float_raise(float_flag_output_denormal, status);
1378                 return packFloatx80(zSign, 0, 0);
1379             }
1380             isTiny =
1381                    (status->float_detect_tininess
1382                     == float_tininess_before_rounding)
1383                 || ( zExp < 0 )
1384                 || ( zSig0 <= zSig0 + roundIncrement );
1385             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
1386             zExp = 0;
1387             roundBits = zSig0 & roundMask;
1388             if (isTiny && roundBits) {
1389                 float_raise(float_flag_underflow, status);
1390             }
1391             if (roundBits) {
1392                 status->float_exception_flags |= float_flag_inexact;
1393             }
1394             zSig0 += roundIncrement;
1395             if ( (int64_t) zSig0 < 0 ) zExp = 1;
1396             roundIncrement = roundMask + 1;
1397             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
1398                 roundMask |= roundIncrement;
1399             }
1400             zSig0 &= ~ roundMask;
1401             return packFloatx80( zSign, zExp, zSig0 );
1402         }
1403     }
1404     if (roundBits) {
1405         status->float_exception_flags |= float_flag_inexact;
1406     }
1407     zSig0 += roundIncrement;
1408     if ( zSig0 < roundIncrement ) {
1409         ++zExp;
1410         zSig0 = LIT64( 0x8000000000000000 );
1411     }
1412     roundIncrement = roundMask + 1;
1413     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
1414         roundMask |= roundIncrement;
1415     }
1416     zSig0 &= ~ roundMask;
1417     if ( zSig0 == 0 ) zExp = 0;
1418     return packFloatx80( zSign, zExp, zSig0 );
1419  precision80:
1420     switch (roundingMode) {
1421     case float_round_nearest_even:
1422     case float_round_ties_away:
1423         increment = ((int64_t)zSig1 < 0);
1424         break;
1425     case float_round_to_zero:
1426         increment = 0;
1427         break;
1428     case float_round_up:
1429         increment = !zSign && zSig1;
1430         break;
1431     case float_round_down:
1432         increment = zSign && zSig1;
1433         break;
1434     default:
1435         abort();
1436     }
1437     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
1438         if (    ( 0x7FFE < zExp )
1439              || (    ( zExp == 0x7FFE )
1440                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
1441                   && increment
1442                 )
1443            ) {
1444             roundMask = 0;
1445  overflow:
1446             float_raise(float_flag_overflow | float_flag_inexact, status);
1447             if (    ( roundingMode == float_round_to_zero )
1448                  || ( zSign && ( roundingMode == float_round_up ) )
1449                  || ( ! zSign && ( roundingMode == float_round_down ) )
1450                ) {
1451                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
1452             }
1453             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1454         }
1455         if ( zExp <= 0 ) {
1456             isTiny =
1457                    (status->float_detect_tininess
1458                     == float_tininess_before_rounding)
1459                 || ( zExp < 0 )
1460                 || ! increment
1461                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
1462             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
1463             zExp = 0;
1464             if (isTiny && zSig1) {
1465                 float_raise(float_flag_underflow, status);
1466             }
1467             if (zSig1) {
1468                 status->float_exception_flags |= float_flag_inexact;
1469             }
1470             switch (roundingMode) {
1471             case float_round_nearest_even:
1472             case float_round_ties_away:
1473                 increment = ((int64_t)zSig1 < 0);
1474                 break;
1475             case float_round_to_zero:
1476                 increment = 0;
1477                 break;
1478             case float_round_up:
1479                 increment = !zSign && zSig1;
1480                 break;
1481             case float_round_down:
1482                 increment = zSign && zSig1;
1483                 break;
1484             default:
1485                 abort();
1486             }
1487             if ( increment ) {
1488                 ++zSig0;
1489                 zSig0 &=
1490                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
1491                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
1492             }
1493             return packFloatx80( zSign, zExp, zSig0 );
1494         }
1495     }
1496     if (zSig1) {
1497         status->float_exception_flags |= float_flag_inexact;
1498     }
1499     if ( increment ) {
1500         ++zSig0;
1501         if ( zSig0 == 0 ) {
1502             ++zExp;
1503             zSig0 = LIT64( 0x8000000000000000 );
1504         }
1505         else {
1506             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
1507         }
1508     }
1509     else {
1510         if ( zSig0 == 0 ) zExp = 0;
1511     }
1512     return packFloatx80( zSign, zExp, zSig0 );
1513 
1514 }
1515 
1516 /*----------------------------------------------------------------------------
1517 | Takes an abstract floating-point value having sign `zSign', exponent
1518 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
1519 | and returns the proper extended double-precision floating-point value
1520 | corresponding to the abstract input.  This routine is just like
1521 | `roundAndPackFloatx80' except that the input significand does not have to be
1522 | normalized.
1523 *----------------------------------------------------------------------------*/
1524 
1525 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
1526                                               flag zSign, int32_t zExp,
1527                                               uint64_t zSig0, uint64_t zSig1,
1528                                               float_status *status)
1529 {
1530     int8_t shiftCount;
1531 
1532     if ( zSig0 == 0 ) {
1533         zSig0 = zSig1;
1534         zSig1 = 0;
1535         zExp -= 64;
1536     }
1537     shiftCount = countLeadingZeros64( zSig0 );
1538     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1539     zExp -= shiftCount;
1540     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
1541                                 zSig0, zSig1, status);
1542 
1543 }
1544 
1545 /*----------------------------------------------------------------------------
1546 | Returns the least-significant 64 fraction bits of the quadruple-precision
1547 | floating-point value `a'.
1548 *----------------------------------------------------------------------------*/
1549 
1550 static inline uint64_t extractFloat128Frac1( float128 a )
1551 {
1552 
1553     return a.low;
1554 
1555 }
1556 
1557 /*----------------------------------------------------------------------------
1558 | Returns the most-significant 48 fraction bits of the quadruple-precision
1559 | floating-point value `a'.
1560 *----------------------------------------------------------------------------*/
1561 
1562 static inline uint64_t extractFloat128Frac0( float128 a )
1563 {
1564 
1565     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
1566 
1567 }
1568 
1569 /*----------------------------------------------------------------------------
1570 | Returns the exponent bits of the quadruple-precision floating-point value
1571 | `a'.
1572 *----------------------------------------------------------------------------*/
1573 
1574 static inline int32_t extractFloat128Exp( float128 a )
1575 {
1576 
1577     return ( a.high>>48 ) & 0x7FFF;
1578 
1579 }
1580 
1581 /*----------------------------------------------------------------------------
1582 | Returns the sign bit of the quadruple-precision floating-point value `a'.
1583 *----------------------------------------------------------------------------*/
1584 
1585 static inline flag extractFloat128Sign( float128 a )
1586 {
1587 
1588     return a.high>>63;
1589 
1590 }
1591 
1592 /*----------------------------------------------------------------------------
1593 | Normalizes the subnormal quadruple-precision floating-point value
1594 | represented by the denormalized significand formed by the concatenation of
1595 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
1596 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
1597 | significand are stored at the location pointed to by `zSig0Ptr', and the
1598 | least significant 64 bits of the normalized significand are stored at the
1599 | location pointed to by `zSig1Ptr'.
1600 *----------------------------------------------------------------------------*/
1601 
static void normalizeFloat128Subnormal(uint64_t aSig0, uint64_t aSig1,
                                       int32_t *zExpPtr,
                                       uint64_t *zSig0Ptr,
                                       uint64_t *zSig1Ptr)
{
    int8_t normShift;

    if (aSig0 != 0) {
        /*
         * The leading one is already in the high word: a 128-bit left shift
         * by (leading zeros - 15) places it at bit 48 of the high word.
         */
        normShift = countLeadingZeros64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, normShift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - normShift;
        return;
    }

    /* Only the low word is populated; it has to cross the word boundary. */
    normShift = countLeadingZeros64(aSig1) - 15;
    if (normShift >= 0) {
        /* All significant bits fit into the high word. */
        *zSig0Ptr = aSig1 << normShift;
        *zSig1Ptr = 0;
    } else {
        /* Split the low word: top part goes high, the remainder stays low. */
        *zSig0Ptr = aSig1 >> (-normShift);
        *zSig1Ptr = aSig1 << (normShift & 63);
    }
    /* The extra -63 (versus the 1 above) accounts for the 64-bit word move. */
    *zExpPtr = -normShift - 63;
}
1632 
1633 /*----------------------------------------------------------------------------
1634 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1635 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1636 | floating-point value, returning the result.  After being shifted into the
1637 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
| added together to form the most significant 64 bits of the result.  This
1639 | means that any integer portion of `zSig0' will be added into the exponent.
1640 | Since a properly normalized significand will have an integer portion equal
1641 | to 1, the `zExp' input should be 1 less than the desired result exponent
1642 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1643 | significand.
1644 *----------------------------------------------------------------------------*/
1645 
1646 static inline float128
1647  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
1648 {
1649     float128 z;
1650 
1651     z.low = zSig1;
1652     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1653     return z;
1654 
1655 }
1656 
1657 /*----------------------------------------------------------------------------
1658 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1659 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1660 | and `zSig2', and returns the proper quadruple-precision floating-point value
1661 | corresponding to the abstract input.  Ordinarily, the abstract value is
1662 | simply rounded and packed into the quadruple-precision format, with the
1663 | inexact exception raised if the abstract input cannot be represented
1664 | exactly.  However, if the abstract value is too large, the overflow and
1665 | inexact exceptions are raised and an infinity or maximal finite value is
1666 | returned.  If the abstract value is too small, the input value is rounded to
1667 | a subnormal number, and the underflow and inexact exceptions are raised if
1668 | the abstract input cannot be represented exactly as a subnormal quadruple-
1669 | precision floating-point number.
1670 |     The input significand must be normalized or smaller.  If the input
1671 | significand is not normalized, `zExp' must be 0; in that case, the result
1672 | returned is a subnormal number, and it must not require rounding.  In the
1673 | usual case that the input significand is normalized, `zExp' must be 1 less
1674 | than the ``true'' floating-point exponent.  The handling of underflow and
1675 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1676 *----------------------------------------------------------------------------*/
1677 
1678 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
1679                                      uint64_t zSig0, uint64_t zSig1,
1680                                      uint64_t zSig2, float_status *status)
1681 {
1682     int8_t roundingMode;
1683     flag roundNearestEven, increment, isTiny;
1684 
1685     roundingMode = status->float_rounding_mode;
1686     roundNearestEven = ( roundingMode == float_round_nearest_even );
1687     switch (roundingMode) {
1688     case float_round_nearest_even:
1689     case float_round_ties_away:
1690         increment = ((int64_t)zSig2 < 0);
1691         break;
1692     case float_round_to_zero:
1693         increment = 0;
1694         break;
1695     case float_round_up:
1696         increment = !zSign && zSig2;
1697         break;
1698     case float_round_down:
1699         increment = zSign && zSig2;
1700         break;
1701     case float_round_to_odd:
1702         increment = !(zSig1 & 0x1) && zSig2;
1703         break;
1704     default:
1705         abort();
1706     }
1707     if ( 0x7FFD <= (uint32_t) zExp ) {
1708         if (    ( 0x7FFD < zExp )
1709              || (    ( zExp == 0x7FFD )
1710                   && eq128(
1711                          LIT64( 0x0001FFFFFFFFFFFF ),
1712                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1713                          zSig0,
1714                          zSig1
1715                      )
1716                   && increment
1717                 )
1718            ) {
1719             float_raise(float_flag_overflow | float_flag_inexact, status);
1720             if (    ( roundingMode == float_round_to_zero )
1721                  || ( zSign && ( roundingMode == float_round_up ) )
1722                  || ( ! zSign && ( roundingMode == float_round_down ) )
1723                  || (roundingMode == float_round_to_odd)
1724                ) {
1725                 return
1726                     packFloat128(
1727                         zSign,
1728                         0x7FFE,
1729                         LIT64( 0x0000FFFFFFFFFFFF ),
1730                         LIT64( 0xFFFFFFFFFFFFFFFF )
1731                     );
1732             }
1733             return packFloat128( zSign, 0x7FFF, 0, 0 );
1734         }
1735         if ( zExp < 0 ) {
1736             if (status->flush_to_zero) {
1737                 float_raise(float_flag_output_denormal, status);
1738                 return packFloat128(zSign, 0, 0, 0);
1739             }
1740             isTiny =
1741                    (status->float_detect_tininess
1742                     == float_tininess_before_rounding)
1743                 || ( zExp < -1 )
1744                 || ! increment
1745                 || lt128(
1746                        zSig0,
1747                        zSig1,
1748                        LIT64( 0x0001FFFFFFFFFFFF ),
1749                        LIT64( 0xFFFFFFFFFFFFFFFF )
1750                    );
1751             shift128ExtraRightJamming(
1752                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1753             zExp = 0;
1754             if (isTiny && zSig2) {
1755                 float_raise(float_flag_underflow, status);
1756             }
1757             switch (roundingMode) {
1758             case float_round_nearest_even:
1759             case float_round_ties_away:
1760                 increment = ((int64_t)zSig2 < 0);
1761                 break;
1762             case float_round_to_zero:
1763                 increment = 0;
1764                 break;
1765             case float_round_up:
1766                 increment = !zSign && zSig2;
1767                 break;
1768             case float_round_down:
1769                 increment = zSign && zSig2;
1770                 break;
1771             case float_round_to_odd:
1772                 increment = !(zSig1 & 0x1) && zSig2;
1773                 break;
1774             default:
1775                 abort();
1776             }
1777         }
1778     }
1779     if (zSig2) {
1780         status->float_exception_flags |= float_flag_inexact;
1781     }
1782     if ( increment ) {
1783         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1784         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1785     }
1786     else {
1787         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1788     }
1789     return packFloat128( zSign, zExp, zSig0, zSig1 );
1790 
1791 }
1792 
1793 /*----------------------------------------------------------------------------
1794 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1795 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1796 | returns the proper quadruple-precision floating-point value corresponding
1797 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1798 | except that the input significand has fewer bits and does not have to be
1799 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1800 | point exponent.
1801 *----------------------------------------------------------------------------*/
1802 
1803 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
1804                                               uint64_t zSig0, uint64_t zSig1,
1805                                               float_status *status)
1806 {
1807     int8_t shiftCount;
1808     uint64_t zSig2;
1809 
1810     if ( zSig0 == 0 ) {
1811         zSig0 = zSig1;
1812         zSig1 = 0;
1813         zExp -= 64;
1814     }
1815     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1816     if ( 0 <= shiftCount ) {
1817         zSig2 = 0;
1818         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1819     }
1820     else {
1821         shift128ExtraRightJamming(
1822             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1823     }
1824     zExp -= shiftCount;
1825     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
1826 
1827 }
1828 
1829 /*----------------------------------------------------------------------------
1830 | Returns the result of converting the 32-bit two's complement integer `a'
1831 | to the single-precision floating-point format.  The conversion is performed
1832 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1833 *----------------------------------------------------------------------------*/
1834 
1835 float32 int32_to_float32(int32_t a, float_status *status)
1836 {
1837     flag zSign;
1838 
1839     if ( a == 0 ) return float32_zero;
1840     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1841     zSign = ( a < 0 );
1842     return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
1843 }
1844 
1845 /*----------------------------------------------------------------------------
1846 | Returns the result of converting the 32-bit two's complement integer `a'
1847 | to the double-precision floating-point format.  The conversion is performed
1848 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1849 *----------------------------------------------------------------------------*/
1850 
1851 float64 int32_to_float64(int32_t a, float_status *status)
1852 {
1853     flag zSign;
1854     uint32_t absA;
1855     int8_t shiftCount;
1856     uint64_t zSig;
1857 
1858     if ( a == 0 ) return float64_zero;
1859     zSign = ( a < 0 );
1860     absA = zSign ? - a : a;
1861     shiftCount = countLeadingZeros32( absA ) + 21;
1862     zSig = absA;
1863     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1864 
1865 }
1866 
1867 /*----------------------------------------------------------------------------
1868 | Returns the result of converting the 32-bit two's complement integer `a'
1869 | to the extended double-precision floating-point format.  The conversion
1870 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1871 | Arithmetic.
1872 *----------------------------------------------------------------------------*/
1873 
1874 floatx80 int32_to_floatx80(int32_t a, float_status *status)
1875 {
1876     flag zSign;
1877     uint32_t absA;
1878     int8_t shiftCount;
1879     uint64_t zSig;
1880 
1881     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1882     zSign = ( a < 0 );
1883     absA = zSign ? - a : a;
1884     shiftCount = countLeadingZeros32( absA ) + 32;
1885     zSig = absA;
1886     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1887 
1888 }
1889 
1890 /*----------------------------------------------------------------------------
1891 | Returns the result of converting the 32-bit two's complement integer `a' to
1892 | the quadruple-precision floating-point format.  The conversion is performed
1893 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1894 *----------------------------------------------------------------------------*/
1895 
1896 float128 int32_to_float128(int32_t a, float_status *status)
1897 {
1898     flag zSign;
1899     uint32_t absA;
1900     int8_t shiftCount;
1901     uint64_t zSig0;
1902 
1903     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1904     zSign = ( a < 0 );
1905     absA = zSign ? - a : a;
1906     shiftCount = countLeadingZeros32( absA ) + 17;
1907     zSig0 = absA;
1908     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1909 
1910 }
1911 
1912 /*----------------------------------------------------------------------------
1913 | Returns the result of converting the 64-bit two's complement integer `a'
1914 | to the single-precision floating-point format.  The conversion is performed
1915 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1916 *----------------------------------------------------------------------------*/
1917 
1918 float32 int64_to_float32(int64_t a, float_status *status)
1919 {
1920     flag zSign;
1921     uint64_t absA;
1922     int8_t shiftCount;
1923 
1924     if ( a == 0 ) return float32_zero;
1925     zSign = ( a < 0 );
1926     absA = zSign ? - a : a;
1927     shiftCount = countLeadingZeros64( absA ) - 40;
1928     if ( 0 <= shiftCount ) {
1929         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1930     }
1931     else {
1932         shiftCount += 7;
1933         if ( shiftCount < 0 ) {
1934             shift64RightJamming( absA, - shiftCount, &absA );
1935         }
1936         else {
1937             absA <<= shiftCount;
1938         }
1939         return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
1940     }
1941 
1942 }
1943 
1944 /*----------------------------------------------------------------------------
1945 | Returns the result of converting the 64-bit two's complement integer `a'
1946 | to the double-precision floating-point format.  The conversion is performed
1947 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1948 *----------------------------------------------------------------------------*/
1949 
1950 float64 int64_to_float64(int64_t a, float_status *status)
1951 {
1952     flag zSign;
1953 
1954     if ( a == 0 ) return float64_zero;
1955     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1956         return packFloat64( 1, 0x43E, 0 );
1957     }
1958     zSign = ( a < 0 );
1959     return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
1960 }
1961 
1962 /*----------------------------------------------------------------------------
1963 | Returns the result of converting the 64-bit two's complement integer `a'
1964 | to the extended double-precision floating-point format.  The conversion
1965 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1966 | Arithmetic.
1967 *----------------------------------------------------------------------------*/
1968 
1969 floatx80 int64_to_floatx80(int64_t a, float_status *status)
1970 {
1971     flag zSign;
1972     uint64_t absA;
1973     int8_t shiftCount;
1974 
1975     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1976     zSign = ( a < 0 );
1977     absA = zSign ? - a : a;
1978     shiftCount = countLeadingZeros64( absA );
1979     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1980 
1981 }
1982 
1983 /*----------------------------------------------------------------------------
1984 | Returns the result of converting the 64-bit two's complement integer `a' to
1985 | the quadruple-precision floating-point format.  The conversion is performed
1986 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1987 *----------------------------------------------------------------------------*/
1988 
1989 float128 int64_to_float128(int64_t a, float_status *status)
1990 {
1991     flag zSign;
1992     uint64_t absA;
1993     int8_t shiftCount;
1994     int32_t zExp;
1995     uint64_t zSig0, zSig1;
1996 
1997     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1998     zSign = ( a < 0 );
1999     absA = zSign ? - a : a;
2000     shiftCount = countLeadingZeros64( absA ) + 49;
2001     zExp = 0x406E - shiftCount;
2002     if ( 64 <= shiftCount ) {
2003         zSig1 = 0;
2004         zSig0 = absA;
2005         shiftCount -= 64;
2006     }
2007     else {
2008         zSig1 = absA;
2009         zSig0 = 0;
2010     }
2011     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2012     return packFloat128( zSign, zExp, zSig0, zSig1 );
2013 
2014 }
2015 
2016 /*----------------------------------------------------------------------------
2017 | Returns the result of converting the 64-bit unsigned integer `a'
2018 | to the single-precision floating-point format.  The conversion is performed
2019 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2020 *----------------------------------------------------------------------------*/
2021 
2022 float32 uint64_to_float32(uint64_t a, float_status *status)
2023 {
2024     int shiftcount;
2025 
2026     if (a == 0) {
2027         return float32_zero;
2028     }
2029 
2030     /* Determine (left) shift needed to put first set bit into bit posn 23
2031      * (since packFloat32() expects the binary point between bits 23 and 22);
2032      * this is the fast case for smallish numbers.
2033      */
2034     shiftcount = countLeadingZeros64(a) - 40;
2035     if (shiftcount >= 0) {
2036         return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
2037     }
2038     /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
2039      * expects the binary point between bits 30 and 29, hence the + 7.
2040      */
2041     shiftcount += 7;
2042     if (shiftcount < 0) {
2043         shift64RightJamming(a, -shiftcount, &a);
2044     } else {
2045         a <<= shiftcount;
2046     }
2047 
2048     return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
2049 }
2050 
2051 /*----------------------------------------------------------------------------
2052 | Returns the result of converting the 64-bit unsigned integer `a'
2053 | to the double-precision floating-point format.  The conversion is performed
2054 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2055 *----------------------------------------------------------------------------*/
2056 
2057 float64 uint64_to_float64(uint64_t a, float_status *status)
2058 {
2059     int exp = 0x43C;
2060     int shiftcount;
2061 
2062     if (a == 0) {
2063         return float64_zero;
2064     }
2065 
2066     shiftcount = countLeadingZeros64(a) - 1;
2067     if (shiftcount < 0) {
2068         shift64RightJamming(a, -shiftcount, &a);
2069     } else {
2070         a <<= shiftcount;
2071     }
2072     return roundAndPackFloat64(0, exp - shiftcount, a, status);
2073 }
2074 
2075 /*----------------------------------------------------------------------------
2076 | Returns the result of converting the 64-bit unsigned integer `a'
2077 | to the quadruple-precision floating-point format.  The conversion is performed
2078 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2079 *----------------------------------------------------------------------------*/
2080 
2081 float128 uint64_to_float128(uint64_t a, float_status *status)
2082 {
2083     if (a == 0) {
2084         return float128_zero;
2085     }
2086     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
2087 }
2088 
2089 /*----------------------------------------------------------------------------
2090 | Returns the result of converting the single-precision floating-point value
2091 | `a' to the 32-bit two's complement integer format.  The conversion is
2092 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2093 | Arithmetic---which means in particular that the conversion is rounded
2094 | according to the current rounding mode.  If `a' is a NaN, the largest
2095 | positive integer is returned.  Otherwise, if the conversion overflows, the
2096 | largest integer with the same sign as `a' is returned.
2097 *----------------------------------------------------------------------------*/
2098 
2099 int32_t float32_to_int32(float32 a, float_status *status)
2100 {
2101     flag aSign;
2102     int aExp;
2103     int shiftCount;
2104     uint32_t aSig;
2105     uint64_t aSig64;
2106 
2107     a = float32_squash_input_denormal(a, status);
2108     aSig = extractFloat32Frac( a );
2109     aExp = extractFloat32Exp( a );
2110     aSign = extractFloat32Sign( a );
2111     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
2112     if ( aExp ) aSig |= 0x00800000;
2113     shiftCount = 0xAF - aExp;
2114     aSig64 = aSig;
2115     aSig64 <<= 32;
2116     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
2117     return roundAndPackInt32(aSign, aSig64, status);
2118 
2119 }
2120 
2121 /*----------------------------------------------------------------------------
2122 | Returns the result of converting the single-precision floating-point value
2123 | `a' to the 32-bit two's complement integer format.  The conversion is
2124 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2125 | Arithmetic, except that the conversion is always rounded toward zero.
2126 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2127 | the conversion overflows, the largest integer with the same sign as `a' is
2128 | returned.
2129 *----------------------------------------------------------------------------*/
2130 
2131 int32_t float32_to_int32_round_to_zero(float32 a, float_status *status)
2132 {
2133     flag aSign;
2134     int aExp;
2135     int shiftCount;
2136     uint32_t aSig;
2137     int32_t z;
2138     a = float32_squash_input_denormal(a, status);
2139 
2140     aSig = extractFloat32Frac( a );
2141     aExp = extractFloat32Exp( a );
2142     aSign = extractFloat32Sign( a );
2143     shiftCount = aExp - 0x9E;
2144     if ( 0 <= shiftCount ) {
2145         if ( float32_val(a) != 0xCF000000 ) {
2146             float_raise(float_flag_invalid, status);
2147             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
2148         }
2149         return (int32_t) 0x80000000;
2150     }
2151     else if ( aExp <= 0x7E ) {
2152         if (aExp | aSig) {
2153             status->float_exception_flags |= float_flag_inexact;
2154         }
2155         return 0;
2156     }
2157     aSig = ( aSig | 0x00800000 )<<8;
2158     z = aSig>>( - shiftCount );
2159     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
2160         status->float_exception_flags |= float_flag_inexact;
2161     }
2162     if ( aSign ) z = - z;
2163     return z;
2164 
2165 }
2166 
2167 /*----------------------------------------------------------------------------
2168 | Returns the result of converting the single-precision floating-point value
2169 | `a' to the 16-bit two's complement integer format.  The conversion is
2170 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2171 | Arithmetic, except that the conversion is always rounded toward zero.
2172 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2173 | the conversion overflows, the largest integer with the same sign as `a' is
2174 | returned.
2175 *----------------------------------------------------------------------------*/
2176 
2177 int16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
2178 {
2179     flag aSign;
2180     int aExp;
2181     int shiftCount;
2182     uint32_t aSig;
2183     int32_t z;
2184 
2185     aSig = extractFloat32Frac( a );
2186     aExp = extractFloat32Exp( a );
2187     aSign = extractFloat32Sign( a );
2188     shiftCount = aExp - 0x8E;
2189     if ( 0 <= shiftCount ) {
2190         if ( float32_val(a) != 0xC7000000 ) {
2191             float_raise(float_flag_invalid, status);
2192             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
2193                 return 0x7FFF;
2194             }
2195         }
2196         return (int32_t) 0xffff8000;
2197     }
2198     else if ( aExp <= 0x7E ) {
2199         if ( aExp | aSig ) {
2200             status->float_exception_flags |= float_flag_inexact;
2201         }
2202         return 0;
2203     }
2204     shiftCount -= 0x10;
2205     aSig = ( aSig | 0x00800000 )<<8;
2206     z = aSig>>( - shiftCount );
2207     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
2208         status->float_exception_flags |= float_flag_inexact;
2209     }
2210     if ( aSign ) {
2211         z = - z;
2212     }
2213     return z;
2214 
2215 }
2216 
2217 /*----------------------------------------------------------------------------
2218 | Returns the result of converting the single-precision floating-point value
2219 | `a' to the 64-bit two's complement integer format.  The conversion is
2220 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2221 | Arithmetic---which means in particular that the conversion is rounded
2222 | according to the current rounding mode.  If `a' is a NaN, the largest
2223 | positive integer is returned.  Otherwise, if the conversion overflows, the
2224 | largest integer with the same sign as `a' is returned.
2225 *----------------------------------------------------------------------------*/
2226 
2227 int64_t float32_to_int64(float32 a, float_status *status)
2228 {
2229     flag aSign;
2230     int aExp;
2231     int shiftCount;
2232     uint32_t aSig;
2233     uint64_t aSig64, aSigExtra;
2234     a = float32_squash_input_denormal(a, status);
2235 
2236     aSig = extractFloat32Frac( a );
2237     aExp = extractFloat32Exp( a );
2238     aSign = extractFloat32Sign( a );
2239     shiftCount = 0xBE - aExp;
2240     if ( shiftCount < 0 ) {
2241         float_raise(float_flag_invalid, status);
2242         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
2243             return LIT64( 0x7FFFFFFFFFFFFFFF );
2244         }
2245         return (int64_t) LIT64( 0x8000000000000000 );
2246     }
2247     if ( aExp ) aSig |= 0x00800000;
2248     aSig64 = aSig;
2249     aSig64 <<= 40;
2250     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
2251     return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
2252 
2253 }
2254 
2255 /*----------------------------------------------------------------------------
2256 | Returns the result of converting the single-precision floating-point value
2257 | `a' to the 64-bit unsigned integer format.  The conversion is
2258 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2259 | Arithmetic---which means in particular that the conversion is rounded
2260 | according to the current rounding mode.  If `a' is a NaN, the largest
2261 | unsigned integer is returned.  Otherwise, if the conversion overflows, the
| largest unsigned integer is returned.  If `a' is negative, the result
2263 | is rounded and zero is returned; values that do not round to zero will
2264 | raise the inexact exception flag.
2265 *----------------------------------------------------------------------------*/
2266 
2267 uint64_t float32_to_uint64(float32 a, float_status *status)
2268 {
2269     flag aSign;
2270     int aExp;
2271     int shiftCount;
2272     uint32_t aSig;
2273     uint64_t aSig64, aSigExtra;
2274     a = float32_squash_input_denormal(a, status);
2275 
2276     aSig = extractFloat32Frac(a);
2277     aExp = extractFloat32Exp(a);
2278     aSign = extractFloat32Sign(a);
2279     if ((aSign) && (aExp > 126)) {
2280         float_raise(float_flag_invalid, status);
2281         if (float32_is_any_nan(a)) {
2282             return LIT64(0xFFFFFFFFFFFFFFFF);
2283         } else {
2284             return 0;
2285         }
2286     }
2287     shiftCount = 0xBE - aExp;
2288     if (aExp) {
2289         aSig |= 0x00800000;
2290     }
2291     if (shiftCount < 0) {
2292         float_raise(float_flag_invalid, status);
2293         return LIT64(0xFFFFFFFFFFFFFFFF);
2294     }
2295 
2296     aSig64 = aSig;
2297     aSig64 <<= 40;
2298     shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
2299     return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
2300 }
2301 
2302 /*----------------------------------------------------------------------------
2303 | Returns the result of converting the single-precision floating-point value
2304 | `a' to the 64-bit unsigned integer format.  The conversion is
2305 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2306 | Arithmetic, except that the conversion is always rounded toward zero.  If
2307 | `a' is a NaN, the largest unsigned integer is returned.  Otherwise, if the
| conversion overflows, the largest unsigned integer is returned.  If `a'
| is negative, the result is rounded and zero is returned; values that do
2310 | not round to zero will raise the inexact flag.
2311 *----------------------------------------------------------------------------*/
2312 
2313 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status)
2314 {
2315     signed char current_rounding_mode = status->float_rounding_mode;
2316     set_float_rounding_mode(float_round_to_zero, status);
2317     int64_t v = float32_to_uint64(a, status);
2318     set_float_rounding_mode(current_rounding_mode, status);
2319     return v;
2320 }
2321 
2322 /*----------------------------------------------------------------------------
2323 | Returns the result of converting the single-precision floating-point value
2324 | `a' to the 64-bit two's complement integer format.  The conversion is
2325 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2326 | Arithmetic, except that the conversion is always rounded toward zero.  If
2327 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
2328 | conversion overflows, the largest integer with the same sign as `a' is
2329 | returned.
2330 *----------------------------------------------------------------------------*/
2331 
2332 int64_t float32_to_int64_round_to_zero(float32 a, float_status *status)
2333 {
2334     flag aSign;
2335     int aExp;
2336     int shiftCount;
2337     uint32_t aSig;
2338     uint64_t aSig64;
2339     int64_t z;
2340     a = float32_squash_input_denormal(a, status);
2341 
2342     aSig = extractFloat32Frac( a );
2343     aExp = extractFloat32Exp( a );
2344     aSign = extractFloat32Sign( a );
2345     shiftCount = aExp - 0xBE;
2346     if ( 0 <= shiftCount ) {
2347         if ( float32_val(a) != 0xDF000000 ) {
2348             float_raise(float_flag_invalid, status);
2349             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
2350                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2351             }
2352         }
2353         return (int64_t) LIT64( 0x8000000000000000 );
2354     }
2355     else if ( aExp <= 0x7E ) {
2356         if (aExp | aSig) {
2357             status->float_exception_flags |= float_flag_inexact;
2358         }
2359         return 0;
2360     }
2361     aSig64 = aSig | 0x00800000;
2362     aSig64 <<= 40;
2363     z = aSig64>>( - shiftCount );
2364     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
2365         status->float_exception_flags |= float_flag_inexact;
2366     }
2367     if ( aSign ) z = - z;
2368     return z;
2369 
2370 }
2371 
2372 /*----------------------------------------------------------------------------
2373 | Returns the result of converting the single-precision floating-point value
2374 | `a' to the double-precision floating-point format.  The conversion is
2375 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2376 | Arithmetic.
2377 *----------------------------------------------------------------------------*/
2378 
2379 float64 float32_to_float64(float32 a, float_status *status)
2380 {
2381     flag aSign;
2382     int aExp;
2383     uint32_t aSig;
2384     a = float32_squash_input_denormal(a, status);
2385 
2386     aSig = extractFloat32Frac( a );
2387     aExp = extractFloat32Exp( a );
2388     aSign = extractFloat32Sign( a );
2389     if ( aExp == 0xFF ) {
2390         if (aSig) {
2391             return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
2392         }
2393         return packFloat64( aSign, 0x7FF, 0 );
2394     }
2395     if ( aExp == 0 ) {
2396         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
2397         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2398         --aExp;
2399     }
2400     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
2401 
2402 }
2403 
2404 /*----------------------------------------------------------------------------
2405 | Returns the result of converting the single-precision floating-point value
2406 | `a' to the extended double-precision floating-point format.  The conversion
2407 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2408 | Arithmetic.
2409 *----------------------------------------------------------------------------*/
2410 
2411 floatx80 float32_to_floatx80(float32 a, float_status *status)
2412 {
2413     flag aSign;
2414     int aExp;
2415     uint32_t aSig;
2416 
2417     a = float32_squash_input_denormal(a, status);
2418     aSig = extractFloat32Frac( a );
2419     aExp = extractFloat32Exp( a );
2420     aSign = extractFloat32Sign( a );
2421     if ( aExp == 0xFF ) {
2422         if (aSig) {
2423             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
2424         }
2425         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2426     }
2427     if ( aExp == 0 ) {
2428         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2429         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2430     }
2431     aSig |= 0x00800000;
2432     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
2433 
2434 }
2435 
2436 /*----------------------------------------------------------------------------
2437 | Returns the result of converting the single-precision floating-point value
2438 | `a' to the double-precision floating-point format.  The conversion is
2439 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2440 | Arithmetic.
2441 *----------------------------------------------------------------------------*/
2442 
2443 float128 float32_to_float128(float32 a, float_status *status)
2444 {
2445     flag aSign;
2446     int aExp;
2447     uint32_t aSig;
2448 
2449     a = float32_squash_input_denormal(a, status);
2450     aSig = extractFloat32Frac( a );
2451     aExp = extractFloat32Exp( a );
2452     aSign = extractFloat32Sign( a );
2453     if ( aExp == 0xFF ) {
2454         if (aSig) {
2455             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
2456         }
2457         return packFloat128( aSign, 0x7FFF, 0, 0 );
2458     }
2459     if ( aExp == 0 ) {
2460         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2461         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2462         --aExp;
2463     }
2464     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
2465 
2466 }
2467 
2468 /*----------------------------------------------------------------------------
2469 | Rounds the single-precision floating-point value `a' to an integer, and
2470 | returns the result as a single-precision floating-point value.  The
2471 | operation is performed according to the IEC/IEEE Standard for Binary
2472 | Floating-Point Arithmetic.
2473 *----------------------------------------------------------------------------*/
2474 
2475 float32 float32_round_to_int(float32 a, float_status *status)
2476 {
2477     flag aSign;
2478     int aExp;
2479     uint32_t lastBitMask, roundBitsMask;
2480     uint32_t z;
2481     a = float32_squash_input_denormal(a, status);
2482 
2483     aExp = extractFloat32Exp( a );
2484     if ( 0x96 <= aExp ) {
2485         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
2486             return propagateFloat32NaN(a, a, status);
2487         }
2488         return a;
2489     }
2490     if ( aExp <= 0x7E ) {
2491         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
2492         status->float_exception_flags |= float_flag_inexact;
2493         aSign = extractFloat32Sign( a );
2494         switch (status->float_rounding_mode) {
2495          case float_round_nearest_even:
2496             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
2497                 return packFloat32( aSign, 0x7F, 0 );
2498             }
2499             break;
2500         case float_round_ties_away:
2501             if (aExp == 0x7E) {
2502                 return packFloat32(aSign, 0x7F, 0);
2503             }
2504             break;
2505          case float_round_down:
2506             return make_float32(aSign ? 0xBF800000 : 0);
2507          case float_round_up:
2508             return make_float32(aSign ? 0x80000000 : 0x3F800000);
2509         }
2510         return packFloat32( aSign, 0, 0 );
2511     }
2512     lastBitMask = 1;
2513     lastBitMask <<= 0x96 - aExp;
2514     roundBitsMask = lastBitMask - 1;
2515     z = float32_val(a);
2516     switch (status->float_rounding_mode) {
2517     case float_round_nearest_even:
2518         z += lastBitMask>>1;
2519         if ((z & roundBitsMask) == 0) {
2520             z &= ~lastBitMask;
2521         }
2522         break;
2523     case float_round_ties_away:
2524         z += lastBitMask >> 1;
2525         break;
2526     case float_round_to_zero:
2527         break;
2528     case float_round_up:
2529         if (!extractFloat32Sign(make_float32(z))) {
2530             z += roundBitsMask;
2531         }
2532         break;
2533     case float_round_down:
2534         if (extractFloat32Sign(make_float32(z))) {
2535             z += roundBitsMask;
2536         }
2537         break;
2538     default:
2539         abort();
2540     }
2541     z &= ~ roundBitsMask;
2542     if (z != float32_val(a)) {
2543         status->float_exception_flags |= float_flag_inexact;
2544     }
2545     return make_float32(z);
2546 
2547 }
2548 
2549 /*----------------------------------------------------------------------------
2550 | Returns the result of multiplying the single-precision floating-point values
2551 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2552 | for Binary Floating-Point Arithmetic.
2553 *----------------------------------------------------------------------------*/
2554 
2555 float32 float32_mul(float32 a, float32 b, float_status *status)
2556 {
2557     flag aSign, bSign, zSign;
2558     int aExp, bExp, zExp;
2559     uint32_t aSig, bSig;
2560     uint64_t zSig64;
2561     uint32_t zSig;
2562 
2563     a = float32_squash_input_denormal(a, status);
2564     b = float32_squash_input_denormal(b, status);
2565 
2566     aSig = extractFloat32Frac( a );
2567     aExp = extractFloat32Exp( a );
2568     aSign = extractFloat32Sign( a );
2569     bSig = extractFloat32Frac( b );
2570     bExp = extractFloat32Exp( b );
2571     bSign = extractFloat32Sign( b );
2572     zSign = aSign ^ bSign;
2573     if ( aExp == 0xFF ) {
2574         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2575             return propagateFloat32NaN(a, b, status);
2576         }
2577         if ( ( bExp | bSig ) == 0 ) {
2578             float_raise(float_flag_invalid, status);
2579             return float32_default_nan(status);
2580         }
2581         return packFloat32( zSign, 0xFF, 0 );
2582     }
2583     if ( bExp == 0xFF ) {
2584         if (bSig) {
2585             return propagateFloat32NaN(a, b, status);
2586         }
2587         if ( ( aExp | aSig ) == 0 ) {
2588             float_raise(float_flag_invalid, status);
2589             return float32_default_nan(status);
2590         }
2591         return packFloat32( zSign, 0xFF, 0 );
2592     }
2593     if ( aExp == 0 ) {
2594         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2595         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2596     }
2597     if ( bExp == 0 ) {
2598         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2599         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2600     }
2601     zExp = aExp + bExp - 0x7F;
2602     aSig = ( aSig | 0x00800000 )<<7;
2603     bSig = ( bSig | 0x00800000 )<<8;
2604     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2605     zSig = zSig64;
2606     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2607         zSig <<= 1;
2608         --zExp;
2609     }
2610     return roundAndPackFloat32(zSign, zExp, zSig, status);
2611 
2612 }
2613 
2614 /*----------------------------------------------------------------------------
2615 | Returns the result of dividing the single-precision floating-point value `a'
2616 | by the corresponding value `b'.  The operation is performed according to the
2617 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2618 *----------------------------------------------------------------------------*/
2619 
2620 float32 float32_div(float32 a, float32 b, float_status *status)
2621 {
2622     flag aSign, bSign, zSign;
2623     int aExp, bExp, zExp;
2624     uint32_t aSig, bSig, zSig;
2625     a = float32_squash_input_denormal(a, status);
2626     b = float32_squash_input_denormal(b, status);
2627 
2628     aSig = extractFloat32Frac( a );
2629     aExp = extractFloat32Exp( a );
2630     aSign = extractFloat32Sign( a );
2631     bSig = extractFloat32Frac( b );
2632     bExp = extractFloat32Exp( b );
2633     bSign = extractFloat32Sign( b );
2634     zSign = aSign ^ bSign;
2635     if ( aExp == 0xFF ) {
2636         if (aSig) {
2637             return propagateFloat32NaN(a, b, status);
2638         }
2639         if ( bExp == 0xFF ) {
2640             if (bSig) {
2641                 return propagateFloat32NaN(a, b, status);
2642             }
2643             float_raise(float_flag_invalid, status);
2644             return float32_default_nan(status);
2645         }
2646         return packFloat32( zSign, 0xFF, 0 );
2647     }
2648     if ( bExp == 0xFF ) {
2649         if (bSig) {
2650             return propagateFloat32NaN(a, b, status);
2651         }
2652         return packFloat32( zSign, 0, 0 );
2653     }
2654     if ( bExp == 0 ) {
2655         if ( bSig == 0 ) {
2656             if ( ( aExp | aSig ) == 0 ) {
2657                 float_raise(float_flag_invalid, status);
2658                 return float32_default_nan(status);
2659             }
2660             float_raise(float_flag_divbyzero, status);
2661             return packFloat32( zSign, 0xFF, 0 );
2662         }
2663         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2664     }
2665     if ( aExp == 0 ) {
2666         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2667         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2668     }
2669     zExp = aExp - bExp + 0x7D;
2670     aSig = ( aSig | 0x00800000 )<<7;
2671     bSig = ( bSig | 0x00800000 )<<8;
2672     if ( bSig <= ( aSig + aSig ) ) {
2673         aSig >>= 1;
2674         ++zExp;
2675     }
2676     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2677     if ( ( zSig & 0x3F ) == 0 ) {
2678         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2679     }
2680     return roundAndPackFloat32(zSign, zExp, zSig, status);
2681 
2682 }
2683 
2684 /*----------------------------------------------------------------------------
2685 | Returns the remainder of the single-precision floating-point value `a'
2686 | with respect to the corresponding value `b'.  The operation is performed
2687 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2688 *----------------------------------------------------------------------------*/
2689 
2690 float32 float32_rem(float32 a, float32 b, float_status *status)
2691 {
2692     flag aSign, zSign;
2693     int aExp, bExp, expDiff;
2694     uint32_t aSig, bSig;
2695     uint32_t q;
2696     uint64_t aSig64, bSig64, q64;
2697     uint32_t alternateASig;
2698     int32_t sigMean;
2699     a = float32_squash_input_denormal(a, status);
2700     b = float32_squash_input_denormal(b, status);
2701 
2702     aSig = extractFloat32Frac( a );
2703     aExp = extractFloat32Exp( a );
2704     aSign = extractFloat32Sign( a );
2705     bSig = extractFloat32Frac( b );
2706     bExp = extractFloat32Exp( b );
2707     if ( aExp == 0xFF ) {
2708         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2709             return propagateFloat32NaN(a, b, status);
2710         }
2711         float_raise(float_flag_invalid, status);
2712         return float32_default_nan(status);
2713     }
2714     if ( bExp == 0xFF ) {
2715         if (bSig) {
2716             return propagateFloat32NaN(a, b, status);
2717         }
2718         return a;
2719     }
2720     if ( bExp == 0 ) {
2721         if ( bSig == 0 ) {
2722             float_raise(float_flag_invalid, status);
2723             return float32_default_nan(status);
2724         }
2725         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2726     }
2727     if ( aExp == 0 ) {
2728         if ( aSig == 0 ) return a;
2729         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2730     }
2731     expDiff = aExp - bExp;
2732     aSig |= 0x00800000;
2733     bSig |= 0x00800000;
2734     if ( expDiff < 32 ) {
2735         aSig <<= 8;
2736         bSig <<= 8;
2737         if ( expDiff < 0 ) {
2738             if ( expDiff < -1 ) return a;
2739             aSig >>= 1;
2740         }
2741         q = ( bSig <= aSig );
2742         if ( q ) aSig -= bSig;
2743         if ( 0 < expDiff ) {
2744             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2745             q >>= 32 - expDiff;
2746             bSig >>= 2;
2747             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2748         }
2749         else {
2750             aSig >>= 2;
2751             bSig >>= 2;
2752         }
2753     }
2754     else {
2755         if ( bSig <= aSig ) aSig -= bSig;
2756         aSig64 = ( (uint64_t) aSig )<<40;
2757         bSig64 = ( (uint64_t) bSig )<<40;
2758         expDiff -= 64;
2759         while ( 0 < expDiff ) {
2760             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2761             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2762             aSig64 = - ( ( bSig * q64 )<<38 );
2763             expDiff -= 62;
2764         }
2765         expDiff += 64;
2766         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2767         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2768         q = q64>>( 64 - expDiff );
2769         bSig <<= 6;
2770         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2771     }
2772     do {
2773         alternateASig = aSig;
2774         ++q;
2775         aSig -= bSig;
2776     } while ( 0 <= (int32_t) aSig );
2777     sigMean = aSig + alternateASig;
2778     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2779         aSig = alternateASig;
2780     }
2781     zSign = ( (int32_t) aSig < 0 );
2782     if ( zSign ) aSig = - aSig;
2783     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
2784 }
2785 
2786 /*----------------------------------------------------------------------------
2787 | Returns the result of multiplying the single-precision floating-point values
2788 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2789 | multiplication.  The operation is performed according to the IEC/IEEE
2790 | Standard for Binary Floating-Point Arithmetic 754-2008.
2791 | The flags argument allows the caller to select negation of the
2792 | addend, the intermediate product, or the final result. (The difference
2793 | between this and having the caller do a separate negation is that negating
2794 | externally will flip the sign bit on NaNs.)
2795 *----------------------------------------------------------------------------*/
2796 
2797 float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
2798                        float_status *status)
2799 {
2800     flag aSign, bSign, cSign, zSign;
2801     int aExp, bExp, cExp, pExp, zExp, expDiff;
2802     uint32_t aSig, bSig, cSig;
2803     flag pInf, pZero, pSign;
2804     uint64_t pSig64, cSig64, zSig64;
2805     uint32_t pSig;
2806     int shiftcount;
2807     flag signflip, infzero;
2808 
2809     a = float32_squash_input_denormal(a, status);
2810     b = float32_squash_input_denormal(b, status);
2811     c = float32_squash_input_denormal(c, status);
2812     aSig = extractFloat32Frac(a);
2813     aExp = extractFloat32Exp(a);
2814     aSign = extractFloat32Sign(a);
2815     bSig = extractFloat32Frac(b);
2816     bExp = extractFloat32Exp(b);
2817     bSign = extractFloat32Sign(b);
2818     cSig = extractFloat32Frac(c);
2819     cExp = extractFloat32Exp(c);
2820     cSign = extractFloat32Sign(c);
2821 
2822     infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2823                (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2824 
2825     /* It is implementation-defined whether the cases of (0,inf,qnan)
2826      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2827      * they return if they do), so we have to hand this information
2828      * off to the target-specific pick-a-NaN routine.
2829      */
2830     if (((aExp == 0xff) && aSig) ||
2831         ((bExp == 0xff) && bSig) ||
2832         ((cExp == 0xff) && cSig)) {
2833         return propagateFloat32MulAddNaN(a, b, c, infzero, status);
2834     }
2835 
2836     if (infzero) {
2837         float_raise(float_flag_invalid, status);
2838         return float32_default_nan(status);
2839     }
2840 
2841     if (flags & float_muladd_negate_c) {
2842         cSign ^= 1;
2843     }
2844 
2845     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2846 
2847     /* Work out the sign and type of the product */
2848     pSign = aSign ^ bSign;
2849     if (flags & float_muladd_negate_product) {
2850         pSign ^= 1;
2851     }
2852     pInf = (aExp == 0xff) || (bExp == 0xff);
2853     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2854 
2855     if (cExp == 0xff) {
2856         if (pInf && (pSign ^ cSign)) {
2857             /* addition of opposite-signed infinities => InvalidOperation */
2858             float_raise(float_flag_invalid, status);
2859             return float32_default_nan(status);
2860         }
2861         /* Otherwise generate an infinity of the same sign */
2862         return packFloat32(cSign ^ signflip, 0xff, 0);
2863     }
2864 
2865     if (pInf) {
2866         return packFloat32(pSign ^ signflip, 0xff, 0);
2867     }
2868 
2869     if (pZero) {
2870         if (cExp == 0) {
2871             if (cSig == 0) {
2872                 /* Adding two exact zeroes */
2873                 if (pSign == cSign) {
2874                     zSign = pSign;
2875                 } else if (status->float_rounding_mode == float_round_down) {
2876                     zSign = 1;
2877                 } else {
2878                     zSign = 0;
2879                 }
2880                 return packFloat32(zSign ^ signflip, 0, 0);
2881             }
2882             /* Exact zero plus a denorm */
2883             if (status->flush_to_zero) {
2884                 float_raise(float_flag_output_denormal, status);
2885                 return packFloat32(cSign ^ signflip, 0, 0);
2886             }
2887         }
2888         /* Zero plus something non-zero : just return the something */
2889         if (flags & float_muladd_halve_result) {
2890             if (cExp == 0) {
2891                 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2892             }
2893             /* Subtract one to halve, and one again because roundAndPackFloat32
2894              * wants one less than the true exponent.
2895              */
2896             cExp -= 2;
2897             cSig = (cSig | 0x00800000) << 7;
2898             return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
2899         }
2900         return packFloat32(cSign ^ signflip, cExp, cSig);
2901     }
2902 
2903     if (aExp == 0) {
2904         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2905     }
2906     if (bExp == 0) {
2907         normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2908     }
2909 
2910     /* Calculate the actual result a * b + c */
2911 
2912     /* Multiply first; this is easy. */
2913     /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2914      * because we want the true exponent, not the "one-less-than"
2915      * flavour that roundAndPackFloat32() takes.
2916      */
2917     pExp = aExp + bExp - 0x7e;
2918     aSig = (aSig | 0x00800000) << 7;
2919     bSig = (bSig | 0x00800000) << 8;
2920     pSig64 = (uint64_t)aSig * bSig;
2921     if ((int64_t)(pSig64 << 1) >= 0) {
2922         pSig64 <<= 1;
2923         pExp--;
2924     }
2925 
2926     zSign = pSign ^ signflip;
2927 
2928     /* Now pSig64 is the significand of the multiply, with the explicit bit in
2929      * position 62.
2930      */
2931     if (cExp == 0) {
2932         if (!cSig) {
2933             /* Throw out the special case of c being an exact zero now */
2934             shift64RightJamming(pSig64, 32, &pSig64);
2935             pSig = pSig64;
2936             if (flags & float_muladd_halve_result) {
2937                 pExp--;
2938             }
2939             return roundAndPackFloat32(zSign, pExp - 1,
2940                                        pSig, status);
2941         }
2942         normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2943     }
2944 
2945     cSig64 = (uint64_t)cSig << (62 - 23);
2946     cSig64 |= LIT64(0x4000000000000000);
2947     expDiff = pExp - cExp;
2948 
2949     if (pSign == cSign) {
2950         /* Addition */
2951         if (expDiff > 0) {
2952             /* scale c to match p */
2953             shift64RightJamming(cSig64, expDiff, &cSig64);
2954             zExp = pExp;
2955         } else if (expDiff < 0) {
2956             /* scale p to match c */
2957             shift64RightJamming(pSig64, -expDiff, &pSig64);
2958             zExp = cExp;
2959         } else {
2960             /* no scaling needed */
2961             zExp = cExp;
2962         }
2963         /* Add significands and make sure explicit bit ends up in posn 62 */
2964         zSig64 = pSig64 + cSig64;
2965         if ((int64_t)zSig64 < 0) {
2966             shift64RightJamming(zSig64, 1, &zSig64);
2967         } else {
2968             zExp--;
2969         }
2970     } else {
2971         /* Subtraction */
2972         if (expDiff > 0) {
2973             shift64RightJamming(cSig64, expDiff, &cSig64);
2974             zSig64 = pSig64 - cSig64;
2975             zExp = pExp;
2976         } else if (expDiff < 0) {
2977             shift64RightJamming(pSig64, -expDiff, &pSig64);
2978             zSig64 = cSig64 - pSig64;
2979             zExp = cExp;
2980             zSign ^= 1;
2981         } else {
2982             zExp = pExp;
2983             if (cSig64 < pSig64) {
2984                 zSig64 = pSig64 - cSig64;
2985             } else if (pSig64 < cSig64) {
2986                 zSig64 = cSig64 - pSig64;
2987                 zSign ^= 1;
2988             } else {
2989                 /* Exact zero */
2990                 zSign = signflip;
2991                 if (status->float_rounding_mode == float_round_down) {
2992                     zSign ^= 1;
2993                 }
2994                 return packFloat32(zSign, 0, 0);
2995             }
2996         }
2997         --zExp;
2998         /* Normalize to put the explicit bit back into bit 62. */
2999         shiftcount = countLeadingZeros64(zSig64) - 1;
3000         zSig64 <<= shiftcount;
3001         zExp -= shiftcount;
3002     }
3003     if (flags & float_muladd_halve_result) {
3004         zExp--;
3005     }
3006 
3007     shift64RightJamming(zSig64, 32, &zSig64);
3008     return roundAndPackFloat32(zSign, zExp, zSig64, status);
3009 }
3010 
3011 
3012 /*----------------------------------------------------------------------------
3013 | Returns the square root of the single-precision floating-point value `a'.
3014 | The operation is performed according to the IEC/IEEE Standard for Binary
3015 | Floating-Point Arithmetic.
3016 *----------------------------------------------------------------------------*/
3017 
3018 float32 float32_sqrt(float32 a, float_status *status)
3019 {
3020     flag aSign;
3021     int aExp, zExp;
3022     uint32_t aSig, zSig;
3023     uint64_t rem, term;
3024     a = float32_squash_input_denormal(a, status);
3025 
3026     aSig = extractFloat32Frac( a );
3027     aExp = extractFloat32Exp( a );
3028     aSign = extractFloat32Sign( a );
3029     if ( aExp == 0xFF ) {
3030         if (aSig) {
3031             return propagateFloat32NaN(a, float32_zero, status);
3032         }
3033         if ( ! aSign ) return a;
3034         float_raise(float_flag_invalid, status);
3035         return float32_default_nan(status);
3036     }
3037     if ( aSign ) {
3038         if ( ( aExp | aSig ) == 0 ) return a;
3039         float_raise(float_flag_invalid, status);
3040         return float32_default_nan(status);
3041     }
3042     if ( aExp == 0 ) {
3043         if ( aSig == 0 ) return float32_zero;
3044         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3045     }
3046     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
3047     aSig = ( aSig | 0x00800000 )<<8;
3048     zSig = estimateSqrt32( aExp, aSig ) + 2;
3049     if ( ( zSig & 0x7F ) <= 5 ) {
3050         if ( zSig < 2 ) {
3051             zSig = 0x7FFFFFFF;
3052             goto roundAndPack;
3053         }
3054         aSig >>= aExp & 1;
3055         term = ( (uint64_t) zSig ) * zSig;
3056         rem = ( ( (uint64_t) aSig )<<32 ) - term;
3057         while ( (int64_t) rem < 0 ) {
3058             --zSig;
3059             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
3060         }
3061         zSig |= ( rem != 0 );
3062     }
3063     shift32RightJamming( zSig, 1, &zSig );
3064  roundAndPack:
3065     return roundAndPackFloat32(0, zExp, zSig, status);
3066 
3067 }
3068 
3069 /*----------------------------------------------------------------------------
3070 | Returns the binary exponential of the single-precision floating-point value
3071 | `a'. The operation is performed according to the IEC/IEEE Standard for
3072 | Binary Floating-Point Arithmetic.
3073 |
3074 | Uses the following identities:
3075 |
3076 | 1. -------------------------------------------------------------------------
3077 |      x    x*ln(2)
3078 |     2  = e
3079 |
3080 | 2. -------------------------------------------------------------------------
3081 |                      2     3     4     5           n
3082 |      x        x     x     x     x     x           x
3083 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3084 |               1!    2!    3!    4!    5!          n!
3085 *----------------------------------------------------------------------------*/
3086 
3087 static const float64 float32_exp2_coefficients[15] =
3088 {
3089     const_float64( 0x3ff0000000000000ll ), /*  1 */
3090     const_float64( 0x3fe0000000000000ll ), /*  2 */
3091     const_float64( 0x3fc5555555555555ll ), /*  3 */
3092     const_float64( 0x3fa5555555555555ll ), /*  4 */
3093     const_float64( 0x3f81111111111111ll ), /*  5 */
3094     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
3095     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
3096     const_float64( 0x3efa01a01a01a01all ), /*  8 */
3097     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
3098     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3099     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3100     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3101     const_float64( 0x3de6124613a86d09ll ), /* 13 */
3102     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3103     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3104 };
3105 
3106 float32 float32_exp2(float32 a, float_status *status)
3107 {
3108     flag aSign;
3109     int aExp;
3110     uint32_t aSig;
3111     float64 r, x, xn;
3112     int i;
3113     a = float32_squash_input_denormal(a, status);
3114 
3115     aSig = extractFloat32Frac( a );
3116     aExp = extractFloat32Exp( a );
3117     aSign = extractFloat32Sign( a );
3118 
3119     if ( aExp == 0xFF) {
3120         if (aSig) {
3121             return propagateFloat32NaN(a, float32_zero, status);
3122         }
3123         return (aSign) ? float32_zero : a;
3124     }
3125     if (aExp == 0) {
3126         if (aSig == 0) return float32_one;
3127     }
3128 
3129     float_raise(float_flag_inexact, status);
3130 
3131     /* ******************************* */
3132     /* using float64 for approximation */
3133     /* ******************************* */
3134     x = float32_to_float64(a, status);
3135     x = float64_mul(x, float64_ln2, status);
3136 
3137     xn = x;
3138     r = float64_one;
3139     for (i = 0 ; i < 15 ; i++) {
3140         float64 f;
3141 
3142         f = float64_mul(xn, float32_exp2_coefficients[i], status);
3143         r = float64_add(r, f, status);
3144 
3145         xn = float64_mul(xn, x, status);
3146     }
3147 
3148     return float64_to_float32(r, status);
3149 }
3150 
3151 /*----------------------------------------------------------------------------
3152 | Returns the binary log of the single-precision floating-point value `a'.
3153 | The operation is performed according to the IEC/IEEE Standard for Binary
3154 | Floating-Point Arithmetic.
3155 *----------------------------------------------------------------------------*/
3156 float32 float32_log2(float32 a, float_status *status)
3157 {
3158     flag aSign, zSign;
3159     int aExp;
3160     uint32_t aSig, zSig, i;
3161 
3162     a = float32_squash_input_denormal(a, status);
3163     aSig = extractFloat32Frac( a );
3164     aExp = extractFloat32Exp( a );
3165     aSign = extractFloat32Sign( a );
3166 
3167     if ( aExp == 0 ) {
3168         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3169         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3170     }
3171     if ( aSign ) {
3172         float_raise(float_flag_invalid, status);
3173         return float32_default_nan(status);
3174     }
3175     if ( aExp == 0xFF ) {
3176         if (aSig) {
3177             return propagateFloat32NaN(a, float32_zero, status);
3178         }
3179         return a;
3180     }
3181 
3182     aExp -= 0x7F;
3183     aSig |= 0x00800000;
3184     zSign = aExp < 0;
3185     zSig = aExp << 23;
3186 
3187     for (i = 1 << 22; i > 0; i >>= 1) {
3188         aSig = ( (uint64_t)aSig * aSig ) >> 23;
3189         if ( aSig & 0x01000000 ) {
3190             aSig >>= 1;
3191             zSig |= i;
3192         }
3193     }
3194 
3195     if ( zSign )
3196         zSig = -zSig;
3197 
3198     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3199 }
3200 
3201 /*----------------------------------------------------------------------------
3202 | Returns 1 if the single-precision floating-point value `a' is equal to
3203 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3204 | raised if either operand is a NaN.  Otherwise, the comparison is performed
3205 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3206 *----------------------------------------------------------------------------*/
3207 
3208 int float32_eq(float32 a, float32 b, float_status *status)
3209 {
3210     uint32_t av, bv;
3211     a = float32_squash_input_denormal(a, status);
3212     b = float32_squash_input_denormal(b, status);
3213 
3214     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3215          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3216        ) {
3217         float_raise(float_flag_invalid, status);
3218         return 0;
3219     }
3220     av = float32_val(a);
3221     bv = float32_val(b);
3222     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3223 }
3224 
3225 /*----------------------------------------------------------------------------
3226 | Returns 1 if the single-precision floating-point value `a' is less than
3227 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
3228 | exception is raised if either operand is a NaN.  The comparison is performed
3229 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3230 *----------------------------------------------------------------------------*/
3231 
3232 int float32_le(float32 a, float32 b, float_status *status)
3233 {
3234     flag aSign, bSign;
3235     uint32_t av, bv;
3236     a = float32_squash_input_denormal(a, status);
3237     b = float32_squash_input_denormal(b, status);
3238 
3239     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3240          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3241        ) {
3242         float_raise(float_flag_invalid, status);
3243         return 0;
3244     }
3245     aSign = extractFloat32Sign( a );
3246     bSign = extractFloat32Sign( b );
3247     av = float32_val(a);
3248     bv = float32_val(b);
3249     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3250     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3251 
3252 }
3253 
3254 /*----------------------------------------------------------------------------
3255 | Returns 1 if the single-precision floating-point value `a' is less than
3256 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3257 | raised if either operand is a NaN.  The comparison is performed according
3258 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3259 *----------------------------------------------------------------------------*/
3260 
3261 int float32_lt(float32 a, float32 b, float_status *status)
3262 {
3263     flag aSign, bSign;
3264     uint32_t av, bv;
3265     a = float32_squash_input_denormal(a, status);
3266     b = float32_squash_input_denormal(b, status);
3267 
3268     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3269          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3270        ) {
3271         float_raise(float_flag_invalid, status);
3272         return 0;
3273     }
3274     aSign = extractFloat32Sign( a );
3275     bSign = extractFloat32Sign( b );
3276     av = float32_val(a);
3277     bv = float32_val(b);
3278     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3279     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3280 
3281 }
3282 
3283 /*----------------------------------------------------------------------------
3284 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3285 | be compared, and 0 otherwise.  The invalid exception is raised if either
3286 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
3287 | Standard for Binary Floating-Point Arithmetic.
3288 *----------------------------------------------------------------------------*/
3289 
3290 int float32_unordered(float32 a, float32 b, float_status *status)
3291 {
3292     a = float32_squash_input_denormal(a, status);
3293     b = float32_squash_input_denormal(b, status);
3294 
3295     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3296          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3297        ) {
3298         float_raise(float_flag_invalid, status);
3299         return 1;
3300     }
3301     return 0;
3302 }
3303 
3304 /*----------------------------------------------------------------------------
3305 | Returns 1 if the single-precision floating-point value `a' is equal to
3306 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3307 | exception.  The comparison is performed according to the IEC/IEEE Standard
3308 | for Binary Floating-Point Arithmetic.
3309 *----------------------------------------------------------------------------*/
3310 
3311 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3312 {
3313     a = float32_squash_input_denormal(a, status);
3314     b = float32_squash_input_denormal(b, status);
3315 
3316     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3317          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3318        ) {
3319         if (float32_is_signaling_nan(a, status)
3320          || float32_is_signaling_nan(b, status)) {
3321             float_raise(float_flag_invalid, status);
3322         }
3323         return 0;
3324     }
3325     return ( float32_val(a) == float32_val(b) ) ||
3326             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3327 }
3328 
3329 /*----------------------------------------------------------------------------
3330 | Returns 1 if the single-precision floating-point value `a' is less than or
3331 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3332 | cause an exception.  Otherwise, the comparison is performed according to the
3333 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3334 *----------------------------------------------------------------------------*/
3335 
3336 int float32_le_quiet(float32 a, float32 b, float_status *status)
3337 {
3338     flag aSign, bSign;
3339     uint32_t av, bv;
3340     a = float32_squash_input_denormal(a, status);
3341     b = float32_squash_input_denormal(b, status);
3342 
3343     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3344          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3345        ) {
3346         if (float32_is_signaling_nan(a, status)
3347          || float32_is_signaling_nan(b, status)) {
3348             float_raise(float_flag_invalid, status);
3349         }
3350         return 0;
3351     }
3352     aSign = extractFloat32Sign( a );
3353     bSign = extractFloat32Sign( b );
3354     av = float32_val(a);
3355     bv = float32_val(b);
3356     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3357     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3358 
3359 }
3360 
3361 /*----------------------------------------------------------------------------
3362 | Returns 1 if the single-precision floating-point value `a' is less than
3363 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3364 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3365 | Standard for Binary Floating-Point Arithmetic.
3366 *----------------------------------------------------------------------------*/
3367 
3368 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3369 {
3370     flag aSign, bSign;
3371     uint32_t av, bv;
3372     a = float32_squash_input_denormal(a, status);
3373     b = float32_squash_input_denormal(b, status);
3374 
3375     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3376          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3377        ) {
3378         if (float32_is_signaling_nan(a, status)
3379          || float32_is_signaling_nan(b, status)) {
3380             float_raise(float_flag_invalid, status);
3381         }
3382         return 0;
3383     }
3384     aSign = extractFloat32Sign( a );
3385     bSign = extractFloat32Sign( b );
3386     av = float32_val(a);
3387     bv = float32_val(b);
3388     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3389     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3390 
3391 }
3392 
3393 /*----------------------------------------------------------------------------
3394 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3395 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3396 | comparison is performed according to the IEC/IEEE Standard for Binary
3397 | Floating-Point Arithmetic.
3398 *----------------------------------------------------------------------------*/
3399 
3400 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3401 {
3402     a = float32_squash_input_denormal(a, status);
3403     b = float32_squash_input_denormal(b, status);
3404 
3405     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3406          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3407        ) {
3408         if (float32_is_signaling_nan(a, status)
3409          || float32_is_signaling_nan(b, status)) {
3410             float_raise(float_flag_invalid, status);
3411         }
3412         return 1;
3413     }
3414     return 0;
3415 }
3416 
3417 /*----------------------------------------------------------------------------
3418 | Returns the result of converting the double-precision floating-point value
3419 | `a' to the 32-bit two's complement integer format.  The conversion is
3420 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3421 | Arithmetic---which means in particular that the conversion is rounded
3422 | according to the current rounding mode.  If `a' is a NaN, the largest
3423 | positive integer is returned.  Otherwise, if the conversion overflows, the
3424 | largest integer with the same sign as `a' is returned.
3425 *----------------------------------------------------------------------------*/
3426 
3427 int32_t float64_to_int32(float64 a, float_status *status)
3428 {
3429     flag aSign;
3430     int aExp;
3431     int shiftCount;
3432     uint64_t aSig;
3433     a = float64_squash_input_denormal(a, status);
3434 
3435     aSig = extractFloat64Frac( a );
3436     aExp = extractFloat64Exp( a );
3437     aSign = extractFloat64Sign( a );
3438     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3439     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3440     shiftCount = 0x42C - aExp;
3441     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3442     return roundAndPackInt32(aSign, aSig, status);
3443 
3444 }
3445 
3446 /*----------------------------------------------------------------------------
3447 | Returns the result of converting the double-precision floating-point value
3448 | `a' to the 32-bit two's complement integer format.  The conversion is
3449 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3450 | Arithmetic, except that the conversion is always rounded toward zero.
3451 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3452 | the conversion overflows, the largest integer with the same sign as `a' is
3453 | returned.
3454 *----------------------------------------------------------------------------*/
3455 
3456 int32_t float64_to_int32_round_to_zero(float64 a, float_status *status)
3457 {
3458     flag aSign;
3459     int aExp;
3460     int shiftCount;
3461     uint64_t aSig, savedASig;
3462     int32_t z;
3463     a = float64_squash_input_denormal(a, status);
3464 
3465     aSig = extractFloat64Frac( a );
3466     aExp = extractFloat64Exp( a );
3467     aSign = extractFloat64Sign( a );
3468     if ( 0x41E < aExp ) {
3469         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3470         goto invalid;
3471     }
3472     else if ( aExp < 0x3FF ) {
3473         if (aExp || aSig) {
3474             status->float_exception_flags |= float_flag_inexact;
3475         }
3476         return 0;
3477     }
3478     aSig |= LIT64( 0x0010000000000000 );
3479     shiftCount = 0x433 - aExp;
3480     savedASig = aSig;
3481     aSig >>= shiftCount;
3482     z = aSig;
3483     if ( aSign ) z = - z;
3484     if ( ( z < 0 ) ^ aSign ) {
3485  invalid:
3486         float_raise(float_flag_invalid, status);
3487         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3488     }
3489     if ( ( aSig<<shiftCount ) != savedASig ) {
3490         status->float_exception_flags |= float_flag_inexact;
3491     }
3492     return z;
3493 
3494 }
3495 
3496 /*----------------------------------------------------------------------------
3497 | Returns the result of converting the double-precision floating-point value
3498 | `a' to the 16-bit two's complement integer format.  The conversion is
3499 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3500 | Arithmetic, except that the conversion is always rounded toward zero.
3501 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3502 | the conversion overflows, the largest integer with the same sign as `a' is
3503 | returned.
3504 *----------------------------------------------------------------------------*/
3505 
3506 int16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
3507 {
3508     flag aSign;
3509     int aExp;
3510     int shiftCount;
3511     uint64_t aSig, savedASig;
3512     int32_t z;
3513 
3514     aSig = extractFloat64Frac( a );
3515     aExp = extractFloat64Exp( a );
3516     aSign = extractFloat64Sign( a );
3517     if ( 0x40E < aExp ) {
3518         if ( ( aExp == 0x7FF ) && aSig ) {
3519             aSign = 0;
3520         }
3521         goto invalid;
3522     }
3523     else if ( aExp < 0x3FF ) {
3524         if ( aExp || aSig ) {
3525             status->float_exception_flags |= float_flag_inexact;
3526         }
3527         return 0;
3528     }
3529     aSig |= LIT64( 0x0010000000000000 );
3530     shiftCount = 0x433 - aExp;
3531     savedASig = aSig;
3532     aSig >>= shiftCount;
3533     z = aSig;
3534     if ( aSign ) {
3535         z = - z;
3536     }
3537     if ( ( (int16_t)z < 0 ) ^ aSign ) {
3538  invalid:
3539         float_raise(float_flag_invalid, status);
3540         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3541     }
3542     if ( ( aSig<<shiftCount ) != savedASig ) {
3543         status->float_exception_flags |= float_flag_inexact;
3544     }
3545     return z;
3546 }
3547 
3548 /*----------------------------------------------------------------------------
3549 | Returns the result of converting the double-precision floating-point value
3550 | `a' to the 64-bit two's complement integer format.  The conversion is
3551 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3552 | Arithmetic---which means in particular that the conversion is rounded
3553 | according to the current rounding mode.  If `a' is a NaN, the largest
3554 | positive integer is returned.  Otherwise, if the conversion overflows, the
3555 | largest integer with the same sign as `a' is returned.
3556 *----------------------------------------------------------------------------*/
3557 
3558 int64_t float64_to_int64(float64 a, float_status *status)
3559 {
3560     flag aSign;
3561     int aExp;
3562     int shiftCount;
3563     uint64_t aSig, aSigExtra;
3564     a = float64_squash_input_denormal(a, status);
3565 
3566     aSig = extractFloat64Frac( a );
3567     aExp = extractFloat64Exp( a );
3568     aSign = extractFloat64Sign( a );
3569     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3570     shiftCount = 0x433 - aExp;
3571     if ( shiftCount <= 0 ) {
3572         if ( 0x43E < aExp ) {
3573             float_raise(float_flag_invalid, status);
3574             if (    ! aSign
3575                  || (    ( aExp == 0x7FF )
3576                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
3577                ) {
3578                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3579             }
3580             return (int64_t) LIT64( 0x8000000000000000 );
3581         }
3582         aSigExtra = 0;
3583         aSig <<= - shiftCount;
3584     }
3585     else {
3586         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3587     }
3588     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
3589 
3590 }
3591 
3592 /*----------------------------------------------------------------------------
3593 | Returns the result of converting the double-precision floating-point value
3594 | `a' to the 64-bit two's complement integer format.  The conversion is
3595 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3596 | Arithmetic, except that the conversion is always rounded toward zero.
3597 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3598 | the conversion overflows, the largest integer with the same sign as `a' is
3599 | returned.
3600 *----------------------------------------------------------------------------*/
3601 
3602 int64_t float64_to_int64_round_to_zero(float64 a, float_status *status)
3603 {
3604     flag aSign;
3605     int aExp;
3606     int shiftCount;
3607     uint64_t aSig;
3608     int64_t z;
3609     a = float64_squash_input_denormal(a, status);
3610 
3611     aSig = extractFloat64Frac( a );
3612     aExp = extractFloat64Exp( a );
3613     aSign = extractFloat64Sign( a );
3614     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3615     shiftCount = aExp - 0x433;
3616     if ( 0 <= shiftCount ) {
3617         if ( 0x43E <= aExp ) {
3618             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3619                 float_raise(float_flag_invalid, status);
3620                 if (    ! aSign
3621                      || (    ( aExp == 0x7FF )
3622                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
3623                    ) {
3624                     return LIT64( 0x7FFFFFFFFFFFFFFF );
3625                 }
3626             }
3627             return (int64_t) LIT64( 0x8000000000000000 );
3628         }
3629         z = aSig<<shiftCount;
3630     }
3631     else {
3632         if ( aExp < 0x3FE ) {
3633             if (aExp | aSig) {
3634                 status->float_exception_flags |= float_flag_inexact;
3635             }
3636             return 0;
3637         }
3638         z = aSig>>( - shiftCount );
3639         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3640             status->float_exception_flags |= float_flag_inexact;
3641         }
3642     }
3643     if ( aSign ) z = - z;
3644     return z;
3645 
3646 }
3647 
3648 /*----------------------------------------------------------------------------
3649 | Returns the result of converting the double-precision floating-point value
3650 | `a' to the single-precision floating-point format.  The conversion is
3651 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3652 | Arithmetic.
3653 *----------------------------------------------------------------------------*/
3654 
3655 float32 float64_to_float32(float64 a, float_status *status)
3656 {
3657     flag aSign;
3658     int aExp;
3659     uint64_t aSig;
3660     uint32_t zSig;
3661     a = float64_squash_input_denormal(a, status);
3662 
3663     aSig = extractFloat64Frac( a );
3664     aExp = extractFloat64Exp( a );
3665     aSign = extractFloat64Sign( a );
3666     if ( aExp == 0x7FF ) {
3667         if (aSig) {
3668             return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3669         }
3670         return packFloat32( aSign, 0xFF, 0 );
3671     }
3672     shift64RightJamming( aSig, 22, &aSig );
3673     zSig = aSig;
3674     if ( aExp || zSig ) {
3675         zSig |= 0x40000000;
3676         aExp -= 0x381;
3677     }
3678     return roundAndPackFloat32(aSign, aExp, zSig, status);
3679 
3680 }
3681 
3682 
3683 /*----------------------------------------------------------------------------
3684 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3685 | half-precision floating-point value, returning the result.  After being
3686 | shifted into the proper positions, the three fields are simply added
3687 | together to form the result.  This means that any integer portion of `zSig'
3688 | will be added into the exponent.  Since a properly normalized significand
3689 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3690 | than the desired result exponent whenever `zSig' is a complete, normalized
3691 | significand.
3692 *----------------------------------------------------------------------------*/
3693 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3694 {
3695     return make_float16(
3696         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3697 }
3698 
3699 /*----------------------------------------------------------------------------
3700 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3701 | and significand `zSig', and returns the proper half-precision floating-
3702 | point value corresponding to the abstract input.  Ordinarily, the abstract
3703 | value is simply rounded and packed into the half-precision format, with
3704 | the inexact exception raised if the abstract input cannot be represented
3705 | exactly.  However, if the abstract value is too large, the overflow and
3706 | inexact exceptions are raised and an infinity or maximal finite value is
3707 | returned.  If the abstract value is too small, the input value is rounded to
3708 | a subnormal number, and the underflow and inexact exceptions are raised if
3709 | the abstract input cannot be represented exactly as a subnormal half-
3710 | precision floating-point number.
3711 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3712 | ARM-style "alternative representation", which omits the NaN and Inf
3713 | encodings in order to raise the maximum representable exponent by one.
3714 |     The input significand `zSig' has its binary point between bits 22
3715 | and 23, which is 13 bits to the left of the usual location.  This shifted
3716 | significand must be normalized or smaller.  If `zSig' is not normalized,
3717 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3718 | and it must not require rounding.  In the usual case that `zSig' is
3719 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3720 | Note the slightly odd position of the binary point in zSig compared with the
3721 | other roundAndPackFloat functions. This should probably be fixed if we
3722 | need to implement more float16 routines than just conversion.
3723 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3724 | Binary Floating-Point Arithmetic.
3725 *----------------------------------------------------------------------------*/
3726 
3727 static float16 roundAndPackFloat16(flag zSign, int zExp,
3728                                    uint32_t zSig, flag ieee,
3729                                    float_status *status)
3730 {
3731     int maxexp = ieee ? 29 : 30;
3732     uint32_t mask;
3733     uint32_t increment;
3734     bool rounding_bumps_exp;
3735     bool is_tiny = false;
3736 
3737     /* Calculate the mask of bits of the mantissa which are not
3738      * representable in half-precision and will be lost.
3739      */
3740     if (zExp < 1) {
3741         /* Will be denormal in halfprec */
3742         mask = 0x00ffffff;
3743         if (zExp >= -11) {
3744             mask >>= 11 + zExp;
3745         }
3746     } else {
3747         /* Normal number in halfprec */
3748         mask = 0x00001fff;
3749     }
3750 
3751     switch (status->float_rounding_mode) {
3752     case float_round_nearest_even:
3753         increment = (mask + 1) >> 1;
3754         if ((zSig & mask) == increment) {
3755             increment = zSig & (increment << 1);
3756         }
3757         break;
3758     case float_round_ties_away:
3759         increment = (mask + 1) >> 1;
3760         break;
3761     case float_round_up:
3762         increment = zSign ? 0 : mask;
3763         break;
3764     case float_round_down:
3765         increment = zSign ? mask : 0;
3766         break;
3767     default: /* round_to_zero */
3768         increment = 0;
3769         break;
3770     }
3771 
3772     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3773 
3774     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3775         if (ieee) {
3776             float_raise(float_flag_overflow | float_flag_inexact, status);
3777             return packFloat16(zSign, 0x1f, 0);
3778         } else {
3779             float_raise(float_flag_invalid, status);
3780             return packFloat16(zSign, 0x1f, 0x3ff);
3781         }
3782     }
3783 
3784     if (zExp < 0) {
3785         /* Note that flush-to-zero does not affect half-precision results */
3786         is_tiny =
3787             (status->float_detect_tininess == float_tininess_before_rounding)
3788             || (zExp < -1)
3789             || (!rounding_bumps_exp);
3790     }
3791     if (zSig & mask) {
3792         float_raise(float_flag_inexact, status);
3793         if (is_tiny) {
3794             float_raise(float_flag_underflow, status);
3795         }
3796     }
3797 
3798     zSig += increment;
3799     if (rounding_bumps_exp) {
3800         zSig >>= 1;
3801         zExp++;
3802     }
3803 
3804     if (zExp < -10) {
3805         return packFloat16(zSign, 0, 0);
3806     }
3807     if (zExp < 0) {
3808         zSig >>= -zExp;
3809         zExp = 0;
3810     }
3811     return packFloat16(zSign, zExp, zSig >> 13);
3812 }
3813 
3814 /*----------------------------------------------------------------------------
3815 | If `a' is denormal and we are in flush-to-zero mode then set the
3816 | input-denormal exception and return zero. Otherwise just return the value.
3817 *----------------------------------------------------------------------------*/
3818 float16 float16_squash_input_denormal(float16 a, float_status *status)
3819 {
3820     if (status->flush_inputs_to_zero) {
3821         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3822             float_raise(float_flag_input_denormal, status);
3823             return make_float16(float16_val(a) & 0x8000);
3824         }
3825     }
3826     return a;
3827 }
3828 
/*
 * Shifts the significand `aSig' left by (leading zero count - 21) bits and
 * stores it through `zSigPtr'; one minus that shift distance is stored
 * through `zExpPtr'.
 */
static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
                                      uint32_t *zSigPtr)
{
    int8_t lshift = countLeadingZeros32(aSig) - 21;

    *zExpPtr = 1 - lshift;
    *zSigPtr = aSig << lshift;
}
3836 
3837 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3838    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3839 
3840 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3841 {
3842     flag aSign;
3843     int aExp;
3844     uint32_t aSig;
3845 
3846     aSign = extractFloat16Sign(a);
3847     aExp = extractFloat16Exp(a);
3848     aSig = extractFloat16Frac(a);
3849 
3850     if (aExp == 0x1f && ieee) {
3851         if (aSig) {
3852             return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3853         }
3854         return packFloat32(aSign, 0xff, 0);
3855     }
3856     if (aExp == 0) {
3857         if (aSig == 0) {
3858             return packFloat32(aSign, 0, 0);
3859         }
3860 
3861         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3862         aExp--;
3863     }
3864     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3865 }
3866 
3867 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3868 {
3869     flag aSign;
3870     int aExp;
3871     uint32_t aSig;
3872 
3873     a = float32_squash_input_denormal(a, status);
3874 
3875     aSig = extractFloat32Frac( a );
3876     aExp = extractFloat32Exp( a );
3877     aSign = extractFloat32Sign( a );
3878     if ( aExp == 0xFF ) {
3879         if (aSig) {
3880             /* Input is a NaN */
3881             if (!ieee) {
3882                 float_raise(float_flag_invalid, status);
3883                 return packFloat16(aSign, 0, 0);
3884             }
3885             return commonNaNToFloat16(
3886                 float32ToCommonNaN(a, status), status);
3887         }
3888         /* Infinity */
3889         if (!ieee) {
3890             float_raise(float_flag_invalid, status);
3891             return packFloat16(aSign, 0x1f, 0x3ff);
3892         }
3893         return packFloat16(aSign, 0x1f, 0);
3894     }
3895     if (aExp == 0 && aSig == 0) {
3896         return packFloat16(aSign, 0, 0);
3897     }
3898     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3899      * even if the input is denormal; however this is harmless because
3900      * the largest possible single-precision denormal is still smaller
3901      * than the smallest representable half-precision denormal, and so we
3902      * will end up ignoring aSig and returning via the "always return zero"
3903      * codepath.
3904      */
3905     aSig |= 0x00800000;
3906     aExp -= 0x71;
3907 
3908     return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3909 }
3910 
3911 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3912 {
3913     flag aSign;
3914     int aExp;
3915     uint32_t aSig;
3916 
3917     aSign = extractFloat16Sign(a);
3918     aExp = extractFloat16Exp(a);
3919     aSig = extractFloat16Frac(a);
3920 
3921     if (aExp == 0x1f && ieee) {
3922         if (aSig) {
3923             return commonNaNToFloat64(
3924                 float16ToCommonNaN(a, status), status);
3925         }
3926         return packFloat64(aSign, 0x7ff, 0);
3927     }
3928     if (aExp == 0) {
3929         if (aSig == 0) {
3930             return packFloat64(aSign, 0, 0);
3931         }
3932 
3933         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3934         aExp--;
3935     }
3936     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3937 }
3938 
3939 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3940 {
3941     flag aSign;
3942     int aExp;
3943     uint64_t aSig;
3944     uint32_t zSig;
3945 
3946     a = float64_squash_input_denormal(a, status);
3947 
3948     aSig = extractFloat64Frac(a);
3949     aExp = extractFloat64Exp(a);
3950     aSign = extractFloat64Sign(a);
3951     if (aExp == 0x7FF) {
3952         if (aSig) {
3953             /* Input is a NaN */
3954             if (!ieee) {
3955                 float_raise(float_flag_invalid, status);
3956                 return packFloat16(aSign, 0, 0);
3957             }
3958             return commonNaNToFloat16(
3959                 float64ToCommonNaN(a, status), status);
3960         }
3961         /* Infinity */
3962         if (!ieee) {
3963             float_raise(float_flag_invalid, status);
3964             return packFloat16(aSign, 0x1f, 0x3ff);
3965         }
3966         return packFloat16(aSign, 0x1f, 0);
3967     }
3968     shift64RightJamming(aSig, 29, &aSig);
3969     zSig = aSig;
3970     if (aExp == 0 && zSig == 0) {
3971         return packFloat16(aSign, 0, 0);
3972     }
3973     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3974      * even if the input is denormal; however this is harmless because
3975      * the largest possible single-precision denormal is still smaller
3976      * than the smallest representable half-precision denormal, and so we
3977      * will end up ignoring aSig and returning via the "always return zero"
3978      * codepath.
3979      */
3980     zSig |= 0x00800000;
3981     aExp -= 0x3F1;
3982 
3983     return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
3984 }
3985 
3986 /*----------------------------------------------------------------------------
3987 | Returns the result of converting the double-precision floating-point value
3988 | `a' to the extended double-precision floating-point format.  The conversion
3989 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3990 | Arithmetic.
3991 *----------------------------------------------------------------------------*/
3992 
3993 floatx80 float64_to_floatx80(float64 a, float_status *status)
3994 {
3995     flag aSign;
3996     int aExp;
3997     uint64_t aSig;
3998 
3999     a = float64_squash_input_denormal(a, status);
4000     aSig = extractFloat64Frac( a );
4001     aExp = extractFloat64Exp( a );
4002     aSign = extractFloat64Sign( a );
4003     if ( aExp == 0x7FF ) {
4004         if (aSig) {
4005             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4006         }
4007         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4008     }
4009     if ( aExp == 0 ) {
4010         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4011         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4012     }
4013     return
4014         packFloatx80(
4015             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4016 
4017 }
4018 
4019 /*----------------------------------------------------------------------------
4020 | Returns the result of converting the double-precision floating-point value
4021 | `a' to the quadruple-precision floating-point format.  The conversion is
4022 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4023 | Arithmetic.
4024 *----------------------------------------------------------------------------*/
4025 
4026 float128 float64_to_float128(float64 a, float_status *status)
4027 {
4028     flag aSign;
4029     int aExp;
4030     uint64_t aSig, zSig0, zSig1;
4031 
4032     a = float64_squash_input_denormal(a, status);
4033     aSig = extractFloat64Frac( a );
4034     aExp = extractFloat64Exp( a );
4035     aSign = extractFloat64Sign( a );
4036     if ( aExp == 0x7FF ) {
4037         if (aSig) {
4038             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4039         }
4040         return packFloat128( aSign, 0x7FFF, 0, 0 );
4041     }
4042     if ( aExp == 0 ) {
4043         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4044         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4045         --aExp;
4046     }
4047     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4048     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4049 
4050 }
4051 
4052 /*----------------------------------------------------------------------------
4053 | Rounds the double-precision floating-point value `a' to an integer, and
4054 | returns the result as a double-precision floating-point value.  The
4055 | operation is performed according to the IEC/IEEE Standard for Binary
4056 | Floating-Point Arithmetic.
4057 *----------------------------------------------------------------------------*/
4058 
4059 float64 float64_round_to_int(float64 a, float_status *status)
4060 {
4061     flag aSign;
4062     int aExp;
4063     uint64_t lastBitMask, roundBitsMask;
4064     uint64_t z;
4065     a = float64_squash_input_denormal(a, status);
4066 
4067     aExp = extractFloat64Exp( a );
4068     if ( 0x433 <= aExp ) {
4069         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
4070             return propagateFloat64NaN(a, a, status);
4071         }
4072         return a;
4073     }
4074     if ( aExp < 0x3FF ) {
4075         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
4076         status->float_exception_flags |= float_flag_inexact;
4077         aSign = extractFloat64Sign( a );
4078         switch (status->float_rounding_mode) {
4079          case float_round_nearest_even:
4080             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
4081                 return packFloat64( aSign, 0x3FF, 0 );
4082             }
4083             break;
4084         case float_round_ties_away:
4085             if (aExp == 0x3FE) {
4086                 return packFloat64(aSign, 0x3ff, 0);
4087             }
4088             break;
4089          case float_round_down:
4090             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
4091          case float_round_up:
4092             return make_float64(
4093             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
4094         }
4095         return packFloat64( aSign, 0, 0 );
4096     }
4097     lastBitMask = 1;
4098     lastBitMask <<= 0x433 - aExp;
4099     roundBitsMask = lastBitMask - 1;
4100     z = float64_val(a);
4101     switch (status->float_rounding_mode) {
4102     case float_round_nearest_even:
4103         z += lastBitMask >> 1;
4104         if ((z & roundBitsMask) == 0) {
4105             z &= ~lastBitMask;
4106         }
4107         break;
4108     case float_round_ties_away:
4109         z += lastBitMask >> 1;
4110         break;
4111     case float_round_to_zero:
4112         break;
4113     case float_round_up:
4114         if (!extractFloat64Sign(make_float64(z))) {
4115             z += roundBitsMask;
4116         }
4117         break;
4118     case float_round_down:
4119         if (extractFloat64Sign(make_float64(z))) {
4120             z += roundBitsMask;
4121         }
4122         break;
4123     default:
4124         abort();
4125     }
4126     z &= ~ roundBitsMask;
4127     if (z != float64_val(a)) {
4128         status->float_exception_flags |= float_flag_inexact;
4129     }
4130     return make_float64(z);
4131 
4132 }
4133 
4134 float64 float64_trunc_to_int(float64 a, float_status *status)
4135 {
4136     int oldmode;
4137     float64 res;
4138     oldmode = status->float_rounding_mode;
4139     status->float_rounding_mode = float_round_to_zero;
4140     res = float64_round_to_int(a, status);
4141     status->float_rounding_mode = oldmode;
4142     return res;
4143 }
4144 
4145 
4146 /*----------------------------------------------------------------------------
4147 | Returns the result of multiplying the double-precision floating-point values
4148 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4149 | for Binary Floating-Point Arithmetic.
4150 *----------------------------------------------------------------------------*/
4151 
4152 float64 float64_mul(float64 a, float64 b, float_status *status)
4153 {
4154     flag aSign, bSign, zSign;
4155     int aExp, bExp, zExp;
4156     uint64_t aSig, bSig, zSig0, zSig1;
4157 
4158     a = float64_squash_input_denormal(a, status);
4159     b = float64_squash_input_denormal(b, status);
4160 
4161     aSig = extractFloat64Frac( a );
4162     aExp = extractFloat64Exp( a );
4163     aSign = extractFloat64Sign( a );
4164     bSig = extractFloat64Frac( b );
4165     bExp = extractFloat64Exp( b );
4166     bSign = extractFloat64Sign( b );
4167     zSign = aSign ^ bSign;
4168     if ( aExp == 0x7FF ) {
4169         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4170             return propagateFloat64NaN(a, b, status);
4171         }
4172         if ( ( bExp | bSig ) == 0 ) {
4173             float_raise(float_flag_invalid, status);
4174             return float64_default_nan(status);
4175         }
4176         return packFloat64( zSign, 0x7FF, 0 );
4177     }
4178     if ( bExp == 0x7FF ) {
4179         if (bSig) {
4180             return propagateFloat64NaN(a, b, status);
4181         }
4182         if ( ( aExp | aSig ) == 0 ) {
4183             float_raise(float_flag_invalid, status);
4184             return float64_default_nan(status);
4185         }
4186         return packFloat64( zSign, 0x7FF, 0 );
4187     }
4188     if ( aExp == 0 ) {
4189         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4190         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4191     }
4192     if ( bExp == 0 ) {
4193         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
4194         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4195     }
4196     zExp = aExp + bExp - 0x3FF;
4197     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4198     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4199     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4200     zSig0 |= ( zSig1 != 0 );
4201     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
4202         zSig0 <<= 1;
4203         --zExp;
4204     }
4205     return roundAndPackFloat64(zSign, zExp, zSig0, status);
4206 
4207 }
4208 
4209 /*----------------------------------------------------------------------------
4210 | Returns the result of dividing the double-precision floating-point value `a'
4211 | by the corresponding value `b'.  The operation is performed according to
4212 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4213 *----------------------------------------------------------------------------*/
4214 
4215 float64 float64_div(float64 a, float64 b, float_status *status)
4216 {
4217     flag aSign, bSign, zSign;
4218     int aExp, bExp, zExp;
4219     uint64_t aSig, bSig, zSig;
4220     uint64_t rem0, rem1;
4221     uint64_t term0, term1;
4222     a = float64_squash_input_denormal(a, status);
4223     b = float64_squash_input_denormal(b, status);
4224 
4225     aSig = extractFloat64Frac( a );
4226     aExp = extractFloat64Exp( a );
4227     aSign = extractFloat64Sign( a );
4228     bSig = extractFloat64Frac( b );
4229     bExp = extractFloat64Exp( b );
4230     bSign = extractFloat64Sign( b );
4231     zSign = aSign ^ bSign;
4232     if ( aExp == 0x7FF ) {
4233         if (aSig) {
4234             return propagateFloat64NaN(a, b, status);
4235         }
4236         if ( bExp == 0x7FF ) {
4237             if (bSig) {
4238                 return propagateFloat64NaN(a, b, status);
4239             }
4240             float_raise(float_flag_invalid, status);
4241             return float64_default_nan(status);
4242         }
4243         return packFloat64( zSign, 0x7FF, 0 );
4244     }
4245     if ( bExp == 0x7FF ) {
4246         if (bSig) {
4247             return propagateFloat64NaN(a, b, status);
4248         }
4249         return packFloat64( zSign, 0, 0 );
4250     }
4251     if ( bExp == 0 ) {
4252         if ( bSig == 0 ) {
4253             if ( ( aExp | aSig ) == 0 ) {
4254                 float_raise(float_flag_invalid, status);
4255                 return float64_default_nan(status);
4256             }
4257             float_raise(float_flag_divbyzero, status);
4258             return packFloat64( zSign, 0x7FF, 0 );
4259         }
4260         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4261     }
4262     if ( aExp == 0 ) {
4263         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4264         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4265     }
4266     zExp = aExp - bExp + 0x3FD;
4267     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4268     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4269     if ( bSig <= ( aSig + aSig ) ) {
4270         aSig >>= 1;
4271         ++zExp;
4272     }
4273     zSig = estimateDiv128To64( aSig, 0, bSig );
4274     if ( ( zSig & 0x1FF ) <= 2 ) {
4275         mul64To128( bSig, zSig, &term0, &term1 );
4276         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4277         while ( (int64_t) rem0 < 0 ) {
4278             --zSig;
4279             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4280         }
4281         zSig |= ( rem1 != 0 );
4282     }
4283     return roundAndPackFloat64(zSign, zExp, zSig, status);
4284 
4285 }
4286 
4287 /*----------------------------------------------------------------------------
4288 | Returns the remainder of the double-precision floating-point value `a'
4289 | with respect to the corresponding value `b'.  The operation is performed
4290 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4291 *----------------------------------------------------------------------------*/
4292 
4293 float64 float64_rem(float64 a, float64 b, float_status *status)
4294 {
4295     flag aSign, zSign;
4296     int aExp, bExp, expDiff;
4297     uint64_t aSig, bSig;
4298     uint64_t q, alternateASig;
4299     int64_t sigMean;
4300 
4301     a = float64_squash_input_denormal(a, status);
4302     b = float64_squash_input_denormal(b, status);
4303     aSig = extractFloat64Frac( a );
4304     aExp = extractFloat64Exp( a );
4305     aSign = extractFloat64Sign( a );
4306     bSig = extractFloat64Frac( b );
4307     bExp = extractFloat64Exp( b );
4308     if ( aExp == 0x7FF ) {
4309         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4310             return propagateFloat64NaN(a, b, status);
4311         }
4312         float_raise(float_flag_invalid, status);
4313         return float64_default_nan(status);
4314     }
4315     if ( bExp == 0x7FF ) {
4316         if (bSig) {
4317             return propagateFloat64NaN(a, b, status);
4318         }
4319         return a;
4320     }
4321     if ( bExp == 0 ) {
4322         if ( bSig == 0 ) {
4323             float_raise(float_flag_invalid, status);
4324             return float64_default_nan(status);
4325         }
4326         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4327     }
4328     if ( aExp == 0 ) {
4329         if ( aSig == 0 ) return a;
4330         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4331     }
4332     expDiff = aExp - bExp;
4333     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4334     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4335     if ( expDiff < 0 ) {
4336         if ( expDiff < -1 ) return a;
4337         aSig >>= 1;
4338     }
4339     q = ( bSig <= aSig );
4340     if ( q ) aSig -= bSig;
4341     expDiff -= 64;
4342     while ( 0 < expDiff ) {
4343         q = estimateDiv128To64( aSig, 0, bSig );
4344         q = ( 2 < q ) ? q - 2 : 0;
4345         aSig = - ( ( bSig>>2 ) * q );
4346         expDiff -= 62;
4347     }
4348     expDiff += 64;
4349     if ( 0 < expDiff ) {
4350         q = estimateDiv128To64( aSig, 0, bSig );
4351         q = ( 2 < q ) ? q - 2 : 0;
4352         q >>= 64 - expDiff;
4353         bSig >>= 2;
4354         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4355     }
4356     else {
4357         aSig >>= 2;
4358         bSig >>= 2;
4359     }
4360     do {
4361         alternateASig = aSig;
4362         ++q;
4363         aSig -= bSig;
4364     } while ( 0 <= (int64_t) aSig );
4365     sigMean = aSig + alternateASig;
4366     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4367         aSig = alternateASig;
4368     }
4369     zSign = ( (int64_t) aSig < 0 );
4370     if ( zSign ) aSig = - aSig;
4371     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4372 
4373 }
4374 
4375 /*----------------------------------------------------------------------------
4376 | Returns the result of multiplying the double-precision floating-point values
4377 | `a' and `b' then adding 'c', with no intermediate rounding step after the
4378 | multiplication.  The operation is performed according to the IEC/IEEE
4379 | Standard for Binary Floating-Point Arithmetic 754-2008.
4380 | The flags argument allows the caller to select negation of the
4381 | addend, the intermediate product, or the final result. (The difference
4382 | between this and having the caller do a separate negation is that negating
4383 | externally will flip the sign bit on NaNs.)
4384 *----------------------------------------------------------------------------*/
4385 
4386 float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
4387                        float_status *status)
4388 {
4389     flag aSign, bSign, cSign, zSign;
4390     int aExp, bExp, cExp, pExp, zExp, expDiff;
4391     uint64_t aSig, bSig, cSig;
4392     flag pInf, pZero, pSign;
4393     uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4394     int shiftcount;
4395     flag signflip, infzero;
4396 
4397     a = float64_squash_input_denormal(a, status);
4398     b = float64_squash_input_denormal(b, status);
4399     c = float64_squash_input_denormal(c, status);
4400     aSig = extractFloat64Frac(a);
4401     aExp = extractFloat64Exp(a);
4402     aSign = extractFloat64Sign(a);
4403     bSig = extractFloat64Frac(b);
4404     bExp = extractFloat64Exp(b);
4405     bSign = extractFloat64Sign(b);
4406     cSig = extractFloat64Frac(c);
4407     cExp = extractFloat64Exp(c);
4408     cSign = extractFloat64Sign(c);
4409 
4410     infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4411                (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4412 
4413     /* It is implementation-defined whether the cases of (0,inf,qnan)
4414      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4415      * they return if they do), so we have to hand this information
4416      * off to the target-specific pick-a-NaN routine.
4417      */
4418     if (((aExp == 0x7ff) && aSig) ||
4419         ((bExp == 0x7ff) && bSig) ||
4420         ((cExp == 0x7ff) && cSig)) {
4421         return propagateFloat64MulAddNaN(a, b, c, infzero, status);
4422     }
4423 
4424     if (infzero) {
4425         float_raise(float_flag_invalid, status);
4426         return float64_default_nan(status);
4427     }
4428 
4429     if (flags & float_muladd_negate_c) {
4430         cSign ^= 1;
4431     }
4432 
4433     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4434 
4435     /* Work out the sign and type of the product */
4436     pSign = aSign ^ bSign;
4437     if (flags & float_muladd_negate_product) {
4438         pSign ^= 1;
4439     }
4440     pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4441     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4442 
4443     if (cExp == 0x7ff) {
4444         if (pInf && (pSign ^ cSign)) {
4445             /* addition of opposite-signed infinities => InvalidOperation */
4446             float_raise(float_flag_invalid, status);
4447             return float64_default_nan(status);
4448         }
4449         /* Otherwise generate an infinity of the same sign */
4450         return packFloat64(cSign ^ signflip, 0x7ff, 0);
4451     }
4452 
4453     if (pInf) {
4454         return packFloat64(pSign ^ signflip, 0x7ff, 0);
4455     }
4456 
4457     if (pZero) {
4458         if (cExp == 0) {
4459             if (cSig == 0) {
4460                 /* Adding two exact zeroes */
4461                 if (pSign == cSign) {
4462                     zSign = pSign;
4463                 } else if (status->float_rounding_mode == float_round_down) {
4464                     zSign = 1;
4465                 } else {
4466                     zSign = 0;
4467                 }
4468                 return packFloat64(zSign ^ signflip, 0, 0);
4469             }
4470             /* Exact zero plus a denorm */
4471             if (status->flush_to_zero) {
4472                 float_raise(float_flag_output_denormal, status);
4473                 return packFloat64(cSign ^ signflip, 0, 0);
4474             }
4475         }
4476         /* Zero plus something non-zero : just return the something */
4477         if (flags & float_muladd_halve_result) {
4478             if (cExp == 0) {
4479                 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4480             }
4481             /* Subtract one to halve, and one again because roundAndPackFloat64
4482              * wants one less than the true exponent.
4483              */
4484             cExp -= 2;
4485             cSig = (cSig | 0x0010000000000000ULL) << 10;
4486             return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
4487         }
4488         return packFloat64(cSign ^ signflip, cExp, cSig);
4489     }
4490 
4491     if (aExp == 0) {
4492         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4493     }
4494     if (bExp == 0) {
4495         normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4496     }
4497 
4498     /* Calculate the actual result a * b + c */
4499 
4500     /* Multiply first; this is easy. */
4501     /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4502      * because we want the true exponent, not the "one-less-than"
4503      * flavour that roundAndPackFloat64() takes.
4504      */
4505     pExp = aExp + bExp - 0x3fe;
4506     aSig = (aSig | LIT64(0x0010000000000000))<<10;
4507     bSig = (bSig | LIT64(0x0010000000000000))<<11;
4508     mul64To128(aSig, bSig, &pSig0, &pSig1);
4509     if ((int64_t)(pSig0 << 1) >= 0) {
4510         shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4511         pExp--;
4512     }
4513 
4514     zSign = pSign ^ signflip;
4515 
4516     /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4517      * bit in position 126.
4518      */
4519     if (cExp == 0) {
4520         if (!cSig) {
4521             /* Throw out the special case of c being an exact zero now */
4522             shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
4523             if (flags & float_muladd_halve_result) {
4524                 pExp--;
4525             }
4526             return roundAndPackFloat64(zSign, pExp - 1,
4527                                        pSig1, status);
4528         }
4529         normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4530     }
4531 
4532     /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4533      * significand of the addend, with the explicit bit in position 126.
4534      */
4535     cSig0 = cSig << (126 - 64 - 52);
4536     cSig1 = 0;
4537     cSig0 |= LIT64(0x4000000000000000);
4538     expDiff = pExp - cExp;
4539 
4540     if (pSign == cSign) {
4541         /* Addition */
4542         if (expDiff > 0) {
4543             /* scale c to match p */
4544             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4545             zExp = pExp;
4546         } else if (expDiff < 0) {
4547             /* scale p to match c */
4548             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4549             zExp = cExp;
4550         } else {
4551             /* no scaling needed */
4552             zExp = cExp;
4553         }
4554         /* Add significands and make sure explicit bit ends up in posn 126 */
4555         add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4556         if ((int64_t)zSig0 < 0) {
4557             shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4558         } else {
4559             zExp--;
4560         }
4561         shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
4562         if (flags & float_muladd_halve_result) {
4563             zExp--;
4564         }
4565         return roundAndPackFloat64(zSign, zExp, zSig1, status);
4566     } else {
4567         /* Subtraction */
4568         if (expDiff > 0) {
4569             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4570             sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4571             zExp = pExp;
4572         } else if (expDiff < 0) {
4573             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4574             sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4575             zExp = cExp;
4576             zSign ^= 1;
4577         } else {
4578             zExp = pExp;
4579             if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4580                 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4581             } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4582                 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4583                 zSign ^= 1;
4584             } else {
4585                 /* Exact zero */
4586                 zSign = signflip;
4587                 if (status->float_rounding_mode == float_round_down) {
4588                     zSign ^= 1;
4589                 }
4590                 return packFloat64(zSign, 0, 0);
4591             }
4592         }
4593         --zExp;
4594         /* Do the equivalent of normalizeRoundAndPackFloat64() but
4595          * starting with the significand in a pair of uint64_t.
4596          */
4597         if (zSig0) {
4598             shiftcount = countLeadingZeros64(zSig0) - 1;
4599             shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4600             if (zSig1) {
4601                 zSig0 |= 1;
4602             }
4603             zExp -= shiftcount;
4604         } else {
4605             shiftcount = countLeadingZeros64(zSig1);
4606             if (shiftcount == 0) {
4607                 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4608                 zExp -= 63;
4609             } else {
4610                 shiftcount--;
4611                 zSig0 = zSig1 << shiftcount;
4612                 zExp -= (shiftcount + 64);
4613             }
4614         }
4615         if (flags & float_muladd_halve_result) {
4616             zExp--;
4617         }
4618         return roundAndPackFloat64(zSign, zExp, zSig0, status);
4619     }
4620 }
4621 
4622 /*----------------------------------------------------------------------------
4623 | Returns the square root of the double-precision floating-point value `a'.
4624 | The operation is performed according to the IEC/IEEE Standard for Binary
4625 | Floating-Point Arithmetic.
4626 *----------------------------------------------------------------------------*/
4627 
4628 float64 float64_sqrt(float64 a, float_status *status)
4629 {
4630     flag aSign;
4631     int aExp, zExp;
4632     uint64_t aSig, zSig, doubleZSig;
4633     uint64_t rem0, rem1, term0, term1;
4634     a = float64_squash_input_denormal(a, status);
4635 
4636     aSig = extractFloat64Frac( a );
4637     aExp = extractFloat64Exp( a );
4638     aSign = extractFloat64Sign( a );
4639     if ( aExp == 0x7FF ) {
4640         if (aSig) {
4641             return propagateFloat64NaN(a, a, status);
4642         }
4643         if ( ! aSign ) return a;
4644         float_raise(float_flag_invalid, status);
4645         return float64_default_nan(status);
4646     }
4647     if ( aSign ) {
4648         if ( ( aExp | aSig ) == 0 ) return a;
4649         float_raise(float_flag_invalid, status);
4650         return float64_default_nan(status);
4651     }
4652     if ( aExp == 0 ) {
4653         if ( aSig == 0 ) return float64_zero;
4654         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4655     }
4656     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4657     aSig |= LIT64( 0x0010000000000000 );
4658     zSig = estimateSqrt32( aExp, aSig>>21 );
4659     aSig <<= 9 - ( aExp & 1 );
4660     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4661     if ( ( zSig & 0x1FF ) <= 5 ) {
4662         doubleZSig = zSig<<1;
4663         mul64To128( zSig, zSig, &term0, &term1 );
4664         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4665         while ( (int64_t) rem0 < 0 ) {
4666             --zSig;
4667             doubleZSig -= 2;
4668             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4669         }
4670         zSig |= ( ( rem0 | rem1 ) != 0 );
4671     }
4672     return roundAndPackFloat64(0, zExp, zSig, status);
4673 
4674 }
4675 
4676 /*----------------------------------------------------------------------------
4677 | Returns the binary log of the double-precision floating-point value `a'.
4678 | The operation is performed according to the IEC/IEEE Standard for Binary
4679 | Floating-Point Arithmetic.
4680 *----------------------------------------------------------------------------*/
4681 float64 float64_log2(float64 a, float_status *status)
4682 {
4683     flag aSign, zSign;
4684     int aExp;
4685     uint64_t aSig, aSig0, aSig1, zSig, i;
4686     a = float64_squash_input_denormal(a, status);
4687 
4688     aSig = extractFloat64Frac( a );
4689     aExp = extractFloat64Exp( a );
4690     aSign = extractFloat64Sign( a );
4691 
4692     if ( aExp == 0 ) {
4693         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4694         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4695     }
4696     if ( aSign ) {
4697         float_raise(float_flag_invalid, status);
4698         return float64_default_nan(status);
4699     }
4700     if ( aExp == 0x7FF ) {
4701         if (aSig) {
4702             return propagateFloat64NaN(a, float64_zero, status);
4703         }
4704         return a;
4705     }
4706 
4707     aExp -= 0x3FF;
4708     aSig |= LIT64( 0x0010000000000000 );
4709     zSign = aExp < 0;
4710     zSig = (uint64_t)aExp << 52;
4711     for (i = 1LL << 51; i > 0; i >>= 1) {
4712         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4713         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4714         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4715             aSig >>= 1;
4716             zSig |= i;
4717         }
4718     }
4719 
4720     if ( zSign )
4721         zSig = -zSig;
4722     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4723 }
4724 
4725 /*----------------------------------------------------------------------------
4726 | Returns 1 if the double-precision floating-point value `a' is equal to the
4727 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4728 | if either operand is a NaN.  Otherwise, the comparison is performed
4729 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4730 *----------------------------------------------------------------------------*/
4731 
4732 int float64_eq(float64 a, float64 b, float_status *status)
4733 {
4734     uint64_t av, bv;
4735     a = float64_squash_input_denormal(a, status);
4736     b = float64_squash_input_denormal(b, status);
4737 
4738     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4739          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4740        ) {
4741         float_raise(float_flag_invalid, status);
4742         return 0;
4743     }
4744     av = float64_val(a);
4745     bv = float64_val(b);
4746     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4747 
4748 }
4749 
4750 /*----------------------------------------------------------------------------
4751 | Returns 1 if the double-precision floating-point value `a' is less than or
4752 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4753 | exception is raised if either operand is a NaN.  The comparison is performed
4754 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4755 *----------------------------------------------------------------------------*/
4756 
4757 int float64_le(float64 a, float64 b, float_status *status)
4758 {
4759     flag aSign, bSign;
4760     uint64_t av, bv;
4761     a = float64_squash_input_denormal(a, status);
4762     b = float64_squash_input_denormal(b, status);
4763 
4764     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4765          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4766        ) {
4767         float_raise(float_flag_invalid, status);
4768         return 0;
4769     }
4770     aSign = extractFloat64Sign( a );
4771     bSign = extractFloat64Sign( b );
4772     av = float64_val(a);
4773     bv = float64_val(b);
4774     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4775     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4776 
4777 }
4778 
4779 /*----------------------------------------------------------------------------
4780 | Returns 1 if the double-precision floating-point value `a' is less than
4781 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4782 | raised if either operand is a NaN.  The comparison is performed according
4783 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4784 *----------------------------------------------------------------------------*/
4785 
4786 int float64_lt(float64 a, float64 b, float_status *status)
4787 {
4788     flag aSign, bSign;
4789     uint64_t av, bv;
4790 
4791     a = float64_squash_input_denormal(a, status);
4792     b = float64_squash_input_denormal(b, status);
4793     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4794          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4795        ) {
4796         float_raise(float_flag_invalid, status);
4797         return 0;
4798     }
4799     aSign = extractFloat64Sign( a );
4800     bSign = extractFloat64Sign( b );
4801     av = float64_val(a);
4802     bv = float64_val(b);
4803     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4804     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4805 
4806 }
4807 
4808 /*----------------------------------------------------------------------------
4809 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4810 | be compared, and 0 otherwise.  The invalid exception is raised if either
4811 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4812 | Standard for Binary Floating-Point Arithmetic.
4813 *----------------------------------------------------------------------------*/
4814 
4815 int float64_unordered(float64 a, float64 b, float_status *status)
4816 {
4817     a = float64_squash_input_denormal(a, status);
4818     b = float64_squash_input_denormal(b, status);
4819 
4820     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4821          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4822        ) {
4823         float_raise(float_flag_invalid, status);
4824         return 1;
4825     }
4826     return 0;
4827 }
4828 
4829 /*----------------------------------------------------------------------------
4830 | Returns 1 if the double-precision floating-point value `a' is equal to the
4831 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4832 | exception.The comparison is performed according to the IEC/IEEE Standard
4833 | for Binary Floating-Point Arithmetic.
4834 *----------------------------------------------------------------------------*/
4835 
4836 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4837 {
4838     uint64_t av, bv;
4839     a = float64_squash_input_denormal(a, status);
4840     b = float64_squash_input_denormal(b, status);
4841 
4842     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4843          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4844        ) {
4845         if (float64_is_signaling_nan(a, status)
4846          || float64_is_signaling_nan(b, status)) {
4847             float_raise(float_flag_invalid, status);
4848         }
4849         return 0;
4850     }
4851     av = float64_val(a);
4852     bv = float64_val(b);
4853     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4854 
4855 }
4856 
4857 /*----------------------------------------------------------------------------
4858 | Returns 1 if the double-precision floating-point value `a' is less than or
4859 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4860 | cause an exception.  Otherwise, the comparison is performed according to the
4861 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4862 *----------------------------------------------------------------------------*/
4863 
4864 int float64_le_quiet(float64 a, float64 b, float_status *status)
4865 {
4866     flag aSign, bSign;
4867     uint64_t av, bv;
4868     a = float64_squash_input_denormal(a, status);
4869     b = float64_squash_input_denormal(b, status);
4870 
4871     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4872          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4873        ) {
4874         if (float64_is_signaling_nan(a, status)
4875          || float64_is_signaling_nan(b, status)) {
4876             float_raise(float_flag_invalid, status);
4877         }
4878         return 0;
4879     }
4880     aSign = extractFloat64Sign( a );
4881     bSign = extractFloat64Sign( b );
4882     av = float64_val(a);
4883     bv = float64_val(b);
4884     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4885     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4886 
4887 }
4888 
4889 /*----------------------------------------------------------------------------
4890 | Returns 1 if the double-precision floating-point value `a' is less than
4891 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4892 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4893 | Standard for Binary Floating-Point Arithmetic.
4894 *----------------------------------------------------------------------------*/
4895 
4896 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4897 {
4898     flag aSign, bSign;
4899     uint64_t av, bv;
4900     a = float64_squash_input_denormal(a, status);
4901     b = float64_squash_input_denormal(b, status);
4902 
4903     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4904          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4905        ) {
4906         if (float64_is_signaling_nan(a, status)
4907          || float64_is_signaling_nan(b, status)) {
4908             float_raise(float_flag_invalid, status);
4909         }
4910         return 0;
4911     }
4912     aSign = extractFloat64Sign( a );
4913     bSign = extractFloat64Sign( b );
4914     av = float64_val(a);
4915     bv = float64_val(b);
4916     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4917     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4918 
4919 }
4920 
4921 /*----------------------------------------------------------------------------
4922 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4923 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4924 | comparison is performed according to the IEC/IEEE Standard for Binary
4925 | Floating-Point Arithmetic.
4926 *----------------------------------------------------------------------------*/
4927 
4928 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4929 {
4930     a = float64_squash_input_denormal(a, status);
4931     b = float64_squash_input_denormal(b, status);
4932 
4933     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4934          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4935        ) {
4936         if (float64_is_signaling_nan(a, status)
4937          || float64_is_signaling_nan(b, status)) {
4938             float_raise(float_flag_invalid, status);
4939         }
4940         return 1;
4941     }
4942     return 0;
4943 }
4944 
4945 /*----------------------------------------------------------------------------
4946 | Returns the result of converting the extended double-precision floating-
4947 | point value `a' to the 32-bit two's complement integer format.  The
4948 | conversion is performed according to the IEC/IEEE Standard for Binary
4949 | Floating-Point Arithmetic---which means in particular that the conversion
4950 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4951 | largest positive integer is returned.  Otherwise, if the conversion
4952 | overflows, the largest integer with the same sign as `a' is returned.
4953 *----------------------------------------------------------------------------*/
4954 
4955 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4956 {
4957     flag aSign;
4958     int32_t aExp, shiftCount;
4959     uint64_t aSig;
4960 
4961     if (floatx80_invalid_encoding(a)) {
4962         float_raise(float_flag_invalid, status);
4963         return 1 << 31;
4964     }
4965     aSig = extractFloatx80Frac( a );
4966     aExp = extractFloatx80Exp( a );
4967     aSign = extractFloatx80Sign( a );
4968     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4969     shiftCount = 0x4037 - aExp;
4970     if ( shiftCount <= 0 ) shiftCount = 1;
4971     shift64RightJamming( aSig, shiftCount, &aSig );
4972     return roundAndPackInt32(aSign, aSig, status);
4973 
4974 }
4975 
4976 /*----------------------------------------------------------------------------
4977 | Returns the result of converting the extended double-precision floating-
4978 | point value `a' to the 32-bit two's complement integer format.  The
4979 | conversion is performed according to the IEC/IEEE Standard for Binary
4980 | Floating-Point Arithmetic, except that the conversion is always rounded
4981 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4982 | Otherwise, if the conversion overflows, the largest integer with the same
4983 | sign as `a' is returned.
4984 *----------------------------------------------------------------------------*/
4985 
4986 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4987 {
4988     flag aSign;
4989     int32_t aExp, shiftCount;
4990     uint64_t aSig, savedASig;
4991     int32_t z;
4992 
4993     if (floatx80_invalid_encoding(a)) {
4994         float_raise(float_flag_invalid, status);
4995         return 1 << 31;
4996     }
4997     aSig = extractFloatx80Frac( a );
4998     aExp = extractFloatx80Exp( a );
4999     aSign = extractFloatx80Sign( a );
5000     if ( 0x401E < aExp ) {
5001         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5002         goto invalid;
5003     }
5004     else if ( aExp < 0x3FFF ) {
5005         if (aExp || aSig) {
5006             status->float_exception_flags |= float_flag_inexact;
5007         }
5008         return 0;
5009     }
5010     shiftCount = 0x403E - aExp;
5011     savedASig = aSig;
5012     aSig >>= shiftCount;
5013     z = aSig;
5014     if ( aSign ) z = - z;
5015     if ( ( z < 0 ) ^ aSign ) {
5016  invalid:
5017         float_raise(float_flag_invalid, status);
5018         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5019     }
5020     if ( ( aSig<<shiftCount ) != savedASig ) {
5021         status->float_exception_flags |= float_flag_inexact;
5022     }
5023     return z;
5024 
5025 }
5026 
5027 /*----------------------------------------------------------------------------
5028 | Returns the result of converting the extended double-precision floating-
5029 | point value `a' to the 64-bit two's complement integer format.  The
5030 | conversion is performed according to the IEC/IEEE Standard for Binary
5031 | Floating-Point Arithmetic---which means in particular that the conversion
5032 | is rounded according to the current rounding mode.  If `a' is a NaN,
5033 | the largest positive integer is returned.  Otherwise, if the conversion
5034 | overflows, the largest integer with the same sign as `a' is returned.
5035 *----------------------------------------------------------------------------*/
5036 
5037 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5038 {
5039     flag aSign;
5040     int32_t aExp, shiftCount;
5041     uint64_t aSig, aSigExtra;
5042 
5043     if (floatx80_invalid_encoding(a)) {
5044         float_raise(float_flag_invalid, status);
5045         return 1ULL << 63;
5046     }
5047     aSig = extractFloatx80Frac( a );
5048     aExp = extractFloatx80Exp( a );
5049     aSign = extractFloatx80Sign( a );
5050     shiftCount = 0x403E - aExp;
5051     if ( shiftCount <= 0 ) {
5052         if ( shiftCount ) {
5053             float_raise(float_flag_invalid, status);
5054             if (    ! aSign
5055                  || (    ( aExp == 0x7FFF )
5056                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
5057                ) {
5058                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5059             }
5060             return (int64_t) LIT64( 0x8000000000000000 );
5061         }
5062         aSigExtra = 0;
5063     }
5064     else {
5065         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5066     }
5067     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5068 
5069 }
5070 
5071 /*----------------------------------------------------------------------------
5072 | Returns the result of converting the extended double-precision floating-
5073 | point value `a' to the 64-bit two's complement integer format.  The
5074 | conversion is performed according to the IEC/IEEE Standard for Binary
5075 | Floating-Point Arithmetic, except that the conversion is always rounded
5076 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5077 | Otherwise, if the conversion overflows, the largest integer with the same
5078 | sign as `a' is returned.
5079 *----------------------------------------------------------------------------*/
5080 
5081 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5082 {
5083     flag aSign;
5084     int32_t aExp, shiftCount;
5085     uint64_t aSig;
5086     int64_t z;
5087 
5088     if (floatx80_invalid_encoding(a)) {
5089         float_raise(float_flag_invalid, status);
5090         return 1ULL << 63;
5091     }
5092     aSig = extractFloatx80Frac( a );
5093     aExp = extractFloatx80Exp( a );
5094     aSign = extractFloatx80Sign( a );
5095     shiftCount = aExp - 0x403E;
5096     if ( 0 <= shiftCount ) {
5097         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
5098         if ( ( a.high != 0xC03E ) || aSig ) {
5099             float_raise(float_flag_invalid, status);
5100             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5101                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5102             }
5103         }
5104         return (int64_t) LIT64( 0x8000000000000000 );
5105     }
5106     else if ( aExp < 0x3FFF ) {
5107         if (aExp | aSig) {
5108             status->float_exception_flags |= float_flag_inexact;
5109         }
5110         return 0;
5111     }
5112     z = aSig>>( - shiftCount );
5113     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5114         status->float_exception_flags |= float_flag_inexact;
5115     }
5116     if ( aSign ) z = - z;
5117     return z;
5118 
5119 }
5120 
5121 /*----------------------------------------------------------------------------
5122 | Returns the result of converting the extended double-precision floating-
5123 | point value `a' to the single-precision floating-point format.  The
5124 | conversion is performed according to the IEC/IEEE Standard for Binary
5125 | Floating-Point Arithmetic.
5126 *----------------------------------------------------------------------------*/
5127 
5128 float32 floatx80_to_float32(floatx80 a, float_status *status)
5129 {
5130     flag aSign;
5131     int32_t aExp;
5132     uint64_t aSig;
5133 
5134     if (floatx80_invalid_encoding(a)) {
5135         float_raise(float_flag_invalid, status);
5136         return float32_default_nan(status);
5137     }
5138     aSig = extractFloatx80Frac( a );
5139     aExp = extractFloatx80Exp( a );
5140     aSign = extractFloatx80Sign( a );
5141     if ( aExp == 0x7FFF ) {
5142         if ( (uint64_t) ( aSig<<1 ) ) {
5143             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5144         }
5145         return packFloat32( aSign, 0xFF, 0 );
5146     }
5147     shift64RightJamming( aSig, 33, &aSig );
5148     if ( aExp || aSig ) aExp -= 0x3F81;
5149     return roundAndPackFloat32(aSign, aExp, aSig, status);
5150 
5151 }
5152 
5153 /*----------------------------------------------------------------------------
5154 | Returns the result of converting the extended double-precision floating-
5155 | point value `a' to the double-precision floating-point format.  The
5156 | conversion is performed according to the IEC/IEEE Standard for Binary
5157 | Floating-Point Arithmetic.
5158 *----------------------------------------------------------------------------*/
5159 
5160 float64 floatx80_to_float64(floatx80 a, float_status *status)
5161 {
5162     flag aSign;
5163     int32_t aExp;
5164     uint64_t aSig, zSig;
5165 
5166     if (floatx80_invalid_encoding(a)) {
5167         float_raise(float_flag_invalid, status);
5168         return float64_default_nan(status);
5169     }
5170     aSig = extractFloatx80Frac( a );
5171     aExp = extractFloatx80Exp( a );
5172     aSign = extractFloatx80Sign( a );
5173     if ( aExp == 0x7FFF ) {
5174         if ( (uint64_t) ( aSig<<1 ) ) {
5175             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5176         }
5177         return packFloat64( aSign, 0x7FF, 0 );
5178     }
5179     shift64RightJamming( aSig, 1, &zSig );
5180     if ( aExp || aSig ) aExp -= 0x3C01;
5181     return roundAndPackFloat64(aSign, aExp, zSig, status);
5182 
5183 }
5184 
5185 /*----------------------------------------------------------------------------
5186 | Returns the result of converting the extended double-precision floating-
5187 | point value `a' to the quadruple-precision floating-point format.  The
5188 | conversion is performed according to the IEC/IEEE Standard for Binary
5189 | Floating-Point Arithmetic.
5190 *----------------------------------------------------------------------------*/
5191 
5192 float128 floatx80_to_float128(floatx80 a, float_status *status)
5193 {
5194     flag aSign;
5195     int aExp;
5196     uint64_t aSig, zSig0, zSig1;
5197 
5198     if (floatx80_invalid_encoding(a)) {
5199         float_raise(float_flag_invalid, status);
5200         return float128_default_nan(status);
5201     }
5202     aSig = extractFloatx80Frac( a );
5203     aExp = extractFloatx80Exp( a );
5204     aSign = extractFloatx80Sign( a );
5205     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5206         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5207     }
5208     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5209     return packFloat128( aSign, aExp, zSig0, zSig1 );
5210 
5211 }
5212 
5213 /*----------------------------------------------------------------------------
5214 | Rounds the extended double-precision floating-point value `a'
5215 | to the precision provided by floatx80_rounding_precision and returns the
5216 | result as an extended double-precision floating-point value.
5217 | The operation is performed according to the IEC/IEEE Standard for Binary
5218 | Floating-Point Arithmetic.
5219 *----------------------------------------------------------------------------*/
5220 
5221 floatx80 floatx80_round(floatx80 a, float_status *status)
5222 {
5223     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5224                                 extractFloatx80Sign(a),
5225                                 extractFloatx80Exp(a),
5226                                 extractFloatx80Frac(a), 0, status);
5227 }
5228 
5229 /*----------------------------------------------------------------------------
5230 | Rounds the extended double-precision floating-point value `a' to an integer,
5231 | and returns the result as an extended quadruple-precision floating-point
5232 | value.  The operation is performed according to the IEC/IEEE Standard for
5233 | Binary Floating-Point Arithmetic.
5234 *----------------------------------------------------------------------------*/
5235 
5236 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5237 {
5238     flag aSign;
5239     int32_t aExp;
5240     uint64_t lastBitMask, roundBitsMask;
5241     floatx80 z;
5242 
5243     if (floatx80_invalid_encoding(a)) {
5244         float_raise(float_flag_invalid, status);
5245         return floatx80_default_nan(status);
5246     }
5247     aExp = extractFloatx80Exp( a );
5248     if ( 0x403E <= aExp ) {
5249         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5250             return propagateFloatx80NaN(a, a, status);
5251         }
5252         return a;
5253     }
5254     if ( aExp < 0x3FFF ) {
5255         if (    ( aExp == 0 )
5256              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5257             return a;
5258         }
5259         status->float_exception_flags |= float_flag_inexact;
5260         aSign = extractFloatx80Sign( a );
5261         switch (status->float_rounding_mode) {
5262          case float_round_nearest_even:
5263             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5264                ) {
5265                 return
5266                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5267             }
5268             break;
5269         case float_round_ties_away:
5270             if (aExp == 0x3FFE) {
5271                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5272             }
5273             break;
5274          case float_round_down:
5275             return
5276                   aSign ?
5277                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5278                 : packFloatx80( 0, 0, 0 );
5279          case float_round_up:
5280             return
5281                   aSign ? packFloatx80( 1, 0, 0 )
5282                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5283         }
5284         return packFloatx80( aSign, 0, 0 );
5285     }
5286     lastBitMask = 1;
5287     lastBitMask <<= 0x403E - aExp;
5288     roundBitsMask = lastBitMask - 1;
5289     z = a;
5290     switch (status->float_rounding_mode) {
5291     case float_round_nearest_even:
5292         z.low += lastBitMask>>1;
5293         if ((z.low & roundBitsMask) == 0) {
5294             z.low &= ~lastBitMask;
5295         }
5296         break;
5297     case float_round_ties_away:
5298         z.low += lastBitMask >> 1;
5299         break;
5300     case float_round_to_zero:
5301         break;
5302     case float_round_up:
5303         if (!extractFloatx80Sign(z)) {
5304             z.low += roundBitsMask;
5305         }
5306         break;
5307     case float_round_down:
5308         if (extractFloatx80Sign(z)) {
5309             z.low += roundBitsMask;
5310         }
5311         break;
5312     default:
5313         abort();
5314     }
5315     z.low &= ~ roundBitsMask;
5316     if ( z.low == 0 ) {
5317         ++z.high;
5318         z.low = LIT64( 0x8000000000000000 );
5319     }
5320     if (z.low != a.low) {
5321         status->float_exception_flags |= float_flag_inexact;
5322     }
5323     return z;
5324 
5325 }
5326 
5327 /*----------------------------------------------------------------------------
5328 | Returns the result of adding the absolute values of the extended double-
5329 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5330 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5331 | The addition is performed according to the IEC/IEEE Standard for Binary
5332 | Floating-Point Arithmetic.
5333 *----------------------------------------------------------------------------*/
5334 
5335 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5336                                 float_status *status)
5337 {
5338     int32_t aExp, bExp, zExp;
5339     uint64_t aSig, bSig, zSig0, zSig1;
5340     int32_t expDiff;
5341 
5342     aSig = extractFloatx80Frac( a );
5343     aExp = extractFloatx80Exp( a );
5344     bSig = extractFloatx80Frac( b );
5345     bExp = extractFloatx80Exp( b );
5346     expDiff = aExp - bExp;
5347     if ( 0 < expDiff ) {
5348         if ( aExp == 0x7FFF ) {
5349             if ((uint64_t)(aSig << 1)) {
5350                 return propagateFloatx80NaN(a, b, status);
5351             }
5352             return a;
5353         }
5354         if ( bExp == 0 ) --expDiff;
5355         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5356         zExp = aExp;
5357     }
5358     else if ( expDiff < 0 ) {
5359         if ( bExp == 0x7FFF ) {
5360             if ((uint64_t)(bSig << 1)) {
5361                 return propagateFloatx80NaN(a, b, status);
5362             }
5363             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5364         }
5365         if ( aExp == 0 ) ++expDiff;
5366         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5367         zExp = bExp;
5368     }
5369     else {
5370         if ( aExp == 0x7FFF ) {
5371             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5372                 return propagateFloatx80NaN(a, b, status);
5373             }
5374             return a;
5375         }
5376         zSig1 = 0;
5377         zSig0 = aSig + bSig;
5378         if ( aExp == 0 ) {
5379             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5380             goto roundAndPack;
5381         }
5382         zExp = aExp;
5383         goto shiftRight1;
5384     }
5385     zSig0 = aSig + bSig;
5386     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5387  shiftRight1:
5388     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5389     zSig0 |= LIT64( 0x8000000000000000 );
5390     ++zExp;
5391  roundAndPack:
5392     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5393                                 zSign, zExp, zSig0, zSig1, status);
5394 }
5395 
/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the extended
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  Subtracting two infinities raises the invalid exception
| and returns the default NaN.  An exactly zero difference is returned as
| -0 in round-down mode and as +0 otherwise.  The subtraction is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here down to the bExpBigger label. */
    if ( aExp == 0x7FFF ) {
        /* Both operands are infinities or NaNs. */
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Infinity minus infinity is an invalid operation. */
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* Both exponent fields are zero: use exponent 1 for the result. */
        aExp = 1;
        bExp = 1;
    }
    zSig1 = 0;
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Identical magnitudes: zero, negative only when rounding down. */
    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Finite minus infinity: infinity with the opposite sign. */
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
    }
    /* expDiff is negative here; a zero exponent field on `a' shortens the
     * alignment shift by one (presumably because exponent field 0 has the
     * same scale as 1 -- NOTE(review): confirm). */
    if ( aExp == 0 ) ++expDiff;
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 bBigger:
    /* |b| > |a|: compute b - a and flip the result sign. */
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Infinity minus finite: `a' itself. */
        return a;
    }
    /* Mirror of the adjustment above, for a zero exponent field on `b'. */
    if ( bExp == 0 ) --expDiff;
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 aBigger:
    /* |a| > |b|: compute a - b, keeping zSign. */
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         zSign, zExp, zSig0, zSig1, status);
}
5463 
5464 /*----------------------------------------------------------------------------
5465 | Returns the result of adding the extended double-precision floating-point
5466 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5467 | Standard for Binary Floating-Point Arithmetic.
5468 *----------------------------------------------------------------------------*/
5469 
5470 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5471 {
5472     flag aSign, bSign;
5473 
5474     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5475         float_raise(float_flag_invalid, status);
5476         return floatx80_default_nan(status);
5477     }
5478     aSign = extractFloatx80Sign( a );
5479     bSign = extractFloatx80Sign( b );
5480     if ( aSign == bSign ) {
5481         return addFloatx80Sigs(a, b, aSign, status);
5482     }
5483     else {
5484         return subFloatx80Sigs(a, b, aSign, status);
5485     }
5486 
5487 }
5488 
5489 /*----------------------------------------------------------------------------
5490 | Returns the result of subtracting the extended double-precision floating-
5491 | point values `a' and `b'.  The operation is performed according to the
5492 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5493 *----------------------------------------------------------------------------*/
5494 
5495 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5496 {
5497     flag aSign, bSign;
5498 
5499     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5500         float_raise(float_flag_invalid, status);
5501         return floatx80_default_nan(status);
5502     }
5503     aSign = extractFloatx80Sign( a );
5504     bSign = extractFloatx80Sign( b );
5505     if ( aSign == bSign ) {
5506         return subFloatx80Sigs(a, b, aSign, status);
5507     }
5508     else {
5509         return addFloatx80Sigs(a, b, aSign, status);
5510     }
5511 
5512 }
5513 
5514 /*----------------------------------------------------------------------------
5515 | Returns the result of multiplying the extended double-precision floating-
5516 | point values `a' and `b'.  The operation is performed according to the
5517 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5518 *----------------------------------------------------------------------------*/
5519 
5520 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5521 {
5522     flag aSign, bSign, zSign;
5523     int32_t aExp, bExp, zExp;
5524     uint64_t aSig, bSig, zSig0, zSig1;
5525 
5526     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5527         float_raise(float_flag_invalid, status);
5528         return floatx80_default_nan(status);
5529     }
5530     aSig = extractFloatx80Frac( a );
5531     aExp = extractFloatx80Exp( a );
5532     aSign = extractFloatx80Sign( a );
5533     bSig = extractFloatx80Frac( b );
5534     bExp = extractFloatx80Exp( b );
5535     bSign = extractFloatx80Sign( b );
5536     zSign = aSign ^ bSign;
5537     if ( aExp == 0x7FFF ) {
5538         if (    (uint64_t) ( aSig<<1 )
5539              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5540             return propagateFloatx80NaN(a, b, status);
5541         }
5542         if ( ( bExp | bSig ) == 0 ) goto invalid;
5543         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5544     }
5545     if ( bExp == 0x7FFF ) {
5546         if ((uint64_t)(bSig << 1)) {
5547             return propagateFloatx80NaN(a, b, status);
5548         }
5549         if ( ( aExp | aSig ) == 0 ) {
5550  invalid:
5551             float_raise(float_flag_invalid, status);
5552             return floatx80_default_nan(status);
5553         }
5554         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5555     }
5556     if ( aExp == 0 ) {
5557         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5558         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5559     }
5560     if ( bExp == 0 ) {
5561         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5562         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5563     }
5564     zExp = aExp + bExp - 0x3FFE;
5565     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5566     if ( 0 < (int64_t) zSig0 ) {
5567         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5568         --zExp;
5569     }
5570     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5571                                 zSign, zExp, zSig0, zSig1, status);
5572 }
5573 
5574 /*----------------------------------------------------------------------------
5575 | Returns the result of dividing the extended double-precision floating-point
5576 | value `a' by the corresponding value `b'.  The operation is performed
5577 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5578 *----------------------------------------------------------------------------*/
5579 
5580 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5581 {
5582     flag aSign, bSign, zSign;
5583     int32_t aExp, bExp, zExp;
5584     uint64_t aSig, bSig, zSig0, zSig1;
5585     uint64_t rem0, rem1, rem2, term0, term1, term2;
5586 
5587     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5588         float_raise(float_flag_invalid, status);
5589         return floatx80_default_nan(status);
5590     }
5591     aSig = extractFloatx80Frac( a );
5592     aExp = extractFloatx80Exp( a );
5593     aSign = extractFloatx80Sign( a );
5594     bSig = extractFloatx80Frac( b );
5595     bExp = extractFloatx80Exp( b );
5596     bSign = extractFloatx80Sign( b );
5597     zSign = aSign ^ bSign;
5598     if ( aExp == 0x7FFF ) {
5599         if ((uint64_t)(aSig << 1)) {
5600             return propagateFloatx80NaN(a, b, status);
5601         }
5602         if ( bExp == 0x7FFF ) {
5603             if ((uint64_t)(bSig << 1)) {
5604                 return propagateFloatx80NaN(a, b, status);
5605             }
5606             goto invalid;
5607         }
5608         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5609     }
5610     if ( bExp == 0x7FFF ) {
5611         if ((uint64_t)(bSig << 1)) {
5612             return propagateFloatx80NaN(a, b, status);
5613         }
5614         return packFloatx80( zSign, 0, 0 );
5615     }
5616     if ( bExp == 0 ) {
5617         if ( bSig == 0 ) {
5618             if ( ( aExp | aSig ) == 0 ) {
5619  invalid:
5620                 float_raise(float_flag_invalid, status);
5621                 return floatx80_default_nan(status);
5622             }
5623             float_raise(float_flag_divbyzero, status);
5624             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5625         }
5626         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5627     }
5628     if ( aExp == 0 ) {
5629         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5630         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5631     }
5632     zExp = aExp - bExp + 0x3FFE;
5633     rem1 = 0;
5634     if ( bSig <= aSig ) {
5635         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5636         ++zExp;
5637     }
5638     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5639     mul64To128( bSig, zSig0, &term0, &term1 );
5640     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5641     while ( (int64_t) rem0 < 0 ) {
5642         --zSig0;
5643         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5644     }
5645     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5646     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5647         mul64To128( bSig, zSig1, &term1, &term2 );
5648         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5649         while ( (int64_t) rem1 < 0 ) {
5650             --zSig1;
5651             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5652         }
5653         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5654     }
5655     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5656                                 zSign, zExp, zSig0, zSig1, status);
5657 }
5658 
5659 /*----------------------------------------------------------------------------
5660 | Returns the remainder of the extended double-precision floating-point value
5661 | `a' with respect to the corresponding value `b'.  The operation is performed
5662 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5663 *----------------------------------------------------------------------------*/
5664 
5665 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5666 {
5667     flag aSign, zSign;
5668     int32_t aExp, bExp, expDiff;
5669     uint64_t aSig0, aSig1, bSig;
5670     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5671 
5672     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5673         float_raise(float_flag_invalid, status);
5674         return floatx80_default_nan(status);
5675     }
5676     aSig0 = extractFloatx80Frac( a );
5677     aExp = extractFloatx80Exp( a );
5678     aSign = extractFloatx80Sign( a );
5679     bSig = extractFloatx80Frac( b );
5680     bExp = extractFloatx80Exp( b );
5681     if ( aExp == 0x7FFF ) {
5682         if (    (uint64_t) ( aSig0<<1 )
5683              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5684             return propagateFloatx80NaN(a, b, status);
5685         }
5686         goto invalid;
5687     }
5688     if ( bExp == 0x7FFF ) {
5689         if ((uint64_t)(bSig << 1)) {
5690             return propagateFloatx80NaN(a, b, status);
5691         }
5692         return a;
5693     }
5694     if ( bExp == 0 ) {
5695         if ( bSig == 0 ) {
5696  invalid:
5697             float_raise(float_flag_invalid, status);
5698             return floatx80_default_nan(status);
5699         }
5700         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5701     }
5702     if ( aExp == 0 ) {
5703         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5704         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5705     }
5706     bSig |= LIT64( 0x8000000000000000 );
5707     zSign = aSign;
5708     expDiff = aExp - bExp;
5709     aSig1 = 0;
5710     if ( expDiff < 0 ) {
5711         if ( expDiff < -1 ) return a;
5712         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5713         expDiff = 0;
5714     }
5715     q = ( bSig <= aSig0 );
5716     if ( q ) aSig0 -= bSig;
5717     expDiff -= 64;
5718     while ( 0 < expDiff ) {
5719         q = estimateDiv128To64( aSig0, aSig1, bSig );
5720         q = ( 2 < q ) ? q - 2 : 0;
5721         mul64To128( bSig, q, &term0, &term1 );
5722         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5723         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5724         expDiff -= 62;
5725     }
5726     expDiff += 64;
5727     if ( 0 < expDiff ) {
5728         q = estimateDiv128To64( aSig0, aSig1, bSig );
5729         q = ( 2 < q ) ? q - 2 : 0;
5730         q >>= 64 - expDiff;
5731         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5732         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5733         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5734         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5735             ++q;
5736             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5737         }
5738     }
5739     else {
5740         term1 = 0;
5741         term0 = bSig;
5742     }
5743     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5744     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5745          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5746               && ( q & 1 ) )
5747        ) {
5748         aSig0 = alternateASig0;
5749         aSig1 = alternateASig1;
5750         zSign = ! zSign;
5751     }
5752     return
5753         normalizeRoundAndPackFloatx80(
5754             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5755 
5756 }
5757 
5758 /*----------------------------------------------------------------------------
5759 | Returns the square root of the extended double-precision floating-point
5760 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5761 | for Binary Floating-Point Arithmetic.
5762 *----------------------------------------------------------------------------*/
5763 
5764 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5765 {
5766     flag aSign;
5767     int32_t aExp, zExp;
5768     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5769     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5770 
5771     if (floatx80_invalid_encoding(a)) {
5772         float_raise(float_flag_invalid, status);
5773         return floatx80_default_nan(status);
5774     }
5775     aSig0 = extractFloatx80Frac( a );
5776     aExp = extractFloatx80Exp( a );
5777     aSign = extractFloatx80Sign( a );
5778     if ( aExp == 0x7FFF ) {
5779         if ((uint64_t)(aSig0 << 1)) {
5780             return propagateFloatx80NaN(a, a, status);
5781         }
5782         if ( ! aSign ) return a;
5783         goto invalid;
5784     }
5785     if ( aSign ) {
5786         if ( ( aExp | aSig0 ) == 0 ) return a;
5787  invalid:
5788         float_raise(float_flag_invalid, status);
5789         return floatx80_default_nan(status);
5790     }
5791     if ( aExp == 0 ) {
5792         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5793         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5794     }
5795     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5796     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5797     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5798     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5799     doubleZSig0 = zSig0<<1;
5800     mul64To128( zSig0, zSig0, &term0, &term1 );
5801     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5802     while ( (int64_t) rem0 < 0 ) {
5803         --zSig0;
5804         doubleZSig0 -= 2;
5805         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5806     }
5807     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5808     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5809         if ( zSig1 == 0 ) zSig1 = 1;
5810         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5811         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5812         mul64To128( zSig1, zSig1, &term2, &term3 );
5813         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5814         while ( (int64_t) rem1 < 0 ) {
5815             --zSig1;
5816             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5817             term3 |= 1;
5818             term2 |= doubleZSig0;
5819             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5820         }
5821         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5822     }
5823     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5824     zSig0 |= doubleZSig0;
5825     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5826                                 0, zExp, zSig0, zSig1, status);
5827 }
5828 
5829 /*----------------------------------------------------------------------------
5830 | Returns 1 if the extended double-precision floating-point value `a' is equal
5831 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5832 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5833 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5834 *----------------------------------------------------------------------------*/
5835 
5836 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5837 {
5838 
5839     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5840         || (extractFloatx80Exp(a) == 0x7FFF
5841             && (uint64_t) (extractFloatx80Frac(a) << 1))
5842         || (extractFloatx80Exp(b) == 0x7FFF
5843             && (uint64_t) (extractFloatx80Frac(b) << 1))
5844        ) {
5845         float_raise(float_flag_invalid, status);
5846         return 0;
5847     }
5848     return
5849            ( a.low == b.low )
5850         && (    ( a.high == b.high )
5851              || (    ( a.low == 0 )
5852                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5853            );
5854 
5855 }
5856 
5857 /*----------------------------------------------------------------------------
5858 | Returns 1 if the extended double-precision floating-point value `a' is
5859 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5860 | invalid exception is raised if either operand is a NaN.  The comparison is
5861 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5862 | Arithmetic.
5863 *----------------------------------------------------------------------------*/
5864 
5865 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5866 {
5867     flag aSign, bSign;
5868 
5869     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5870         || (extractFloatx80Exp(a) == 0x7FFF
5871             && (uint64_t) (extractFloatx80Frac(a) << 1))
5872         || (extractFloatx80Exp(b) == 0x7FFF
5873             && (uint64_t) (extractFloatx80Frac(b) << 1))
5874        ) {
5875         float_raise(float_flag_invalid, status);
5876         return 0;
5877     }
5878     aSign = extractFloatx80Sign( a );
5879     bSign = extractFloatx80Sign( b );
5880     if ( aSign != bSign ) {
5881         return
5882                aSign
5883             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5884                  == 0 );
5885     }
5886     return
5887           aSign ? le128( b.high, b.low, a.high, a.low )
5888         : le128( a.high, a.low, b.high, b.low );
5889 
5890 }
5891 
5892 /*----------------------------------------------------------------------------
5893 | Returns 1 if the extended double-precision floating-point value `a' is
5894 | less than the corresponding value `b', and 0 otherwise.  The invalid
5895 | exception is raised if either operand is a NaN.  The comparison is performed
5896 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5897 *----------------------------------------------------------------------------*/
5898 
5899 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5900 {
5901     flag aSign, bSign;
5902 
5903     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5904         || (extractFloatx80Exp(a) == 0x7FFF
5905             && (uint64_t) (extractFloatx80Frac(a) << 1))
5906         || (extractFloatx80Exp(b) == 0x7FFF
5907             && (uint64_t) (extractFloatx80Frac(b) << 1))
5908        ) {
5909         float_raise(float_flag_invalid, status);
5910         return 0;
5911     }
5912     aSign = extractFloatx80Sign( a );
5913     bSign = extractFloatx80Sign( b );
5914     if ( aSign != bSign ) {
5915         return
5916                aSign
5917             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5918                  != 0 );
5919     }
5920     return
5921           aSign ? lt128( b.high, b.low, a.high, a.low )
5922         : lt128( a.high, a.low, b.high, b.low );
5923 
5924 }
5925 
5926 /*----------------------------------------------------------------------------
5927 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5928 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5929 | either operand is a NaN.   The comparison is performed according to the
5930 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5931 *----------------------------------------------------------------------------*/
5932 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5933 {
5934     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5935         || (extractFloatx80Exp(a) == 0x7FFF
5936             && (uint64_t) (extractFloatx80Frac(a) << 1))
5937         || (extractFloatx80Exp(b) == 0x7FFF
5938             && (uint64_t) (extractFloatx80Frac(b) << 1))
5939        ) {
5940         float_raise(float_flag_invalid, status);
5941         return 1;
5942     }
5943     return 0;
5944 }
5945 
5946 /*----------------------------------------------------------------------------
5947 | Returns 1 if the extended double-precision floating-point value `a' is
5948 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5949 | cause an exception.  The comparison is performed according to the IEC/IEEE
5950 | Standard for Binary Floating-Point Arithmetic.
5951 *----------------------------------------------------------------------------*/
5952 
5953 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5954 {
5955 
5956     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5957         float_raise(float_flag_invalid, status);
5958         return 0;
5959     }
5960     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5961               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5962          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5963               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5964        ) {
5965         if (floatx80_is_signaling_nan(a, status)
5966          || floatx80_is_signaling_nan(b, status)) {
5967             float_raise(float_flag_invalid, status);
5968         }
5969         return 0;
5970     }
5971     return
5972            ( a.low == b.low )
5973         && (    ( a.high == b.high )
5974              || (    ( a.low == 0 )
5975                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5976            );
5977 
5978 }
5979 
5980 /*----------------------------------------------------------------------------
5981 | Returns 1 if the extended double-precision floating-point value `a' is less
5982 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5983 | do not cause an exception.  Otherwise, the comparison is performed according
5984 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5985 *----------------------------------------------------------------------------*/
5986 
5987 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5988 {
5989     flag aSign, bSign;
5990 
5991     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5992         float_raise(float_flag_invalid, status);
5993         return 0;
5994     }
5995     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5996               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5997          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5998               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5999        ) {
6000         if (floatx80_is_signaling_nan(a, status)
6001          || floatx80_is_signaling_nan(b, status)) {
6002             float_raise(float_flag_invalid, status);
6003         }
6004         return 0;
6005     }
6006     aSign = extractFloatx80Sign( a );
6007     bSign = extractFloatx80Sign( b );
6008     if ( aSign != bSign ) {
6009         return
6010                aSign
6011             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6012                  == 0 );
6013     }
6014     return
6015           aSign ? le128( b.high, b.low, a.high, a.low )
6016         : le128( a.high, a.low, b.high, b.low );
6017 
6018 }
6019 
6020 /*----------------------------------------------------------------------------
6021 | Returns 1 if the extended double-precision floating-point value `a' is less
6022 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
6023 | an exception.  Otherwise, the comparison is performed according to the
6024 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6025 *----------------------------------------------------------------------------*/
6026 
6027 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
6028 {
6029     flag aSign, bSign;
6030 
6031     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6032         float_raise(float_flag_invalid, status);
6033         return 0;
6034     }
6035     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6036               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6037          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6038               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6039        ) {
6040         if (floatx80_is_signaling_nan(a, status)
6041          || floatx80_is_signaling_nan(b, status)) {
6042             float_raise(float_flag_invalid, status);
6043         }
6044         return 0;
6045     }
6046     aSign = extractFloatx80Sign( a );
6047     bSign = extractFloatx80Sign( b );
6048     if ( aSign != bSign ) {
6049         return
6050                aSign
6051             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6052                  != 0 );
6053     }
6054     return
6055           aSign ? lt128( b.high, b.low, a.high, a.low )
6056         : lt128( a.high, a.low, b.high, b.low );
6057 
6058 }
6059 
6060 /*----------------------------------------------------------------------------
6061 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6062 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
6063 | The comparison is performed according to the IEC/IEEE Standard for Binary
6064 | Floating-Point Arithmetic.
6065 *----------------------------------------------------------------------------*/
6066 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
6067 {
6068     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6069         float_raise(float_flag_invalid, status);
6070         return 1;
6071     }
6072     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6073               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6074          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6075               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6076        ) {
6077         if (floatx80_is_signaling_nan(a, status)
6078          || floatx80_is_signaling_nan(b, status)) {
6079             float_raise(float_flag_invalid, status);
6080         }
6081         return 1;
6082     }
6083     return 0;
6084 }
6085 
6086 /*----------------------------------------------------------------------------
6087 | Returns the result of converting the quadruple-precision floating-point
6088 | value `a' to the 32-bit two's complement integer format.  The conversion
6089 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6090 | Arithmetic---which means in particular that the conversion is rounded
6091 | according to the current rounding mode.  If `a' is a NaN, the largest
6092 | positive integer is returned.  Otherwise, if the conversion overflows, the
6093 | largest integer with the same sign as `a' is returned.
6094 *----------------------------------------------------------------------------*/
6095 
6096 int32_t float128_to_int32(float128 a, float_status *status)
6097 {
6098     flag aSign;
6099     int32_t aExp, shiftCount;
6100     uint64_t aSig0, aSig1;
6101 
6102     aSig1 = extractFloat128Frac1( a );
6103     aSig0 = extractFloat128Frac0( a );
6104     aExp = extractFloat128Exp( a );
6105     aSign = extractFloat128Sign( a );
6106     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6107     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6108     aSig0 |= ( aSig1 != 0 );
6109     shiftCount = 0x4028 - aExp;
6110     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6111     return roundAndPackInt32(aSign, aSig0, status);
6112 
6113 }
6114 
6115 /*----------------------------------------------------------------------------
6116 | Returns the result of converting the quadruple-precision floating-point
6117 | value `a' to the 32-bit two's complement integer format.  The conversion
6118 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6119 | Arithmetic, except that the conversion is always rounded toward zero.  If
6120 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6121 | conversion overflows, the largest integer with the same sign as `a' is
6122 | returned.
6123 *----------------------------------------------------------------------------*/
6124 
6125 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6126 {
6127     flag aSign;
6128     int32_t aExp, shiftCount;
6129     uint64_t aSig0, aSig1, savedASig;
6130     int32_t z;
6131 
6132     aSig1 = extractFloat128Frac1( a );
6133     aSig0 = extractFloat128Frac0( a );
6134     aExp = extractFloat128Exp( a );
6135     aSign = extractFloat128Sign( a );
6136     aSig0 |= ( aSig1 != 0 );
6137     if ( 0x401E < aExp ) {
6138         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6139         goto invalid;
6140     }
6141     else if ( aExp < 0x3FFF ) {
6142         if (aExp || aSig0) {
6143             status->float_exception_flags |= float_flag_inexact;
6144         }
6145         return 0;
6146     }
6147     aSig0 |= LIT64( 0x0001000000000000 );
6148     shiftCount = 0x402F - aExp;
6149     savedASig = aSig0;
6150     aSig0 >>= shiftCount;
6151     z = aSig0;
6152     if ( aSign ) z = - z;
6153     if ( ( z < 0 ) ^ aSign ) {
6154  invalid:
6155         float_raise(float_flag_invalid, status);
6156         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
6157     }
6158     if ( ( aSig0<<shiftCount ) != savedASig ) {
6159         status->float_exception_flags |= float_flag_inexact;
6160     }
6161     return z;
6162 
6163 }
6164 
6165 /*----------------------------------------------------------------------------
6166 | Returns the result of converting the quadruple-precision floating-point
6167 | value `a' to the 64-bit two's complement integer format.  The conversion
6168 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6169 | Arithmetic---which means in particular that the conversion is rounded
6170 | according to the current rounding mode.  If `a' is a NaN, the largest
6171 | positive integer is returned.  Otherwise, if the conversion overflows, the
6172 | largest integer with the same sign as `a' is returned.
6173 *----------------------------------------------------------------------------*/
6174 
6175 int64_t float128_to_int64(float128 a, float_status *status)
6176 {
6177     flag aSign;
6178     int32_t aExp, shiftCount;
6179     uint64_t aSig0, aSig1;
6180 
6181     aSig1 = extractFloat128Frac1( a );
6182     aSig0 = extractFloat128Frac0( a );
6183     aExp = extractFloat128Exp( a );
6184     aSign = extractFloat128Sign( a );
6185     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6186     shiftCount = 0x402F - aExp;
6187     if ( shiftCount <= 0 ) {
6188         if ( 0x403E < aExp ) {
6189             float_raise(float_flag_invalid, status);
6190             if (    ! aSign
6191                  || (    ( aExp == 0x7FFF )
6192                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
6193                     )
6194                ) {
6195                 return LIT64( 0x7FFFFFFFFFFFFFFF );
6196             }
6197             return (int64_t) LIT64( 0x8000000000000000 );
6198         }
6199         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6200     }
6201     else {
6202         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6203     }
6204     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6205 
6206 }
6207 
6208 /*----------------------------------------------------------------------------
6209 | Returns the result of converting the quadruple-precision floating-point
6210 | value `a' to the 64-bit two's complement integer format.  The conversion
6211 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6212 | Arithmetic, except that the conversion is always rounded toward zero.
6213 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6214 | the conversion overflows, the largest integer with the same sign as `a' is
6215 | returned.
6216 *----------------------------------------------------------------------------*/
6217 
6218 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6219 {
6220     flag aSign;
6221     int32_t aExp, shiftCount;
6222     uint64_t aSig0, aSig1;
6223     int64_t z;
6224 
6225     aSig1 = extractFloat128Frac1( a );
6226     aSig0 = extractFloat128Frac0( a );
6227     aExp = extractFloat128Exp( a );
6228     aSign = extractFloat128Sign( a );
6229     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6230     shiftCount = aExp - 0x402F;
6231     if ( 0 < shiftCount ) {
6232         if ( 0x403E <= aExp ) {
6233             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
6234             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
6235                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
6236                 if (aSig1) {
6237                     status->float_exception_flags |= float_flag_inexact;
6238                 }
6239             }
6240             else {
6241                 float_raise(float_flag_invalid, status);
6242                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6243                     return LIT64( 0x7FFFFFFFFFFFFFFF );
6244                 }
6245             }
6246             return (int64_t) LIT64( 0x8000000000000000 );
6247         }
6248         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6249         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6250             status->float_exception_flags |= float_flag_inexact;
6251         }
6252     }
6253     else {
6254         if ( aExp < 0x3FFF ) {
6255             if ( aExp | aSig0 | aSig1 ) {
6256                 status->float_exception_flags |= float_flag_inexact;
6257             }
6258             return 0;
6259         }
6260         z = aSig0>>( - shiftCount );
6261         if (    aSig1
6262              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6263             status->float_exception_flags |= float_flag_inexact;
6264         }
6265     }
6266     if ( aSign ) z = - z;
6267     return z;
6268 
6269 }
6270 
6271 /*----------------------------------------------------------------------------
6272 | Returns the result of converting the quadruple-precision floating-point value
6273 | `a' to the 64-bit unsigned integer format.  The conversion is
6274 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6275 | Arithmetic---which means in particular that the conversion is rounded
6276 | according to the current rounding mode.  If `a' is a NaN, the largest
6277 | positive integer is returned.  If the conversion overflows, the
6278 | largest unsigned integer is returned.  If 'a' is negative, the value is
6279 | rounded and zero is returned; negative values that do not round to zero
6280 | will raise the inexact exception.
6281 *----------------------------------------------------------------------------*/
6282 
6283 uint64_t float128_to_uint64(float128 a, float_status *status)
6284 {
6285     flag aSign;
6286     int aExp;
6287     int shiftCount;
6288     uint64_t aSig0, aSig1;
6289 
6290     aSig0 = extractFloat128Frac0(a);
6291     aSig1 = extractFloat128Frac1(a);
6292     aExp = extractFloat128Exp(a);
6293     aSign = extractFloat128Sign(a);
6294     if (aSign && (aExp > 0x3FFE)) {
6295         float_raise(float_flag_invalid, status);
6296         if (float128_is_any_nan(a)) {
6297             return LIT64(0xFFFFFFFFFFFFFFFF);
6298         } else {
6299             return 0;
6300         }
6301     }
6302     if (aExp) {
6303         aSig0 |= LIT64(0x0001000000000000);
6304     }
6305     shiftCount = 0x402F - aExp;
6306     if (shiftCount <= 0) {
6307         if (0x403E < aExp) {
6308             float_raise(float_flag_invalid, status);
6309             return LIT64(0xFFFFFFFFFFFFFFFF);
6310         }
6311         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6312     } else {
6313         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6314     }
6315     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6316 }
6317 
6318 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6319 {
6320     uint64_t v;
6321     signed char current_rounding_mode = status->float_rounding_mode;
6322 
6323     set_float_rounding_mode(float_round_to_zero, status);
6324     v = float128_to_uint64(a, status);
6325     set_float_rounding_mode(current_rounding_mode, status);
6326 
6327     return v;
6328 }
6329 
6330 /*----------------------------------------------------------------------------
6331 | Returns the result of converting the quadruple-precision floating-point
6332 | value `a' to the 32-bit unsigned integer format.  The conversion
6333 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6334 | Arithmetic except that the conversion is always rounded toward zero.
6335 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6336 | if the conversion overflows, the largest unsigned integer is returned.
6337 | If 'a' is negative, the value is rounded and zero is returned; negative
6338 | values that do not round to zero will raise the inexact exception.
6339 *----------------------------------------------------------------------------*/
6340 
6341 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6342 {
6343     uint64_t v;
6344     uint32_t res;
6345     int old_exc_flags = get_float_exception_flags(status);
6346 
6347     v = float128_to_uint64_round_to_zero(a, status);
6348     if (v > 0xffffffff) {
6349         res = 0xffffffff;
6350     } else {
6351         return v;
6352     }
6353     set_float_exception_flags(old_exc_flags, status);
6354     float_raise(float_flag_invalid, status);
6355     return res;
6356 }
6357 
6358 /*----------------------------------------------------------------------------
6359 | Returns the result of converting the quadruple-precision floating-point
6360 | value `a' to the single-precision floating-point format.  The conversion
6361 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6362 | Arithmetic.
6363 *----------------------------------------------------------------------------*/
6364 
6365 float32 float128_to_float32(float128 a, float_status *status)
6366 {
6367     flag aSign;
6368     int32_t aExp;
6369     uint64_t aSig0, aSig1;
6370     uint32_t zSig;
6371 
6372     aSig1 = extractFloat128Frac1( a );
6373     aSig0 = extractFloat128Frac0( a );
6374     aExp = extractFloat128Exp( a );
6375     aSign = extractFloat128Sign( a );
6376     if ( aExp == 0x7FFF ) {
6377         if ( aSig0 | aSig1 ) {
6378             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6379         }
6380         return packFloat32( aSign, 0xFF, 0 );
6381     }
6382     aSig0 |= ( aSig1 != 0 );
6383     shift64RightJamming( aSig0, 18, &aSig0 );
6384     zSig = aSig0;
6385     if ( aExp || zSig ) {
6386         zSig |= 0x40000000;
6387         aExp -= 0x3F81;
6388     }
6389     return roundAndPackFloat32(aSign, aExp, zSig, status);
6390 
6391 }
6392 
6393 /*----------------------------------------------------------------------------
6394 | Returns the result of converting the quadruple-precision floating-point
6395 | value `a' to the double-precision floating-point format.  The conversion
6396 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6397 | Arithmetic.
6398 *----------------------------------------------------------------------------*/
6399 
6400 float64 float128_to_float64(float128 a, float_status *status)
6401 {
6402     flag aSign;
6403     int32_t aExp;
6404     uint64_t aSig0, aSig1;
6405 
6406     aSig1 = extractFloat128Frac1( a );
6407     aSig0 = extractFloat128Frac0( a );
6408     aExp = extractFloat128Exp( a );
6409     aSign = extractFloat128Sign( a );
6410     if ( aExp == 0x7FFF ) {
6411         if ( aSig0 | aSig1 ) {
6412             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6413         }
6414         return packFloat64( aSign, 0x7FF, 0 );
6415     }
6416     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6417     aSig0 |= ( aSig1 != 0 );
6418     if ( aExp || aSig0 ) {
6419         aSig0 |= LIT64( 0x4000000000000000 );
6420         aExp -= 0x3C01;
6421     }
6422     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6423 
6424 }
6425 
6426 /*----------------------------------------------------------------------------
6427 | Returns the result of converting the quadruple-precision floating-point
6428 | value `a' to the extended double-precision floating-point format.  The
6429 | conversion is performed according to the IEC/IEEE Standard for Binary
6430 | Floating-Point Arithmetic.
6431 *----------------------------------------------------------------------------*/
6432 
6433 floatx80 float128_to_floatx80(float128 a, float_status *status)
6434 {
6435     flag aSign;
6436     int32_t aExp;
6437     uint64_t aSig0, aSig1;
6438 
6439     aSig1 = extractFloat128Frac1( a );
6440     aSig0 = extractFloat128Frac0( a );
6441     aExp = extractFloat128Exp( a );
6442     aSign = extractFloat128Sign( a );
6443     if ( aExp == 0x7FFF ) {
6444         if ( aSig0 | aSig1 ) {
6445             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6446         }
6447         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6448     }
6449     if ( aExp == 0 ) {
6450         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6451         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6452     }
6453     else {
6454         aSig0 |= LIT64( 0x0001000000000000 );
6455     }
6456     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6457     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6458 
6459 }
6460 
6461 /*----------------------------------------------------------------------------
6462 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6463 | returns the result as a quadruple-precision floating-point value.  The
6464 | operation is performed according to the IEC/IEEE Standard for Binary
6465 | Floating-Point Arithmetic.
6466 *----------------------------------------------------------------------------*/
6467 
6468 float128 float128_round_to_int(float128 a, float_status *status)
6469 {
6470     flag aSign;
6471     int32_t aExp;
6472     uint64_t lastBitMask, roundBitsMask;
6473     float128 z;
6474 
6475     aExp = extractFloat128Exp( a );
6476     if ( 0x402F <= aExp ) {
6477         if ( 0x406F <= aExp ) {
6478             if (    ( aExp == 0x7FFF )
6479                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6480                ) {
6481                 return propagateFloat128NaN(a, a, status);
6482             }
6483             return a;
6484         }
6485         lastBitMask = 1;
6486         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6487         roundBitsMask = lastBitMask - 1;
6488         z = a;
6489         switch (status->float_rounding_mode) {
6490         case float_round_nearest_even:
6491             if ( lastBitMask ) {
6492                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6493                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6494             }
6495             else {
6496                 if ( (int64_t) z.low < 0 ) {
6497                     ++z.high;
6498                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6499                 }
6500             }
6501             break;
6502         case float_round_ties_away:
6503             if (lastBitMask) {
6504                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6505             } else {
6506                 if ((int64_t) z.low < 0) {
6507                     ++z.high;
6508                 }
6509             }
6510             break;
6511         case float_round_to_zero:
6512             break;
6513         case float_round_up:
6514             if (!extractFloat128Sign(z)) {
6515                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6516             }
6517             break;
6518         case float_round_down:
6519             if (extractFloat128Sign(z)) {
6520                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6521             }
6522             break;
6523         default:
6524             abort();
6525         }
6526         z.low &= ~ roundBitsMask;
6527     }
6528     else {
6529         if ( aExp < 0x3FFF ) {
6530             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6531             status->float_exception_flags |= float_flag_inexact;
6532             aSign = extractFloat128Sign( a );
6533             switch (status->float_rounding_mode) {
6534              case float_round_nearest_even:
6535                 if (    ( aExp == 0x3FFE )
6536                      && (   extractFloat128Frac0( a )
6537                           | extractFloat128Frac1( a ) )
6538                    ) {
6539                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6540                 }
6541                 break;
6542             case float_round_ties_away:
6543                 if (aExp == 0x3FFE) {
6544                     return packFloat128(aSign, 0x3FFF, 0, 0);
6545                 }
6546                 break;
6547              case float_round_down:
6548                 return
6549                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6550                     : packFloat128( 0, 0, 0, 0 );
6551              case float_round_up:
6552                 return
6553                       aSign ? packFloat128( 1, 0, 0, 0 )
6554                     : packFloat128( 0, 0x3FFF, 0, 0 );
6555             }
6556             return packFloat128( aSign, 0, 0, 0 );
6557         }
6558         lastBitMask = 1;
6559         lastBitMask <<= 0x402F - aExp;
6560         roundBitsMask = lastBitMask - 1;
6561         z.low = 0;
6562         z.high = a.high;
6563         switch (status->float_rounding_mode) {
6564         case float_round_nearest_even:
6565             z.high += lastBitMask>>1;
6566             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6567                 z.high &= ~ lastBitMask;
6568             }
6569             break;
6570         case float_round_ties_away:
6571             z.high += lastBitMask>>1;
6572             break;
6573         case float_round_to_zero:
6574             break;
6575         case float_round_up:
6576             if (!extractFloat128Sign(z)) {
6577                 z.high |= ( a.low != 0 );
6578                 z.high += roundBitsMask;
6579             }
6580             break;
6581         case float_round_down:
6582             if (extractFloat128Sign(z)) {
6583                 z.high |= (a.low != 0);
6584                 z.high += roundBitsMask;
6585             }
6586             break;
6587         default:
6588             abort();
6589         }
6590         z.high &= ~ roundBitsMask;
6591     }
6592     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6593         status->float_exception_flags |= float_flag_inexact;
6594     }
6595     return z;
6596 
6597 }
6598 
6599 /*----------------------------------------------------------------------------
6600 | Returns the result of adding the absolute values of the quadruple-precision
6601 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6602 | before being returned.  `zSign' is ignored if the result is a NaN.
6603 | The addition is performed according to the IEC/IEEE Standard for Binary
6604 | Floating-Point Arithmetic.
6605 *----------------------------------------------------------------------------*/
6606 
6607 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6608                                 float_status *status)
6609 {
6610     int32_t aExp, bExp, zExp;
6611     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6612     int32_t expDiff;
6613 
6614     aSig1 = extractFloat128Frac1( a );
6615     aSig0 = extractFloat128Frac0( a );
6616     aExp = extractFloat128Exp( a );
6617     bSig1 = extractFloat128Frac1( b );
6618     bSig0 = extractFloat128Frac0( b );
6619     bExp = extractFloat128Exp( b );
6620     expDiff = aExp - bExp;
6621     if ( 0 < expDiff ) {
6622         if ( aExp == 0x7FFF ) {
6623             if (aSig0 | aSig1) {
6624                 return propagateFloat128NaN(a, b, status);
6625             }
6626             return a;
6627         }
6628         if ( bExp == 0 ) {
6629             --expDiff;
6630         }
6631         else {
6632             bSig0 |= LIT64( 0x0001000000000000 );
6633         }
6634         shift128ExtraRightJamming(
6635             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6636         zExp = aExp;
6637     }
6638     else if ( expDiff < 0 ) {
6639         if ( bExp == 0x7FFF ) {
6640             if (bSig0 | bSig1) {
6641                 return propagateFloat128NaN(a, b, status);
6642             }
6643             return packFloat128( zSign, 0x7FFF, 0, 0 );
6644         }
6645         if ( aExp == 0 ) {
6646             ++expDiff;
6647         }
6648         else {
6649             aSig0 |= LIT64( 0x0001000000000000 );
6650         }
6651         shift128ExtraRightJamming(
6652             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6653         zExp = bExp;
6654     }
6655     else {
6656         if ( aExp == 0x7FFF ) {
6657             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6658                 return propagateFloat128NaN(a, b, status);
6659             }
6660             return a;
6661         }
6662         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6663         if ( aExp == 0 ) {
6664             if (status->flush_to_zero) {
6665                 if (zSig0 | zSig1) {
6666                     float_raise(float_flag_output_denormal, status);
6667                 }
6668                 return packFloat128(zSign, 0, 0, 0);
6669             }
6670             return packFloat128( zSign, 0, zSig0, zSig1 );
6671         }
6672         zSig2 = 0;
6673         zSig0 |= LIT64( 0x0002000000000000 );
6674         zExp = aExp;
6675         goto shiftRight1;
6676     }
6677     aSig0 |= LIT64( 0x0001000000000000 );
6678     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6679     --zExp;
6680     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6681     ++zExp;
6682  shiftRight1:
6683     shift128ExtraRightJamming(
6684         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6685  roundAndPack:
6686     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6687 
6688 }
6689 
6690 /*----------------------------------------------------------------------------
6691 | Returns the result of subtracting the absolute values of the quadruple-
6692 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6693 | difference is negated before being returned.  `zSign' is ignored if the
6694 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6695 | Standard for Binary Floating-Point Arithmetic.
6696 *----------------------------------------------------------------------------*/
6697 
6698 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6699                                 float_status *status)
6700 {
6701     int32_t aExp, bExp, zExp;
6702     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6703     int32_t expDiff;
6704 
6705     aSig1 = extractFloat128Frac1( a );
6706     aSig0 = extractFloat128Frac0( a );
6707     aExp = extractFloat128Exp( a );
6708     bSig1 = extractFloat128Frac1( b );
6709     bSig0 = extractFloat128Frac0( b );
6710     bExp = extractFloat128Exp( b );
6711     expDiff = aExp - bExp;
6712     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6713     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6714     if ( 0 < expDiff ) goto aExpBigger;
6715     if ( expDiff < 0 ) goto bExpBigger;
6716     if ( aExp == 0x7FFF ) {
6717         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6718             return propagateFloat128NaN(a, b, status);
6719         }
6720         float_raise(float_flag_invalid, status);
6721         return float128_default_nan(status);
6722     }
6723     if ( aExp == 0 ) {
6724         aExp = 1;
6725         bExp = 1;
6726     }
6727     if ( bSig0 < aSig0 ) goto aBigger;
6728     if ( aSig0 < bSig0 ) goto bBigger;
6729     if ( bSig1 < aSig1 ) goto aBigger;
6730     if ( aSig1 < bSig1 ) goto bBigger;
6731     return packFloat128(status->float_rounding_mode == float_round_down,
6732                         0, 0, 0);
6733  bExpBigger:
6734     if ( bExp == 0x7FFF ) {
6735         if (bSig0 | bSig1) {
6736             return propagateFloat128NaN(a, b, status);
6737         }
6738         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6739     }
6740     if ( aExp == 0 ) {
6741         ++expDiff;
6742     }
6743     else {
6744         aSig0 |= LIT64( 0x4000000000000000 );
6745     }
6746     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6747     bSig0 |= LIT64( 0x4000000000000000 );
6748  bBigger:
6749     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6750     zExp = bExp;
6751     zSign ^= 1;
6752     goto normalizeRoundAndPack;
6753  aExpBigger:
6754     if ( aExp == 0x7FFF ) {
6755         if (aSig0 | aSig1) {
6756             return propagateFloat128NaN(a, b, status);
6757         }
6758         return a;
6759     }
6760     if ( bExp == 0 ) {
6761         --expDiff;
6762     }
6763     else {
6764         bSig0 |= LIT64( 0x4000000000000000 );
6765     }
6766     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6767     aSig0 |= LIT64( 0x4000000000000000 );
6768  aBigger:
6769     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6770     zExp = aExp;
6771  normalizeRoundAndPack:
6772     --zExp;
6773     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6774                                          status);
6775 
6776 }
6777 
6778 /*----------------------------------------------------------------------------
6779 | Returns the result of adding the quadruple-precision floating-point values
6780 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6781 | for Binary Floating-Point Arithmetic.
6782 *----------------------------------------------------------------------------*/
6783 
6784 float128 float128_add(float128 a, float128 b, float_status *status)
6785 {
6786     flag aSign, bSign;
6787 
6788     aSign = extractFloat128Sign( a );
6789     bSign = extractFloat128Sign( b );
6790     if ( aSign == bSign ) {
6791         return addFloat128Sigs(a, b, aSign, status);
6792     }
6793     else {
6794         return subFloat128Sigs(a, b, aSign, status);
6795     }
6796 
6797 }
6798 
6799 /*----------------------------------------------------------------------------
6800 | Returns the result of subtracting the quadruple-precision floating-point
6801 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6802 | Standard for Binary Floating-Point Arithmetic.
6803 *----------------------------------------------------------------------------*/
6804 
6805 float128 float128_sub(float128 a, float128 b, float_status *status)
6806 {
6807     flag aSign, bSign;
6808 
6809     aSign = extractFloat128Sign( a );
6810     bSign = extractFloat128Sign( b );
6811     if ( aSign == bSign ) {
6812         return subFloat128Sigs(a, b, aSign, status);
6813     }
6814     else {
6815         return addFloat128Sigs(a, b, aSign, status);
6816     }
6817 
6818 }
6819 
6820 /*----------------------------------------------------------------------------
6821 | Returns the result of multiplying the quadruple-precision floating-point
6822 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6823 | Standard for Binary Floating-Point Arithmetic.
6824 *----------------------------------------------------------------------------*/
6825 
6826 float128 float128_mul(float128 a, float128 b, float_status *status)
6827 {
6828     flag aSign, bSign, zSign;
6829     int32_t aExp, bExp, zExp;
6830     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6831 
6832     aSig1 = extractFloat128Frac1( a );
6833     aSig0 = extractFloat128Frac0( a );
6834     aExp = extractFloat128Exp( a );
6835     aSign = extractFloat128Sign( a );
6836     bSig1 = extractFloat128Frac1( b );
6837     bSig0 = extractFloat128Frac0( b );
6838     bExp = extractFloat128Exp( b );
6839     bSign = extractFloat128Sign( b );
6840     zSign = aSign ^ bSign;
6841     if ( aExp == 0x7FFF ) {
6842         if (    ( aSig0 | aSig1 )
6843              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6844             return propagateFloat128NaN(a, b, status);
6845         }
6846         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6847         return packFloat128( zSign, 0x7FFF, 0, 0 );
6848     }
6849     if ( bExp == 0x7FFF ) {
6850         if (bSig0 | bSig1) {
6851             return propagateFloat128NaN(a, b, status);
6852         }
6853         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6854  invalid:
6855             float_raise(float_flag_invalid, status);
6856             return float128_default_nan(status);
6857         }
6858         return packFloat128( zSign, 0x7FFF, 0, 0 );
6859     }
6860     if ( aExp == 0 ) {
6861         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6862         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6863     }
6864     if ( bExp == 0 ) {
6865         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6866         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6867     }
6868     zExp = aExp + bExp - 0x4000;
6869     aSig0 |= LIT64( 0x0001000000000000 );
6870     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6871     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6872     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6873     zSig2 |= ( zSig3 != 0 );
6874     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6875         shift128ExtraRightJamming(
6876             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6877         ++zExp;
6878     }
6879     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6880 
6881 }
6882 
6883 /*----------------------------------------------------------------------------
6884 | Returns the result of dividing the quadruple-precision floating-point value
6885 | `a' by the corresponding value `b'.  The operation is performed according to
6886 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6887 *----------------------------------------------------------------------------*/
6888 
6889 float128 float128_div(float128 a, float128 b, float_status *status)
6890 {
6891     flag aSign, bSign, zSign;
6892     int32_t aExp, bExp, zExp;
6893     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6894     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6895 
6896     aSig1 = extractFloat128Frac1( a );
6897     aSig0 = extractFloat128Frac0( a );
6898     aExp = extractFloat128Exp( a );
6899     aSign = extractFloat128Sign( a );
6900     bSig1 = extractFloat128Frac1( b );
6901     bSig0 = extractFloat128Frac0( b );
6902     bExp = extractFloat128Exp( b );
6903     bSign = extractFloat128Sign( b );
6904     zSign = aSign ^ bSign;
6905     if ( aExp == 0x7FFF ) {
6906         if (aSig0 | aSig1) {
6907             return propagateFloat128NaN(a, b, status);
6908         }
6909         if ( bExp == 0x7FFF ) {
6910             if (bSig0 | bSig1) {
6911                 return propagateFloat128NaN(a, b, status);
6912             }
6913             goto invalid;
6914         }
6915         return packFloat128( zSign, 0x7FFF, 0, 0 );
6916     }
6917     if ( bExp == 0x7FFF ) {
6918         if (bSig0 | bSig1) {
6919             return propagateFloat128NaN(a, b, status);
6920         }
6921         return packFloat128( zSign, 0, 0, 0 );
6922     }
6923     if ( bExp == 0 ) {
6924         if ( ( bSig0 | bSig1 ) == 0 ) {
6925             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6926  invalid:
6927                 float_raise(float_flag_invalid, status);
6928                 return float128_default_nan(status);
6929             }
6930             float_raise(float_flag_divbyzero, status);
6931             return packFloat128( zSign, 0x7FFF, 0, 0 );
6932         }
6933         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6934     }
6935     if ( aExp == 0 ) {
6936         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6937         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6938     }
6939     zExp = aExp - bExp + 0x3FFD;
6940     shortShift128Left(
6941         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6942     shortShift128Left(
6943         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6944     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6945         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6946         ++zExp;
6947     }
6948     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6949     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6950     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6951     while ( (int64_t) rem0 < 0 ) {
6952         --zSig0;
6953         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6954     }
6955     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6956     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6957         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6958         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6959         while ( (int64_t) rem1 < 0 ) {
6960             --zSig1;
6961             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6962         }
6963         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6964     }
6965     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6966     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6967 
6968 }
6969 
6970 /*----------------------------------------------------------------------------
6971 | Returns the remainder of the quadruple-precision floating-point value `a'
6972 | with respect to the corresponding value `b'.  The operation is performed
6973 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6974 *----------------------------------------------------------------------------*/
6975 
6976 float128 float128_rem(float128 a, float128 b, float_status *status)
6977 {
6978     flag aSign, zSign;
6979     int32_t aExp, bExp, expDiff;
6980     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6981     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6982     int64_t sigMean0;
6983 
6984     aSig1 = extractFloat128Frac1( a );
6985     aSig0 = extractFloat128Frac0( a );
6986     aExp = extractFloat128Exp( a );
6987     aSign = extractFloat128Sign( a );
6988     bSig1 = extractFloat128Frac1( b );
6989     bSig0 = extractFloat128Frac0( b );
6990     bExp = extractFloat128Exp( b );
6991     if ( aExp == 0x7FFF ) {
6992         if (    ( aSig0 | aSig1 )
6993              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6994             return propagateFloat128NaN(a, b, status);
6995         }
6996         goto invalid;
6997     }
6998     if ( bExp == 0x7FFF ) {
6999         if (bSig0 | bSig1) {
7000             return propagateFloat128NaN(a, b, status);
7001         }
7002         return a;
7003     }
7004     if ( bExp == 0 ) {
7005         if ( ( bSig0 | bSig1 ) == 0 ) {
7006  invalid:
7007             float_raise(float_flag_invalid, status);
7008             return float128_default_nan(status);
7009         }
7010         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7011     }
7012     if ( aExp == 0 ) {
7013         if ( ( aSig0 | aSig1 ) == 0 ) return a;
7014         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7015     }
7016     expDiff = aExp - bExp;
7017     if ( expDiff < -1 ) return a;
7018     shortShift128Left(
7019         aSig0 | LIT64( 0x0001000000000000 ),
7020         aSig1,
7021         15 - ( expDiff < 0 ),
7022         &aSig0,
7023         &aSig1
7024     );
7025     shortShift128Left(
7026         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
7027     q = le128( bSig0, bSig1, aSig0, aSig1 );
7028     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7029     expDiff -= 64;
7030     while ( 0 < expDiff ) {
7031         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7032         q = ( 4 < q ) ? q - 4 : 0;
7033         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7034         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
7035         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
7036         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
7037         expDiff -= 61;
7038     }
7039     if ( -64 < expDiff ) {
7040         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7041         q = ( 4 < q ) ? q - 4 : 0;
7042         q >>= - expDiff;
7043         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7044         expDiff += 52;
7045         if ( expDiff < 0 ) {
7046             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7047         }
7048         else {
7049             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
7050         }
7051         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7052         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
7053     }
7054     else {
7055         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
7056         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7057     }
7058     do {
7059         alternateASig0 = aSig0;
7060         alternateASig1 = aSig1;
7061         ++q;
7062         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7063     } while ( 0 <= (int64_t) aSig0 );
7064     add128(
7065         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
7066     if (    ( sigMean0 < 0 )
7067          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
7068         aSig0 = alternateASig0;
7069         aSig1 = alternateASig1;
7070     }
7071     zSign = ( (int64_t) aSig0 < 0 );
7072     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
7073     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
7074                                          status);
7075 }
7076 
7077 /*----------------------------------------------------------------------------
7078 | Returns the square root of the quadruple-precision floating-point value `a'.
7079 | The operation is performed according to the IEC/IEEE Standard for Binary
7080 | Floating-Point Arithmetic.
7081 *----------------------------------------------------------------------------*/
7082 
7083 float128 float128_sqrt(float128 a, float_status *status)
7084 {
7085     flag aSign;
7086     int32_t aExp, zExp;
7087     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
7088     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7089 
7090     aSig1 = extractFloat128Frac1( a );
7091     aSig0 = extractFloat128Frac0( a );
7092     aExp = extractFloat128Exp( a );
7093     aSign = extractFloat128Sign( a );
7094     if ( aExp == 0x7FFF ) {
7095         if (aSig0 | aSig1) {
7096             return propagateFloat128NaN(a, a, status);
7097         }
7098         if ( ! aSign ) return a;
7099         goto invalid;
7100     }
7101     if ( aSign ) {
7102         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
7103  invalid:
7104         float_raise(float_flag_invalid, status);
7105         return float128_default_nan(status);
7106     }
7107     if ( aExp == 0 ) {
7108         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7109         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7110     }
7111     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7112     aSig0 |= LIT64( 0x0001000000000000 );
7113     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7114     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7115     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7116     doubleZSig0 = zSig0<<1;
7117     mul64To128( zSig0, zSig0, &term0, &term1 );
7118     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7119     while ( (int64_t) rem0 < 0 ) {
7120         --zSig0;
7121         doubleZSig0 -= 2;
7122         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7123     }
7124     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7125     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7126         if ( zSig1 == 0 ) zSig1 = 1;
7127         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7128         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7129         mul64To128( zSig1, zSig1, &term2, &term3 );
7130         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7131         while ( (int64_t) rem1 < 0 ) {
7132             --zSig1;
7133             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7134             term3 |= 1;
7135             term2 |= doubleZSig0;
7136             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7137         }
7138         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7139     }
7140     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7141     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7142 
7143 }
7144 
7145 /*----------------------------------------------------------------------------
7146 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7147 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7148 | raised if either operand is a NaN.  Otherwise, the comparison is performed
7149 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7150 *----------------------------------------------------------------------------*/
7151 
7152 int float128_eq(float128 a, float128 b, float_status *status)
7153 {
7154 
7155     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7156               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7157          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7158               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7159        ) {
7160         float_raise(float_flag_invalid, status);
7161         return 0;
7162     }
7163     return
7164            ( a.low == b.low )
7165         && (    ( a.high == b.high )
7166              || (    ( a.low == 0 )
7167                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7168            );
7169 
7170 }
7171 
7172 /*----------------------------------------------------------------------------
7173 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7174 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
7175 | exception is raised if either operand is a NaN.  The comparison is performed
7176 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7177 *----------------------------------------------------------------------------*/
7178 
7179 int float128_le(float128 a, float128 b, float_status *status)
7180 {
7181     flag aSign, bSign;
7182 
7183     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7184               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7185          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7186               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7187        ) {
7188         float_raise(float_flag_invalid, status);
7189         return 0;
7190     }
7191     aSign = extractFloat128Sign( a );
7192     bSign = extractFloat128Sign( b );
7193     if ( aSign != bSign ) {
7194         return
7195                aSign
7196             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7197                  == 0 );
7198     }
7199     return
7200           aSign ? le128( b.high, b.low, a.high, a.low )
7201         : le128( a.high, a.low, b.high, b.low );
7202 
7203 }
7204 
7205 /*----------------------------------------------------------------------------
7206 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7207 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7208 | raised if either operand is a NaN.  The comparison is performed according
7209 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7210 *----------------------------------------------------------------------------*/
7211 
7212 int float128_lt(float128 a, float128 b, float_status *status)
7213 {
7214     flag aSign, bSign;
7215 
7216     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7217               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7218          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7219               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7220        ) {
7221         float_raise(float_flag_invalid, status);
7222         return 0;
7223     }
7224     aSign = extractFloat128Sign( a );
7225     bSign = extractFloat128Sign( b );
7226     if ( aSign != bSign ) {
7227         return
7228                aSign
7229             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7230                  != 0 );
7231     }
7232     return
7233           aSign ? lt128( b.high, b.low, a.high, a.low )
7234         : lt128( a.high, a.low, b.high, b.low );
7235 
7236 }
7237 
7238 /*----------------------------------------------------------------------------
7239 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7240 | be compared, and 0 otherwise.  The invalid exception is raised if either
7241 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7242 | Standard for Binary Floating-Point Arithmetic.
7243 *----------------------------------------------------------------------------*/
7244 
7245 int float128_unordered(float128 a, float128 b, float_status *status)
7246 {
7247     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7248               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7249          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7250               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7251        ) {
7252         float_raise(float_flag_invalid, status);
7253         return 1;
7254     }
7255     return 0;
7256 }
7257 
7258 /*----------------------------------------------------------------------------
7259 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7260 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7261 | exception.  The comparison is performed according to the IEC/IEEE Standard
7262 | for Binary Floating-Point Arithmetic.
7263 *----------------------------------------------------------------------------*/
7264 
7265 int float128_eq_quiet(float128 a, float128 b, float_status *status)
7266 {
7267 
7268     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7269               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7270          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7271               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7272        ) {
7273         if (float128_is_signaling_nan(a, status)
7274          || float128_is_signaling_nan(b, status)) {
7275             float_raise(float_flag_invalid, status);
7276         }
7277         return 0;
7278     }
7279     return
7280            ( a.low == b.low )
7281         && (    ( a.high == b.high )
7282              || (    ( a.low == 0 )
7283                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7284            );
7285 
7286 }
7287 
7288 /*----------------------------------------------------------------------------
7289 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7290 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
7291 | cause an exception.  Otherwise, the comparison is performed according to the
7292 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7293 *----------------------------------------------------------------------------*/
7294 
7295 int float128_le_quiet(float128 a, float128 b, float_status *status)
7296 {
7297     flag aSign, bSign;
7298 
7299     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7300               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7301          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7302               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7303        ) {
7304         if (float128_is_signaling_nan(a, status)
7305          || float128_is_signaling_nan(b, status)) {
7306             float_raise(float_flag_invalid, status);
7307         }
7308         return 0;
7309     }
7310     aSign = extractFloat128Sign( a );
7311     bSign = extractFloat128Sign( b );
7312     if ( aSign != bSign ) {
7313         return
7314                aSign
7315             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7316                  == 0 );
7317     }
7318     return
7319           aSign ? le128( b.high, b.low, a.high, a.low )
7320         : le128( a.high, a.low, b.high, b.low );
7321 
7322 }
7323 
7324 /*----------------------------------------------------------------------------
7325 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7326 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7327 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7328 | Standard for Binary Floating-Point Arithmetic.
7329 *----------------------------------------------------------------------------*/
7330 
7331 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7332 {
7333     flag aSign, bSign;
7334 
7335     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7336               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7337          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7338               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7339        ) {
7340         if (float128_is_signaling_nan(a, status)
7341          || float128_is_signaling_nan(b, status)) {
7342             float_raise(float_flag_invalid, status);
7343         }
7344         return 0;
7345     }
7346     aSign = extractFloat128Sign( a );
7347     bSign = extractFloat128Sign( b );
7348     if ( aSign != bSign ) {
7349         return
7350                aSign
7351             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7352                  != 0 );
7353     }
7354     return
7355           aSign ? lt128( b.high, b.low, a.high, a.low )
7356         : lt128( a.high, a.low, b.high, b.low );
7357 
7358 }
7359 
7360 /*----------------------------------------------------------------------------
7361 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7362 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7363 | comparison is performed according to the IEC/IEEE Standard for Binary
7364 | Floating-Point Arithmetic.
7365 *----------------------------------------------------------------------------*/
7366 
7367 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7368 {
7369     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7370               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7371          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7372               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7373        ) {
7374         if (float128_is_signaling_nan(a, status)
7375          || float128_is_signaling_nan(b, status)) {
7376             float_raise(float_flag_invalid, status);
7377         }
7378         return 1;
7379     }
7380     return 0;
7381 }
7382 
7383 /* misc functions */
7384 float32 uint32_to_float32(uint32_t a, float_status *status)
7385 {
7386     return int64_to_float32(a, status);
7387 }
7388 
7389 float64 uint32_to_float64(uint32_t a, float_status *status)
7390 {
7391     return int64_to_float64(a, status);
7392 }
7393 
7394 uint32_t float32_to_uint32(float32 a, float_status *status)
7395 {
7396     int64_t v;
7397     uint32_t res;
7398     int old_exc_flags = get_float_exception_flags(status);
7399 
7400     v = float32_to_int64(a, status);
7401     if (v < 0) {
7402         res = 0;
7403     } else if (v > 0xffffffff) {
7404         res = 0xffffffff;
7405     } else {
7406         return v;
7407     }
7408     set_float_exception_flags(old_exc_flags, status);
7409     float_raise(float_flag_invalid, status);
7410     return res;
7411 }
7412 
7413 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status)
7414 {
7415     int64_t v;
7416     uint32_t res;
7417     int old_exc_flags = get_float_exception_flags(status);
7418 
7419     v = float32_to_int64_round_to_zero(a, status);
7420     if (v < 0) {
7421         res = 0;
7422     } else if (v > 0xffffffff) {
7423         res = 0xffffffff;
7424     } else {
7425         return v;
7426     }
7427     set_float_exception_flags(old_exc_flags, status);
7428     float_raise(float_flag_invalid, status);
7429     return res;
7430 }
7431 
7432 int16_t float32_to_int16(float32 a, float_status *status)
7433 {
7434     int32_t v;
7435     int16_t res;
7436     int old_exc_flags = get_float_exception_flags(status);
7437 
7438     v = float32_to_int32(a, status);
7439     if (v < -0x8000) {
7440         res = -0x8000;
7441     } else if (v > 0x7fff) {
7442         res = 0x7fff;
7443     } else {
7444         return v;
7445     }
7446 
7447     set_float_exception_flags(old_exc_flags, status);
7448     float_raise(float_flag_invalid, status);
7449     return res;
7450 }
7451 
7452 uint16_t float32_to_uint16(float32 a, float_status *status)
7453 {
7454     int32_t v;
7455     uint16_t res;
7456     int old_exc_flags = get_float_exception_flags(status);
7457 
7458     v = float32_to_int32(a, status);
7459     if (v < 0) {
7460         res = 0;
7461     } else if (v > 0xffff) {
7462         res = 0xffff;
7463     } else {
7464         return v;
7465     }
7466 
7467     set_float_exception_flags(old_exc_flags, status);
7468     float_raise(float_flag_invalid, status);
7469     return res;
7470 }
7471 
7472 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
7473 {
7474     int64_t v;
7475     uint16_t res;
7476     int old_exc_flags = get_float_exception_flags(status);
7477 
7478     v = float32_to_int64_round_to_zero(a, status);
7479     if (v < 0) {
7480         res = 0;
7481     } else if (v > 0xffff) {
7482         res = 0xffff;
7483     } else {
7484         return v;
7485     }
7486     set_float_exception_flags(old_exc_flags, status);
7487     float_raise(float_flag_invalid, status);
7488     return res;
7489 }
7490 
7491 uint32_t float64_to_uint32(float64 a, float_status *status)
7492 {
7493     uint64_t v;
7494     uint32_t res;
7495     int old_exc_flags = get_float_exception_flags(status);
7496 
7497     v = float64_to_uint64(a, status);
7498     if (v > 0xffffffff) {
7499         res = 0xffffffff;
7500     } else {
7501         return v;
7502     }
7503     set_float_exception_flags(old_exc_flags, status);
7504     float_raise(float_flag_invalid, status);
7505     return res;
7506 }
7507 
7508 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status)
7509 {
7510     uint64_t v;
7511     uint32_t res;
7512     int old_exc_flags = get_float_exception_flags(status);
7513 
7514     v = float64_to_uint64_round_to_zero(a, status);
7515     if (v > 0xffffffff) {
7516         res = 0xffffffff;
7517     } else {
7518         return v;
7519     }
7520     set_float_exception_flags(old_exc_flags, status);
7521     float_raise(float_flag_invalid, status);
7522     return res;
7523 }
7524 
7525 int16_t float64_to_int16(float64 a, float_status *status)
7526 {
7527     int64_t v;
7528     int16_t res;
7529     int old_exc_flags = get_float_exception_flags(status);
7530 
7531     v = float64_to_int32(a, status);
7532     if (v < -0x8000) {
7533         res = -0x8000;
7534     } else if (v > 0x7fff) {
7535         res = 0x7fff;
7536     } else {
7537         return v;
7538     }
7539 
7540     set_float_exception_flags(old_exc_flags, status);
7541     float_raise(float_flag_invalid, status);
7542     return res;
7543 }
7544 
7545 uint16_t float64_to_uint16(float64 a, float_status *status)
7546 {
7547     int64_t v;
7548     uint16_t res;
7549     int old_exc_flags = get_float_exception_flags(status);
7550 
7551     v = float64_to_int32(a, status);
7552     if (v < 0) {
7553         res = 0;
7554     } else if (v > 0xffff) {
7555         res = 0xffff;
7556     } else {
7557         return v;
7558     }
7559 
7560     set_float_exception_flags(old_exc_flags, status);
7561     float_raise(float_flag_invalid, status);
7562     return res;
7563 }
7564 
7565 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
7566 {
7567     int64_t v;
7568     uint16_t res;
7569     int old_exc_flags = get_float_exception_flags(status);
7570 
7571     v = float64_to_int64_round_to_zero(a, status);
7572     if (v < 0) {
7573         res = 0;
7574     } else if (v > 0xffff) {
7575         res = 0xffff;
7576     } else {
7577         return v;
7578     }
7579     set_float_exception_flags(old_exc_flags, status);
7580     float_raise(float_flag_invalid, status);
7581     return res;
7582 }
7583 
7584 /*----------------------------------------------------------------------------
7585 | Returns the result of converting the double-precision floating-point value
7586 | `a' to the 64-bit unsigned integer format.  The conversion is
7587 | performed according to the IEC/IEEE Standard for Binary Floating-Point
7588 | Arithmetic---which means in particular that the conversion is rounded
7589 | according to the current rounding mode.  If `a' is a NaN, the largest
7590 | positive integer is returned.  If the conversion overflows, the
7591 | largest unsigned integer is returned.  If 'a' is negative, the value is
7592 | rounded and zero is returned; negative values that do not round to zero
7593 | will raise the inexact exception.
7594 *----------------------------------------------------------------------------*/
7595 
7596 uint64_t float64_to_uint64(float64 a, float_status *status)
7597 {
7598     flag aSign;
7599     int aExp;
7600     int shiftCount;
7601     uint64_t aSig, aSigExtra;
7602     a = float64_squash_input_denormal(a, status);
7603 
7604     aSig = extractFloat64Frac(a);
7605     aExp = extractFloat64Exp(a);
7606     aSign = extractFloat64Sign(a);
7607     if (aSign && (aExp > 1022)) {
7608         float_raise(float_flag_invalid, status);
7609         if (float64_is_any_nan(a)) {
7610             return LIT64(0xFFFFFFFFFFFFFFFF);
7611         } else {
7612             return 0;
7613         }
7614     }
7615     if (aExp) {
7616         aSig |= LIT64(0x0010000000000000);
7617     }
7618     shiftCount = 0x433 - aExp;
7619     if (shiftCount <= 0) {
7620         if (0x43E < aExp) {
7621             float_raise(float_flag_invalid, status);
7622             return LIT64(0xFFFFFFFFFFFFFFFF);
7623         }
7624         aSigExtra = 0;
7625         aSig <<= -shiftCount;
7626     } else {
7627         shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7628     }
7629     return roundAndPackUint64(aSign, aSig, aSigExtra, status);
7630 }
7631 
7632 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
7633 {
7634     signed char current_rounding_mode = status->float_rounding_mode;
7635     set_float_rounding_mode(float_round_to_zero, status);
7636     uint64_t v = float64_to_uint64(a, status);
7637     set_float_rounding_mode(current_rounding_mode, status);
7638     return v;
7639 }
7640 
7641 #define COMPARE(s, nan_exp)                                                  \
7642 static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7643                                       int is_quiet, float_status *status)    \
7644 {                                                                            \
7645     flag aSign, bSign;                                                       \
7646     uint ## s ## _t av, bv;                                                  \
7647     a = float ## s ## _squash_input_denormal(a, status);                     \
7648     b = float ## s ## _squash_input_denormal(b, status);                     \
7649                                                                              \
7650     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
7651          extractFloat ## s ## Frac( a ) ) ||                                 \
7652         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
7653           extractFloat ## s ## Frac( b ) )) {                                \
7654         if (!is_quiet ||                                                     \
7655             float ## s ## _is_signaling_nan(a, status) ||                  \
7656             float ## s ## _is_signaling_nan(b, status)) {                 \
7657             float_raise(float_flag_invalid, status);                         \
7658         }                                                                    \
7659         return float_relation_unordered;                                     \
7660     }                                                                        \
7661     aSign = extractFloat ## s ## Sign( a );                                  \
7662     bSign = extractFloat ## s ## Sign( b );                                  \
7663     av = float ## s ## _val(a);                                              \
7664     bv = float ## s ## _val(b);                                              \
7665     if ( aSign != bSign ) {                                                  \
7666         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
7667             /* zero case */                                                  \
7668             return float_relation_equal;                                     \
7669         } else {                                                             \
7670             return 1 - (2 * aSign);                                          \
7671         }                                                                    \
7672     } else {                                                                 \
7673         if (av == bv) {                                                      \
7674             return float_relation_equal;                                     \
7675         } else {                                                             \
7676             return 1 - 2 * (aSign ^ ( av < bv ));                            \
7677         }                                                                    \
7678     }                                                                        \
7679 }                                                                            \
7680                                                                              \
7681 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
7682 {                                                                            \
7683     return float ## s ## _compare_internal(a, b, 0, status);                 \
7684 }                                                                            \
7685                                                                              \
7686 int float ## s ## _compare_quiet(float ## s a, float ## s b,                 \
7687                                  float_status *status)                       \
7688 {                                                                            \
7689     return float ## s ## _compare_internal(a, b, 1, status);                 \
7690 }
7691 
7692 COMPARE(32, 0xff)
7693 COMPARE(64, 0x7ff)
7694 
7695 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7696                                             int is_quiet, float_status *status)
7697 {
7698     flag aSign, bSign;
7699 
7700     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7701         float_raise(float_flag_invalid, status);
7702         return float_relation_unordered;
7703     }
7704     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7705           ( extractFloatx80Frac( a )<<1 ) ) ||
7706         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7707           ( extractFloatx80Frac( b )<<1 ) )) {
7708         if (!is_quiet ||
7709             floatx80_is_signaling_nan(a, status) ||
7710             floatx80_is_signaling_nan(b, status)) {
7711             float_raise(float_flag_invalid, status);
7712         }
7713         return float_relation_unordered;
7714     }
7715     aSign = extractFloatx80Sign( a );
7716     bSign = extractFloatx80Sign( b );
7717     if ( aSign != bSign ) {
7718 
7719         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7720              ( ( a.low | b.low ) == 0 ) ) {
7721             /* zero case */
7722             return float_relation_equal;
7723         } else {
7724             return 1 - (2 * aSign);
7725         }
7726     } else {
7727         if (a.low == b.low && a.high == b.high) {
7728             return float_relation_equal;
7729         } else {
7730             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7731         }
7732     }
7733 }
7734 
7735 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7736 {
7737     return floatx80_compare_internal(a, b, 0, status);
7738 }
7739 
7740 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7741 {
7742     return floatx80_compare_internal(a, b, 1, status);
7743 }
7744 
7745 static inline int float128_compare_internal(float128 a, float128 b,
7746                                             int is_quiet, float_status *status)
7747 {
7748     flag aSign, bSign;
7749 
7750     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7751           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7752         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7753           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7754         if (!is_quiet ||
7755             float128_is_signaling_nan(a, status) ||
7756             float128_is_signaling_nan(b, status)) {
7757             float_raise(float_flag_invalid, status);
7758         }
7759         return float_relation_unordered;
7760     }
7761     aSign = extractFloat128Sign( a );
7762     bSign = extractFloat128Sign( b );
7763     if ( aSign != bSign ) {
7764         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7765             /* zero case */
7766             return float_relation_equal;
7767         } else {
7768             return 1 - (2 * aSign);
7769         }
7770     } else {
7771         if (a.low == b.low && a.high == b.high) {
7772             return float_relation_equal;
7773         } else {
7774             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7775         }
7776     }
7777 }
7778 
7779 int float128_compare(float128 a, float128 b, float_status *status)
7780 {
7781     return float128_compare_internal(a, b, 0, status);
7782 }
7783 
7784 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7785 {
7786     return float128_compare_internal(a, b, 1, status);
7787 }
7788 
7789 /* min() and max() functions. These can't be implemented as
7790  * 'compare and pick one input' because that would mishandle
7791  * NaNs and +0 vs -0.
7792  *
7793  * minnum() and maxnum() functions. These are similar to the min()
7794  * and max() functions but if one of the arguments is a QNaN and
7795  * the other is numerical then the numerical argument is returned.
 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
7797  * and maxNum() operations. min() and max() are the typical min/max
7798  * semantics provided by many CPUs which predate that specification.
7799  *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from IEEE 754-2008.
7802  */
7803 #define MINMAX(s)                                                       \
7804 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
7805                                                int ismin, int isieee,   \
7806                                                int ismag,               \
7807                                                float_status *status)    \
7808 {                                                                       \
7809     flag aSign, bSign;                                                  \
7810     uint ## s ## _t av, bv, aav, abv;                                   \
7811     a = float ## s ## _squash_input_denormal(a, status);                \
7812     b = float ## s ## _squash_input_denormal(b, status);                \
7813     if (float ## s ## _is_any_nan(a) ||                                 \
7814         float ## s ## _is_any_nan(b)) {                                 \
7815         if (isieee) {                                                   \
7816             if (float ## s ## _is_quiet_nan(a, status) &&               \
7817                 !float ## s ##_is_any_nan(b)) {                         \
7818                 return b;                                               \
7819             } else if (float ## s ## _is_quiet_nan(b, status) &&        \
7820                        !float ## s ## _is_any_nan(a)) {                \
7821                 return a;                                               \
7822             }                                                           \
7823         }                                                               \
7824         return propagateFloat ## s ## NaN(a, b, status);                \
7825     }                                                                   \
7826     aSign = extractFloat ## s ## Sign(a);                               \
7827     bSign = extractFloat ## s ## Sign(b);                               \
7828     av = float ## s ## _val(a);                                         \
7829     bv = float ## s ## _val(b);                                         \
7830     if (ismag) {                                                        \
7831         aav = float ## s ## _abs(av);                                   \
7832         abv = float ## s ## _abs(bv);                                   \
7833         if (aav != abv) {                                               \
7834             if (ismin) {                                                \
7835                 return (aav < abv) ? a : b;                             \
7836             } else {                                                    \
7837                 return (aav < abv) ? b : a;                             \
7838             }                                                           \
7839         }                                                               \
7840     }                                                                   \
7841     if (aSign != bSign) {                                               \
7842         if (ismin) {                                                    \
7843             return aSign ? a : b;                                       \
7844         } else {                                                        \
7845             return aSign ? b : a;                                       \
7846         }                                                               \
7847     } else {                                                            \
7848         if (ismin) {                                                    \
7849             return (aSign ^ (av < bv)) ? a : b;                         \
7850         } else {                                                        \
7851             return (aSign ^ (av < bv)) ? b : a;                         \
7852         }                                                               \
7853     }                                                                   \
7854 }                                                                       \
7855                                                                         \
7856 float ## s float ## s ## _min(float ## s a, float ## s b,               \
7857                               float_status *status)                     \
7858 {                                                                       \
7859     return float ## s ## _minmax(a, b, 1, 0, 0, status);                \
7860 }                                                                       \
7861                                                                         \
7862 float ## s float ## s ## _max(float ## s a, float ## s b,               \
7863                               float_status *status)                     \
7864 {                                                                       \
7865     return float ## s ## _minmax(a, b, 0, 0, 0, status);                \
7866 }                                                                       \
7867                                                                         \
7868 float ## s float ## s ## _minnum(float ## s a, float ## s b,            \
7869                                  float_status *status)                  \
7870 {                                                                       \
7871     return float ## s ## _minmax(a, b, 1, 1, 0, status);                \
7872 }                                                                       \
7873                                                                         \
7874 float ## s float ## s ## _maxnum(float ## s a, float ## s b,            \
7875                                  float_status *status)                  \
7876 {                                                                       \
7877     return float ## s ## _minmax(a, b, 0, 1, 0, status);                \
7878 }                                                                       \
7879                                                                         \
7880 float ## s float ## s ## _minnummag(float ## s a, float ## s b,         \
7881                                     float_status *status)               \
7882 {                                                                       \
7883     return float ## s ## _minmax(a, b, 1, 1, 1, status);                \
7884 }                                                                       \
7885                                                                         \
7886 float ## s float ## s ## _maxnummag(float ## s a, float ## s b,         \
7887                                     float_status *status)               \
7888 {                                                                       \
7889     return float ## s ## _minmax(a, b, 0, 1, 1, status);                \
7890 }
7891 
/*
 * Expand the min/max template for s = 32 and s = 64.  The template text
 * directly above defines, for each width, the public wrappers
 * float<s>_min, float<s>_max, float<s>_minnum, float<s>_maxnum,
 * float<s>_minnummag and float<s>_maxnummag, all of which forward to
 * float<s>_minmax with different flag arguments.
 */
7892 MINMAX(32)
7893 MINMAX(64)
7894 
7895 
7896 /* Multiply A by 2 raised to the power N.  */
7897 float32 float32_scalbn(float32 a, int n, float_status *status)
7898 {
7899     flag aSign;
7900     int16_t aExp;
7901     uint32_t aSig;
7902 
7903     a = float32_squash_input_denormal(a, status);
7904     aSig = extractFloat32Frac( a );
7905     aExp = extractFloat32Exp( a );
7906     aSign = extractFloat32Sign( a );
7907 
7908     if ( aExp == 0xFF ) {
7909         if ( aSig ) {
7910             return propagateFloat32NaN(a, a, status);
7911         }
7912         return a;
7913     }
7914     if (aExp != 0) {
7915         aSig |= 0x00800000;
7916     } else if (aSig == 0) {
7917         return a;
7918     } else {
7919         aExp++;
7920     }
7921 
7922     if (n > 0x200) {
7923         n = 0x200;
7924     } else if (n < -0x200) {
7925         n = -0x200;
7926     }
7927 
7928     aExp += n - 1;
7929     aSig <<= 7;
7930     return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
7931 }
7932 
7933 float64 float64_scalbn(float64 a, int n, float_status *status)
7934 {
7935     flag aSign;
7936     int16_t aExp;
7937     uint64_t aSig;
7938 
7939     a = float64_squash_input_denormal(a, status);
7940     aSig = extractFloat64Frac( a );
7941     aExp = extractFloat64Exp( a );
7942     aSign = extractFloat64Sign( a );
7943 
7944     if ( aExp == 0x7FF ) {
7945         if ( aSig ) {
7946             return propagateFloat64NaN(a, a, status);
7947         }
7948         return a;
7949     }
7950     if (aExp != 0) {
7951         aSig |= LIT64( 0x0010000000000000 );
7952     } else if (aSig == 0) {
7953         return a;
7954     } else {
7955         aExp++;
7956     }
7957 
7958     if (n > 0x1000) {
7959         n = 0x1000;
7960     } else if (n < -0x1000) {
7961         n = -0x1000;
7962     }
7963 
7964     aExp += n - 1;
7965     aSig <<= 10;
7966     return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
7967 }
7968 
7969 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7970 {
7971     flag aSign;
7972     int32_t aExp;
7973     uint64_t aSig;
7974 
7975     if (floatx80_invalid_encoding(a)) {
7976         float_raise(float_flag_invalid, status);
7977         return floatx80_default_nan(status);
7978     }
7979     aSig = extractFloatx80Frac( a );
7980     aExp = extractFloatx80Exp( a );
7981     aSign = extractFloatx80Sign( a );
7982 
7983     if ( aExp == 0x7FFF ) {
7984         if ( aSig<<1 ) {
7985             return propagateFloatx80NaN(a, a, status);
7986         }
7987         return a;
7988     }
7989 
7990     if (aExp == 0) {
7991         if (aSig == 0) {
7992             return a;
7993         }
7994         aExp++;
7995     }
7996 
7997     if (n > 0x10000) {
7998         n = 0x10000;
7999     } else if (n < -0x10000) {
8000         n = -0x10000;
8001     }
8002 
8003     aExp += n;
8004     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
8005                                          aSign, aExp, aSig, 0, status);
8006 }
8007 
8008 float128 float128_scalbn(float128 a, int n, float_status *status)
8009 {
8010     flag aSign;
8011     int32_t aExp;
8012     uint64_t aSig0, aSig1;
8013 
8014     aSig1 = extractFloat128Frac1( a );
8015     aSig0 = extractFloat128Frac0( a );
8016     aExp = extractFloat128Exp( a );
8017     aSign = extractFloat128Sign( a );
8018     if ( aExp == 0x7FFF ) {
8019         if ( aSig0 | aSig1 ) {
8020             return propagateFloat128NaN(a, a, status);
8021         }
8022         return a;
8023     }
8024     if (aExp != 0) {
8025         aSig0 |= LIT64( 0x0001000000000000 );
8026     } else if (aSig0 == 0 && aSig1 == 0) {
8027         return a;
8028     } else {
8029         aExp++;
8030     }
8031 
8032     if (n > 0x10000) {
8033         n = 0x10000;
8034     } else if (n < -0x10000) {
8035         n = -0x10000;
8036     }
8037 
8038     aExp += n - 1;
8039     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
8040                                          , status);
8041 
8042 }
8043