xref: /qemu/fpu/softfloat.c (revision cf07323d494f4bc225e405688c2e455c3423cc40)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
88 
89 /* We only need stdlib for abort() */
90 
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations.  (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "softfloat-macros.h"
97 
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine:  (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output.  These details are target-
104 | specific.
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
107 
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
111 
112 static inline uint32_t extractFloat16Frac(float16 a)
113 {
114     return float16_val(a) & 0x3ff;
115 }
116 
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
120 
121 static inline int extractFloat16Exp(float16 a)
122 {
123     return (float16_val(a) >> 10) & 0x1f;
124 }
125 
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the half-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
129 
130 static inline flag extractFloat16Sign(float16 a)
131 {
132     return float16_val(a)>>15;
133 }
134 
135 /*----------------------------------------------------------------------------
136 | Returns the fraction bits of the single-precision floating-point value `a'.
137 *----------------------------------------------------------------------------*/
138 
139 static inline uint32_t extractFloat32Frac(float32 a)
140 {
141     return float32_val(a) & 0x007FFFFF;
142 }
143 
144 /*----------------------------------------------------------------------------
145 | Returns the exponent bits of the single-precision floating-point value `a'.
146 *----------------------------------------------------------------------------*/
147 
148 static inline int extractFloat32Exp(float32 a)
149 {
150     return (float32_val(a) >> 23) & 0xFF;
151 }
152 
153 /*----------------------------------------------------------------------------
154 | Returns the sign bit of the single-precision floating-point value `a'.
155 *----------------------------------------------------------------------------*/
156 
157 static inline flag extractFloat32Sign(float32 a)
158 {
159     return float32_val(a) >> 31;
160 }
161 
162 /*----------------------------------------------------------------------------
163 | Returns the fraction bits of the double-precision floating-point value `a'.
164 *----------------------------------------------------------------------------*/
165 
166 static inline uint64_t extractFloat64Frac(float64 a)
167 {
168     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
169 }
170 
171 /*----------------------------------------------------------------------------
172 | Returns the exponent bits of the double-precision floating-point value `a'.
173 *----------------------------------------------------------------------------*/
174 
175 static inline int extractFloat64Exp(float64 a)
176 {
177     return (float64_val(a) >> 52) & 0x7FF;
178 }
179 
180 /*----------------------------------------------------------------------------
181 | Returns the sign bit of the double-precision floating-point value `a'.
182 *----------------------------------------------------------------------------*/
183 
184 static inline flag extractFloat64Sign(float64 a)
185 {
186     return float64_val(a) >> 63;
187 }
188 
189 /*
190  * Classify a floating point number. Everything above float_class_qnan
191  * is a NaN so cls >= float_class_qnan is any NaN.
192  */
193 
typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified = 0, /* raw fields, not yet canonicalized */
    float_class_zero         = 1,
    float_class_normal       = 2,
    float_class_inf          = 3,
    /* Every value from here on is a NaN; is_nan() relies on this order. */
    float_class_qnan         = 4,
    float_class_snan         = 5,
    float_class_dnan         = 6, /* packed as the default NaN */
    float_class_msnan        = 7, /* maybe silenced when packed */
} FloatClass;
204 
205 /*
206  * Structure holding all of the decomposed parts of a float. The
207  * exponent is unbiased and the fraction is normalized. All
208  * calculations are done with a 64 bit fraction and then rounded as
209  * appropriate for the final format.
210  *
211  * Thanks to the packed FloatClass a decent compiler should be able to
212  * fit the whole structure into registers and avoid using the stack
213  * for parameter passing.
214  */
215 
216 typedef struct {
217     uint64_t frac;
218     int32_t  exp;
219     FloatClass cls;
220     bool sign;
221 } FloatParts;
222 
/*
 * FloatParts.frac is a fixed-point value whose binary point sits at bit
 * DECOMPOSED_BINARY_POINT.  A normalised fraction has
 * DECOMPOSED_IMPLICIT_BIT set; the bit above it, DECOMPOSED_OVERFLOW_BIT
 * (bit 63), is tested to detect a carry out of the implicit-bit position.
 */
#define DECOMPOSED_BINARY_POINT    (64 - 2)
#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
#define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)

/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the all-ones value of the exponent field (encodes Inf/NaN)
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
} FloatFmt;

/*
 * Expand fields based on the size of exponent (E) and fraction (F).
 * Both arguments are parenthesized at every use so that expressions such
 * as `4 + 4' expand with the intended precedence.
 */
#define FLOAT_PARAMS(E, F)                                             \
    .exp_size       = (E),                                             \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                           \
    .exp_max        = (1 << (E)) - 1,                                  \
    .frac_size      = (F),                                             \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                   \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),         \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1),   \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1,   \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1

/* 5 exponent bits, 10 fraction bits */
static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

/* 8 exponent bits, 23 fraction bits */
static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

/* 11 exponent bits, 52 fraction bits */
static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};
273 
274 /* Unpack a float to parts, but do not canonicalize.  */
275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
276 {
277     const int sign_pos = fmt.frac_size + fmt.exp_size;
278 
279     return (FloatParts) {
280         .cls = float_class_unclassified,
281         .sign = extract64(raw, sign_pos, 1),
282         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
283         .frac = extract64(raw, 0, fmt.frac_size),
284     };
285 }
286 
287 static inline FloatParts float16_unpack_raw(float16 f)
288 {
289     return unpack_raw(float16_params, f);
290 }
291 
292 static inline FloatParts float32_unpack_raw(float32 f)
293 {
294     return unpack_raw(float32_params, f);
295 }
296 
297 static inline FloatParts float64_unpack_raw(float64 f)
298 {
299     return unpack_raw(float64_params, f);
300 }
301 
302 /* Pack a float from parts, but do not canonicalize.  */
303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
304 {
305     const int sign_pos = fmt.frac_size + fmt.exp_size;
306     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
307     return deposit64(ret, sign_pos, 1, p.sign);
308 }
309 
310 static inline float16 float16_pack_raw(FloatParts p)
311 {
312     return make_float16(pack_raw(float16_params, p));
313 }
314 
315 static inline float32 float32_pack_raw(FloatParts p)
316 {
317     return make_float32(pack_raw(float32_params, p));
318 }
319 
320 static inline float64 float64_pack_raw(FloatParts p)
321 {
322     return make_float64(pack_raw(float64_params, p));
323 }
324 
325 /* Canonicalize EXP and FRAC, setting CLS.  */
326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
327                                float_status *status)
328 {
329     if (part.exp == parm->exp_max) {
330         if (part.frac == 0) {
331             part.cls = float_class_inf;
332         } else {
333 #ifdef NO_SIGNALING_NANS
334             part.cls = float_class_qnan;
335 #else
336             int64_t msb = part.frac << (parm->frac_shift + 2);
337             if ((msb < 0) == status->snan_bit_is_one) {
338                 part.cls = float_class_snan;
339             } else {
340                 part.cls = float_class_qnan;
341             }
342 #endif
343         }
344     } else if (part.exp == 0) {
345         if (likely(part.frac == 0)) {
346             part.cls = float_class_zero;
347         } else if (status->flush_inputs_to_zero) {
348             float_raise(float_flag_input_denormal, status);
349             part.cls = float_class_zero;
350             part.frac = 0;
351         } else {
352             int shift = clz64(part.frac) - 1;
353             part.cls = float_class_normal;
354             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
355             part.frac <<= shift;
356         }
357     } else {
358         part.cls = float_class_normal;
359         part.exp -= parm->exp_bias;
360         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
361     }
362     return part;
363 }
364 
365 /* Round and uncanonicalize a floating-point number by parts. There
366  * are FRAC_SHIFT bits that may require rounding at the bottom of the
367  * fraction; these bits will be removed. The exponent will be biased
368  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
369  */
370 
371 static FloatParts round_canonical(FloatParts p, float_status *s,
372                                   const FloatFmt *parm)
373 {
374     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
375     const uint64_t round_mask = parm->round_mask;
376     const uint64_t roundeven_mask = parm->roundeven_mask;
377     const int exp_max = parm->exp_max;
378     const int frac_shift = parm->frac_shift;
379     uint64_t frac, inc;
380     int exp, flags = 0;
381     bool overflow_norm;
382 
383     frac = p.frac;
384     exp = p.exp;
385 
386     switch (p.cls) {
387     case float_class_normal:
388         switch (s->float_rounding_mode) {
389         case float_round_nearest_even:
390             overflow_norm = false;
391             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
392             break;
393         case float_round_ties_away:
394             overflow_norm = false;
395             inc = frac_lsbm1;
396             break;
397         case float_round_to_zero:
398             overflow_norm = true;
399             inc = 0;
400             break;
401         case float_round_up:
402             inc = p.sign ? 0 : round_mask;
403             overflow_norm = p.sign;
404             break;
405         case float_round_down:
406             inc = p.sign ? round_mask : 0;
407             overflow_norm = !p.sign;
408             break;
409         default:
410             g_assert_not_reached();
411         }
412 
413         exp += parm->exp_bias;
414         if (likely(exp > 0)) {
415             if (frac & round_mask) {
416                 flags |= float_flag_inexact;
417                 frac += inc;
418                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
419                     frac >>= 1;
420                     exp++;
421                 }
422             }
423             frac >>= frac_shift;
424 
425             if (unlikely(exp >= exp_max)) {
426                 flags |= float_flag_overflow | float_flag_inexact;
427                 if (overflow_norm) {
428                     exp = exp_max - 1;
429                     frac = -1;
430                 } else {
431                     p.cls = float_class_inf;
432                     goto do_inf;
433                 }
434             }
435         } else if (s->flush_to_zero) {
436             flags |= float_flag_output_denormal;
437             p.cls = float_class_zero;
438             goto do_zero;
439         } else {
440             bool is_tiny = (s->float_detect_tininess
441                             == float_tininess_before_rounding)
442                         || (exp < 0)
443                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
444 
445             shift64RightJamming(frac, 1 - exp, &frac);
446             if (frac & round_mask) {
447                 /* Need to recompute round-to-even.  */
448                 if (s->float_rounding_mode == float_round_nearest_even) {
449                     inc = ((frac & roundeven_mask) != frac_lsbm1
450                            ? frac_lsbm1 : 0);
451                 }
452                 flags |= float_flag_inexact;
453                 frac += inc;
454             }
455 
456             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
457             frac >>= frac_shift;
458 
459             if (is_tiny && (flags & float_flag_inexact)) {
460                 flags |= float_flag_underflow;
461             }
462             if (exp == 0 && frac == 0) {
463                 p.cls = float_class_zero;
464             }
465         }
466         break;
467 
468     case float_class_zero:
469     do_zero:
470         exp = 0;
471         frac = 0;
472         break;
473 
474     case float_class_inf:
475     do_inf:
476         exp = exp_max;
477         frac = 0;
478         break;
479 
480     case float_class_qnan:
481     case float_class_snan:
482         exp = exp_max;
483         break;
484 
485     default:
486         g_assert_not_reached();
487     }
488 
489     float_raise(flags, s);
490     p.exp = exp;
491     p.frac = frac;
492     return p;
493 }
494 
495 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
496 {
497     return canonicalize(float16_unpack_raw(f), &float16_params, s);
498 }
499 
500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
501 {
502     switch (p.cls) {
503     case float_class_dnan:
504         return float16_default_nan(s);
505     case float_class_msnan:
506         return float16_maybe_silence_nan(float16_pack_raw(p), s);
507     default:
508         p = round_canonical(p, s, &float16_params);
509         return float16_pack_raw(p);
510     }
511 }
512 
513 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
514 {
515     return canonicalize(float32_unpack_raw(f), &float32_params, s);
516 }
517 
518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
519 {
520     switch (p.cls) {
521     case float_class_dnan:
522         return float32_default_nan(s);
523     case float_class_msnan:
524         return float32_maybe_silence_nan(float32_pack_raw(p), s);
525     default:
526         p = round_canonical(p, s, &float32_params);
527         return float32_pack_raw(p);
528     }
529 }
530 
531 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
532 {
533     return canonicalize(float64_unpack_raw(f), &float64_params, s);
534 }
535 
536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
537 {
538     switch (p.cls) {
539     case float_class_dnan:
540         return float64_default_nan(s);
541     case float_class_msnan:
542         return float64_maybe_silence_nan(float64_pack_raw(p), s);
543     default:
544         p = round_canonical(p, s, &float64_params);
545         return float64_pack_raw(p);
546     }
547 }
548 
549 /* Simple helpers for checking what kind of NaN we have */
550 static bool is_nan(FloatClass c)
551 {
552     return unlikely(c >= float_class_qnan);
553 }
554 static bool is_snan(FloatClass c)
555 {
556     return c == float_class_snan;
557 }
558 static bool is_qnan(FloatClass c)
559 {
560     return c == float_class_qnan;
561 }
562 
563 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
564 {
565     if (is_snan(a.cls) || is_snan(b.cls)) {
566         s->float_exception_flags |= float_flag_invalid;
567     }
568 
569     if (s->default_nan_mode) {
570         a.cls = float_class_dnan;
571     } else {
572         if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
573                     is_qnan(b.cls), is_snan(b.cls),
574                     a.frac > b.frac ||
575                     (a.frac == b.frac && a.sign < b.sign))) {
576             a = b;
577         }
578         a.cls = float_class_msnan;
579     }
580     return a;
581 }
582 
583 /*
584  * Returns the result of adding or subtracting the values of the
585  * floating-point values `a' and `b'. The operation is performed
586  * according to the IEC/IEEE Standard for Binary Floating-Point
587  * Arithmetic.
588  */
589 
590 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
591                                 float_status *s)
592 {
593     bool a_sign = a.sign;
594     bool b_sign = b.sign ^ subtract;
595 
596     if (a_sign != b_sign) {
597         /* Subtraction */
598 
599         if (a.cls == float_class_normal && b.cls == float_class_normal) {
600             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
601                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
602                 a.frac = a.frac - b.frac;
603             } else {
604                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
605                 a.frac = b.frac - a.frac;
606                 a.exp = b.exp;
607                 a_sign ^= 1;
608             }
609 
610             if (a.frac == 0) {
611                 a.cls = float_class_zero;
612                 a.sign = s->float_rounding_mode == float_round_down;
613             } else {
614                 int shift = clz64(a.frac) - 1;
615                 a.frac = a.frac << shift;
616                 a.exp = a.exp - shift;
617                 a.sign = a_sign;
618             }
619             return a;
620         }
621         if (is_nan(a.cls) || is_nan(b.cls)) {
622             return pick_nan(a, b, s);
623         }
624         if (a.cls == float_class_inf) {
625             if (b.cls == float_class_inf) {
626                 float_raise(float_flag_invalid, s);
627                 a.cls = float_class_dnan;
628             }
629             return a;
630         }
631         if (a.cls == float_class_zero && b.cls == float_class_zero) {
632             a.sign = s->float_rounding_mode == float_round_down;
633             return a;
634         }
635         if (a.cls == float_class_zero || b.cls == float_class_inf) {
636             b.sign = a_sign ^ 1;
637             return b;
638         }
639         if (b.cls == float_class_zero) {
640             return a;
641         }
642     } else {
643         /* Addition */
644         if (a.cls == float_class_normal && b.cls == float_class_normal) {
645             if (a.exp > b.exp) {
646                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
647             } else if (a.exp < b.exp) {
648                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
649                 a.exp = b.exp;
650             }
651             a.frac += b.frac;
652             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
653                 a.frac >>= 1;
654                 a.exp += 1;
655             }
656             return a;
657         }
658         if (is_nan(a.cls) || is_nan(b.cls)) {
659             return pick_nan(a, b, s);
660         }
661         if (a.cls == float_class_inf || b.cls == float_class_zero) {
662             return a;
663         }
664         if (b.cls == float_class_inf || a.cls == float_class_zero) {
665             b.sign = b_sign;
666             return b;
667         }
668     }
669     g_assert_not_reached();
670 }
671 
672 /*
673  * Returns the result of adding or subtracting the floating-point
674  * values `a' and `b'. The operation is performed according to the
675  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
676  */
677 
678 float16  __attribute__((flatten)) float16_add(float16 a, float16 b,
679                                               float_status *status)
680 {
681     FloatParts pa = float16_unpack_canonical(a, status);
682     FloatParts pb = float16_unpack_canonical(b, status);
683     FloatParts pr = addsub_floats(pa, pb, false, status);
684 
685     return float16_round_pack_canonical(pr, status);
686 }
687 
688 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
689                                              float_status *status)
690 {
691     FloatParts pa = float32_unpack_canonical(a, status);
692     FloatParts pb = float32_unpack_canonical(b, status);
693     FloatParts pr = addsub_floats(pa, pb, false, status);
694 
695     return float32_round_pack_canonical(pr, status);
696 }
697 
698 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
699                                              float_status *status)
700 {
701     FloatParts pa = float64_unpack_canonical(a, status);
702     FloatParts pb = float64_unpack_canonical(b, status);
703     FloatParts pr = addsub_floats(pa, pb, false, status);
704 
705     return float64_round_pack_canonical(pr, status);
706 }
707 
708 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
709                                              float_status *status)
710 {
711     FloatParts pa = float16_unpack_canonical(a, status);
712     FloatParts pb = float16_unpack_canonical(b, status);
713     FloatParts pr = addsub_floats(pa, pb, true, status);
714 
715     return float16_round_pack_canonical(pr, status);
716 }
717 
718 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
719                                              float_status *status)
720 {
721     FloatParts pa = float32_unpack_canonical(a, status);
722     FloatParts pb = float32_unpack_canonical(b, status);
723     FloatParts pr = addsub_floats(pa, pb, true, status);
724 
725     return float32_round_pack_canonical(pr, status);
726 }
727 
728 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
729                                              float_status *status)
730 {
731     FloatParts pa = float64_unpack_canonical(a, status);
732     FloatParts pb = float64_unpack_canonical(b, status);
733     FloatParts pr = addsub_floats(pa, pb, true, status);
734 
735     return float64_round_pack_canonical(pr, status);
736 }
737 
738 /*
739  * Returns the result of multiplying the floating-point values `a' and
740  * `b'. The operation is performed according to the IEC/IEEE Standard
741  * for Binary Floating-Point Arithmetic.
742  */
743 
744 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
745 {
746     bool sign = a.sign ^ b.sign;
747 
748     if (a.cls == float_class_normal && b.cls == float_class_normal) {
749         uint64_t hi, lo;
750         int exp = a.exp + b.exp;
751 
752         mul64To128(a.frac, b.frac, &hi, &lo);
753         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
754         if (lo & DECOMPOSED_OVERFLOW_BIT) {
755             shift64RightJamming(lo, 1, &lo);
756             exp += 1;
757         }
758 
759         /* Re-use a */
760         a.exp = exp;
761         a.sign = sign;
762         a.frac = lo;
763         return a;
764     }
765     /* handle all the NaN cases */
766     if (is_nan(a.cls) || is_nan(b.cls)) {
767         return pick_nan(a, b, s);
768     }
769     /* Inf * Zero == NaN */
770     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
771         (a.cls == float_class_zero && b.cls == float_class_inf)) {
772         s->float_exception_flags |= float_flag_invalid;
773         a.cls = float_class_dnan;
774         a.sign = sign;
775         return a;
776     }
777     /* Multiply by 0 or Inf */
778     if (a.cls == float_class_inf || a.cls == float_class_zero) {
779         a.sign = sign;
780         return a;
781     }
782     if (b.cls == float_class_inf || b.cls == float_class_zero) {
783         b.sign = sign;
784         return b;
785     }
786     g_assert_not_reached();
787 }
788 
789 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
790                                              float_status *status)
791 {
792     FloatParts pa = float16_unpack_canonical(a, status);
793     FloatParts pb = float16_unpack_canonical(b, status);
794     FloatParts pr = mul_floats(pa, pb, status);
795 
796     return float16_round_pack_canonical(pr, status);
797 }
798 
799 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
800                                              float_status *status)
801 {
802     FloatParts pa = float32_unpack_canonical(a, status);
803     FloatParts pb = float32_unpack_canonical(b, status);
804     FloatParts pr = mul_floats(pa, pb, status);
805 
806     return float32_round_pack_canonical(pr, status);
807 }
808 
809 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
810                                              float_status *status)
811 {
812     FloatParts pa = float64_unpack_canonical(a, status);
813     FloatParts pb = float64_unpack_canonical(b, status);
814     FloatParts pr = mul_floats(pa, pb, status);
815 
816     return float64_round_pack_canonical(pr, status);
817 }
818 
819 /*
820  * Returns the result of dividing the floating-point value `a' by the
821  * corresponding value `b'. The operation is performed according to
822  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
823  */
824 
825 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
826 {
827     bool sign = a.sign ^ b.sign;
828 
829     if (a.cls == float_class_normal && b.cls == float_class_normal) {
830         uint64_t temp_lo, temp_hi;
831         int exp = a.exp - b.exp;
832         if (a.frac < b.frac) {
833             exp -= 1;
834             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
835                               &temp_hi, &temp_lo);
836         } else {
837             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
838                               &temp_hi, &temp_lo);
839         }
840         /* LSB of quot is set if inexact which roundandpack will use
841          * to set flags. Yet again we re-use a for the result */
842         a.frac = div128To64(temp_lo, temp_hi, b.frac);
843         a.sign = sign;
844         a.exp = exp;
845         return a;
846     }
847     /* handle all the NaN cases */
848     if (is_nan(a.cls) || is_nan(b.cls)) {
849         return pick_nan(a, b, s);
850     }
851     /* 0/0 or Inf/Inf */
852     if (a.cls == b.cls
853         &&
854         (a.cls == float_class_inf || a.cls == float_class_zero)) {
855         s->float_exception_flags |= float_flag_invalid;
856         a.cls = float_class_dnan;
857         return a;
858     }
859     /* Div 0 => Inf */
860     if (b.cls == float_class_zero) {
861         s->float_exception_flags |= float_flag_divbyzero;
862         a.cls = float_class_inf;
863         a.sign = sign;
864         return a;
865     }
866     /* Inf / x or 0 / x */
867     if (a.cls == float_class_inf || a.cls == float_class_zero) {
868         a.sign = sign;
869         return a;
870     }
871     /* Div by Inf */
872     if (b.cls == float_class_inf) {
873         a.cls = float_class_zero;
874         a.sign = sign;
875         return a;
876     }
877     g_assert_not_reached();
878 }
879 
880 float16 float16_div(float16 a, float16 b, float_status *status)
881 {
882     FloatParts pa = float16_unpack_canonical(a, status);
883     FloatParts pb = float16_unpack_canonical(b, status);
884     FloatParts pr = div_floats(pa, pb, status);
885 
886     return float16_round_pack_canonical(pr, status);
887 }
888 
889 float32 float32_div(float32 a, float32 b, float_status *status)
890 {
891     FloatParts pa = float32_unpack_canonical(a, status);
892     FloatParts pb = float32_unpack_canonical(b, status);
893     FloatParts pr = div_floats(pa, pb, status);
894 
895     return float32_round_pack_canonical(pr, status);
896 }
897 
898 float64 float64_div(float64 a, float64 b, float_status *status)
899 {
900     FloatParts pa = float64_unpack_canonical(a, status);
901     FloatParts pb = float64_unpack_canonical(b, status);
902     FloatParts pr = div_floats(pa, pb, status);
903 
904     return float64_round_pack_canonical(pr, status);
905 }
906 
907 /*----------------------------------------------------------------------------
908 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
909 | and 7, and returns the properly rounded 32-bit integer corresponding to the
910 | input.  If `zSign' is 1, the input is negated before being converted to an
911 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
912 | is simply rounded to an integer, with the inexact exception raised if the
913 | input cannot be represented exactly as an integer.  However, if the fixed-
914 | point input is too large, the invalid exception is raised and the largest
915 | positive or negative integer is returned.
916 *----------------------------------------------------------------------------*/
917 
918 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
919 {
920     int8_t roundingMode;
921     flag roundNearestEven;
922     int8_t roundIncrement, roundBits;
923     int32_t z;
924 
925     roundingMode = status->float_rounding_mode;
926     roundNearestEven = ( roundingMode == float_round_nearest_even );
927     switch (roundingMode) {
928     case float_round_nearest_even:
929     case float_round_ties_away:
930         roundIncrement = 0x40;
931         break;
932     case float_round_to_zero:
933         roundIncrement = 0;
934         break;
935     case float_round_up:
936         roundIncrement = zSign ? 0 : 0x7f;
937         break;
938     case float_round_down:
939         roundIncrement = zSign ? 0x7f : 0;
940         break;
941     default:
942         abort();
943     }
944     roundBits = absZ & 0x7F;
945     absZ = ( absZ + roundIncrement )>>7;
946     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
947     z = absZ;
948     if ( zSign ) z = - z;
949     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
950         float_raise(float_flag_invalid, status);
951         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
952     }
953     if (roundBits) {
954         status->float_exception_flags |= float_flag_inexact;
955     }
956     return z;
957 
958 }
959 
960 /*----------------------------------------------------------------------------
961 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
962 | `absZ1', with binary point between bits 63 and 64 (between the input words),
963 | and returns the properly rounded 64-bit integer corresponding to the input.
964 | If `zSign' is 1, the input is negated before being converted to an integer.
965 | Ordinarily, the fixed-point input is simply rounded to an integer, with
966 | the inexact exception raised if the input cannot be represented exactly as
967 | an integer.  However, if the fixed-point input is too large, the invalid
968 | exception is raised and the largest positive or negative integer is
969 | returned.
970 *----------------------------------------------------------------------------*/
971 
972 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
973                                float_status *status)
974 {
975     int8_t roundingMode;
976     flag roundNearestEven, increment;
977     int64_t z;
978 
979     roundingMode = status->float_rounding_mode;
980     roundNearestEven = ( roundingMode == float_round_nearest_even );
981     switch (roundingMode) {
982     case float_round_nearest_even:
983     case float_round_ties_away:
984         increment = ((int64_t) absZ1 < 0);
985         break;
986     case float_round_to_zero:
987         increment = 0;
988         break;
989     case float_round_up:
990         increment = !zSign && absZ1;
991         break;
992     case float_round_down:
993         increment = zSign && absZ1;
994         break;
995     default:
996         abort();
997     }
998     if ( increment ) {
999         ++absZ0;
1000         if ( absZ0 == 0 ) goto overflow;
1001         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
1002     }
1003     z = absZ0;
1004     if ( zSign ) z = - z;
1005     if ( z && ( ( z < 0 ) ^ zSign ) ) {
1006  overflow:
1007         float_raise(float_flag_invalid, status);
1008         return
1009               zSign ? (int64_t) LIT64( 0x8000000000000000 )
1010             : LIT64( 0x7FFFFFFFFFFFFFFF );
1011     }
1012     if (absZ1) {
1013         status->float_exception_flags |= float_flag_inexact;
1014     }
1015     return z;
1016 
1017 }
1018 
1019 /*----------------------------------------------------------------------------
1020 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
1021 | `absZ1', with binary point between bits 63 and 64 (between the input words),
1022 | and returns the properly rounded 64-bit unsigned integer corresponding to the
1023 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
1024 | with the inexact exception raised if the input cannot be represented exactly
1025 | as an integer.  However, if the fixed-point input is too large, the invalid
1026 | exception is raised and the largest unsigned integer is returned.
1027 *----------------------------------------------------------------------------*/
1028 
1029 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
1030                                 uint64_t absZ1, float_status *status)
1031 {
1032     int8_t roundingMode;
1033     flag roundNearestEven, increment;
1034 
1035     roundingMode = status->float_rounding_mode;
1036     roundNearestEven = (roundingMode == float_round_nearest_even);
1037     switch (roundingMode) {
1038     case float_round_nearest_even:
1039     case float_round_ties_away:
1040         increment = ((int64_t)absZ1 < 0);
1041         break;
1042     case float_round_to_zero:
1043         increment = 0;
1044         break;
1045     case float_round_up:
1046         increment = !zSign && absZ1;
1047         break;
1048     case float_round_down:
1049         increment = zSign && absZ1;
1050         break;
1051     default:
1052         abort();
1053     }
1054     if (increment) {
1055         ++absZ0;
1056         if (absZ0 == 0) {
1057             float_raise(float_flag_invalid, status);
1058             return LIT64(0xFFFFFFFFFFFFFFFF);
1059         }
1060         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
1061     }
1062 
1063     if (zSign && absZ0) {
1064         float_raise(float_flag_invalid, status);
1065         return 0;
1066     }
1067 
1068     if (absZ1) {
1069         status->float_exception_flags |= float_flag_inexact;
1070     }
1071     return absZ0;
1072 }
1073 
1074 /*----------------------------------------------------------------------------
1075 | If `a' is denormal and we are in flush-to-zero mode then set the
1076 | input-denormal exception and return zero. Otherwise just return the value.
1077 *----------------------------------------------------------------------------*/
1078 float32 float32_squash_input_denormal(float32 a, float_status *status)
1079 {
1080     if (status->flush_inputs_to_zero) {
1081         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
1082             float_raise(float_flag_input_denormal, status);
1083             return make_float32(float32_val(a) & 0x80000000);
1084         }
1085     }
1086     return a;
1087 }
1088 
1089 /*----------------------------------------------------------------------------
1090 | Normalizes the subnormal single-precision floating-point value represented
1091 | by the denormalized significand `aSig'.  The normalized exponent and
1092 | significand are stored at the locations pointed to by `zExpPtr' and
1093 | `zSigPtr', respectively.
1094 *----------------------------------------------------------------------------*/
1095 
static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /* Left shift that leaves exactly 8 leading zero bits in the result. */
    const int8_t shift = countLeadingZeros32(aSig) - 8;

    *zSigPtr = aSig << shift;
    /* The exponent drops by one for every bit shifted, starting from 1. */
    *zExpPtr = 1 - shift;
}
1106 
1107 /*----------------------------------------------------------------------------
1108 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
1109 | single-precision floating-point value, returning the result.  After being
1110 | shifted into the proper positions, the three fields are simply added
1111 | together to form the result.  This means that any integer portion of `zSig'
1112 | will be added into the exponent.  Since a properly normalized significand
1113 | will have an integer portion equal to 1, the `zExp' input should be 1 less
1114 | than the desired result exponent whenever `zSig' is a complete, normalized
1115 | significand.
1116 *----------------------------------------------------------------------------*/
1117 
1118 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
1119 {
1120 
1121     return make_float32(
1122           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
1123 
1124 }
1125 
1126 /*----------------------------------------------------------------------------
1127 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1128 | and significand `zSig', and returns the proper single-precision floating-
1129 | point value corresponding to the abstract input.  Ordinarily, the abstract
1130 | value is simply rounded and packed into the single-precision format, with
1131 | the inexact exception raised if the abstract input cannot be represented
1132 | exactly.  However, if the abstract value is too large, the overflow and
1133 | inexact exceptions are raised and an infinity or maximal finite value is
1134 | returned.  If the abstract value is too small, the input value is rounded to
1135 | a subnormal number, and the underflow and inexact exceptions are raised if
1136 | the abstract input cannot be represented exactly as a subnormal single-
1137 | precision floating-point number.
1138 |     The input significand `zSig' has its binary point between bits 30
1139 | and 29, which is 7 bits to the left of the usual location.  This shifted
1140 | significand must be normalized or smaller.  If `zSig' is not normalized,
1141 | `zExp' must be 0; in that case, the result returned is a subnormal number,
1142 | and it must not require rounding.  In the usual case that `zSig' is
1143 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
1144 | The handling of underflow and overflow follows the IEC/IEEE Standard for
1145 | Binary Floating-Point Arithmetic.
1146 *----------------------------------------------------------------------------*/
1147 
1148 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
1149                                    float_status *status)
1150 {
1151     int8_t roundingMode;
1152     flag roundNearestEven;
1153     int8_t roundIncrement, roundBits;
1154     flag isTiny;
1155 
1156     roundingMode = status->float_rounding_mode;
1157     roundNearestEven = ( roundingMode == float_round_nearest_even );
1158     switch (roundingMode) {
1159     case float_round_nearest_even:
1160     case float_round_ties_away:
1161         roundIncrement = 0x40;
1162         break;
1163     case float_round_to_zero:
1164         roundIncrement = 0;
1165         break;
1166     case float_round_up:
1167         roundIncrement = zSign ? 0 : 0x7f;
1168         break;
1169     case float_round_down:
1170         roundIncrement = zSign ? 0x7f : 0;
1171         break;
1172     default:
1173         abort();
1174         break;
1175     }
1176     roundBits = zSig & 0x7F;
1177     if ( 0xFD <= (uint16_t) zExp ) {
1178         if (    ( 0xFD < zExp )
1179              || (    ( zExp == 0xFD )
1180                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
1181            ) {
1182             float_raise(float_flag_overflow | float_flag_inexact, status);
1183             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
1184         }
1185         if ( zExp < 0 ) {
1186             if (status->flush_to_zero) {
1187                 float_raise(float_flag_output_denormal, status);
1188                 return packFloat32(zSign, 0, 0);
1189             }
1190             isTiny =
1191                 (status->float_detect_tininess
1192                  == float_tininess_before_rounding)
1193                 || ( zExp < -1 )
1194                 || ( zSig + roundIncrement < 0x80000000 );
1195             shift32RightJamming( zSig, - zExp, &zSig );
1196             zExp = 0;
1197             roundBits = zSig & 0x7F;
1198             if (isTiny && roundBits) {
1199                 float_raise(float_flag_underflow, status);
1200             }
1201         }
1202     }
1203     if (roundBits) {
1204         status->float_exception_flags |= float_flag_inexact;
1205     }
1206     zSig = ( zSig + roundIncrement )>>7;
1207     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
1208     if ( zSig == 0 ) zExp = 0;
1209     return packFloat32( zSign, zExp, zSig );
1210 
1211 }
1212 
1213 /*----------------------------------------------------------------------------
1214 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1215 | and significand `zSig', and returns the proper single-precision floating-
1216 | point value corresponding to the abstract input.  This routine is just like
1217 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
1218 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
1219 | floating-point exponent.
1220 *----------------------------------------------------------------------------*/
1221 
1222 static float32
1223  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
1224                               float_status *status)
1225 {
1226     int8_t shiftCount;
1227 
1228     shiftCount = countLeadingZeros32( zSig ) - 1;
1229     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
1230                                status);
1231 
1232 }
1233 
1234 /*----------------------------------------------------------------------------
1235 | If `a' is denormal and we are in flush-to-zero mode then set the
1236 | input-denormal exception and return zero. Otherwise just return the value.
1237 *----------------------------------------------------------------------------*/
1238 float64 float64_squash_input_denormal(float64 a, float_status *status)
1239 {
1240     if (status->flush_inputs_to_zero) {
1241         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
1242             float_raise(float_flag_input_denormal, status);
1243             return make_float64(float64_val(a) & (1ULL << 63));
1244         }
1245     }
1246     return a;
1247 }
1248 
1249 /*----------------------------------------------------------------------------
1250 | Normalizes the subnormal double-precision floating-point value represented
1251 | by the denormalized significand `aSig'.  The normalized exponent and
1252 | significand are stored at the locations pointed to by `zExpPtr' and
1253 | `zSigPtr', respectively.
1254 *----------------------------------------------------------------------------*/
1255 
static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    /* Left shift that leaves exactly 11 leading zero bits in the result. */
    const int8_t shift = countLeadingZeros64(aSig) - 11;

    *zSigPtr = aSig << shift;
    /* The exponent drops by one for every bit shifted, starting from 1. */
    *zExpPtr = 1 - shift;
}
1266 
1267 /*----------------------------------------------------------------------------
1268 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
1269 | double-precision floating-point value, returning the result.  After being
1270 | shifted into the proper positions, the three fields are simply added
1271 | together to form the result.  This means that any integer portion of `zSig'
1272 | will be added into the exponent.  Since a properly normalized significand
1273 | will have an integer portion equal to 1, the `zExp' input should be 1 less
1274 | than the desired result exponent whenever `zSig' is a complete, normalized
1275 | significand.
1276 *----------------------------------------------------------------------------*/
1277 
1278 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
1279 {
1280 
1281     return make_float64(
1282         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
1283 
1284 }
1285 
1286 /*----------------------------------------------------------------------------
1287 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1288 | and significand `zSig', and returns the proper double-precision floating-
1289 | point value corresponding to the abstract input.  Ordinarily, the abstract
1290 | value is simply rounded and packed into the double-precision format, with
1291 | the inexact exception raised if the abstract input cannot be represented
1292 | exactly.  However, if the abstract value is too large, the overflow and
1293 | inexact exceptions are raised and an infinity or maximal finite value is
1294 | returned.  If the abstract value is too small, the input value is rounded to
1295 | a subnormal number, and the underflow and inexact exceptions are raised if
1296 | the abstract input cannot be represented exactly as a subnormal double-
1297 | precision floating-point number.
1298 |     The input significand `zSig' has its binary point between bits 62
1299 | and 61, which is 10 bits to the left of the usual location.  This shifted
1300 | significand must be normalized or smaller.  If `zSig' is not normalized,
1301 | `zExp' must be 0; in that case, the result returned is a subnormal number,
1302 | and it must not require rounding.  In the usual case that `zSig' is
1303 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
1304 | The handling of underflow and overflow follows the IEC/IEEE Standard for
1305 | Binary Floating-Point Arithmetic.
1306 *----------------------------------------------------------------------------*/
1307 
1308 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
1309                                    float_status *status)
1310 {
1311     int8_t roundingMode;
1312     flag roundNearestEven;
1313     int roundIncrement, roundBits;
1314     flag isTiny;
1315 
1316     roundingMode = status->float_rounding_mode;
1317     roundNearestEven = ( roundingMode == float_round_nearest_even );
1318     switch (roundingMode) {
1319     case float_round_nearest_even:
1320     case float_round_ties_away:
1321         roundIncrement = 0x200;
1322         break;
1323     case float_round_to_zero:
1324         roundIncrement = 0;
1325         break;
1326     case float_round_up:
1327         roundIncrement = zSign ? 0 : 0x3ff;
1328         break;
1329     case float_round_down:
1330         roundIncrement = zSign ? 0x3ff : 0;
1331         break;
1332     case float_round_to_odd:
1333         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
1334         break;
1335     default:
1336         abort();
1337     }
1338     roundBits = zSig & 0x3FF;
1339     if ( 0x7FD <= (uint16_t) zExp ) {
1340         if (    ( 0x7FD < zExp )
1341              || (    ( zExp == 0x7FD )
1342                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
1343            ) {
1344             bool overflow_to_inf = roundingMode != float_round_to_odd &&
1345                                    roundIncrement != 0;
1346             float_raise(float_flag_overflow | float_flag_inexact, status);
1347             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
1348         }
1349         if ( zExp < 0 ) {
1350             if (status->flush_to_zero) {
1351                 float_raise(float_flag_output_denormal, status);
1352                 return packFloat64(zSign, 0, 0);
1353             }
1354             isTiny =
1355                    (status->float_detect_tininess
1356                     == float_tininess_before_rounding)
1357                 || ( zExp < -1 )
1358                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
1359             shift64RightJamming( zSig, - zExp, &zSig );
1360             zExp = 0;
1361             roundBits = zSig & 0x3FF;
1362             if (isTiny && roundBits) {
1363                 float_raise(float_flag_underflow, status);
1364             }
1365             if (roundingMode == float_round_to_odd) {
1366                 /*
1367                  * For round-to-odd case, the roundIncrement depends on
1368                  * zSig which just changed.
1369                  */
1370                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
1371             }
1372         }
1373     }
1374     if (roundBits) {
1375         status->float_exception_flags |= float_flag_inexact;
1376     }
1377     zSig = ( zSig + roundIncrement )>>10;
1378     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
1379     if ( zSig == 0 ) zExp = 0;
1380     return packFloat64( zSign, zExp, zSig );
1381 
1382 }
1383 
1384 /*----------------------------------------------------------------------------
1385 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1386 | and significand `zSig', and returns the proper double-precision floating-
1387 | point value corresponding to the abstract input.  This routine is just like
1388 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
1389 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
1390 | floating-point exponent.
1391 *----------------------------------------------------------------------------*/
1392 
1393 static float64
1394  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
1395                               float_status *status)
1396 {
1397     int8_t shiftCount;
1398 
1399     shiftCount = countLeadingZeros64( zSig ) - 1;
1400     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
1401                                status);
1402 
1403 }
1404 
1405 /*----------------------------------------------------------------------------
1406 | Returns the fraction bits of the extended double-precision floating-point
1407 | value `a'.
1408 *----------------------------------------------------------------------------*/
1409 
1410 static inline uint64_t extractFloatx80Frac( floatx80 a )
1411 {
1412 
1413     return a.low;
1414 
1415 }
1416 
1417 /*----------------------------------------------------------------------------
1418 | Returns the exponent bits of the extended double-precision floating-point
1419 | value `a'.
1420 *----------------------------------------------------------------------------*/
1421 
1422 static inline int32_t extractFloatx80Exp( floatx80 a )
1423 {
1424 
1425     return a.high & 0x7FFF;
1426 
1427 }
1428 
1429 /*----------------------------------------------------------------------------
1430 | Returns the sign bit of the extended double-precision floating-point value
1431 | `a'.
1432 *----------------------------------------------------------------------------*/
1433 
1434 static inline flag extractFloatx80Sign( floatx80 a )
1435 {
1436 
1437     return a.high>>15;
1438 
1439 }
1440 
1441 /*----------------------------------------------------------------------------
1442 | Normalizes the subnormal extended double-precision floating-point value
1443 | represented by the denormalized significand `aSig'.  The normalized exponent
1444 | and significand are stored at the locations pointed to by `zExpPtr' and
1445 | `zSigPtr', respectively.
1446 *----------------------------------------------------------------------------*/
1447 
static void
 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr )
{
    /* Left shift that removes every leading zero bit of aSig. */
    const int8_t shift = countLeadingZeros64(aSig);

    *zSigPtr = aSig << shift;
    /* The exponent drops by one for every bit shifted, starting from 1. */
    *zExpPtr = 1 - shift;
}
1458 
1459 /*----------------------------------------------------------------------------
1460 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
1461 | extended double-precision floating-point value, returning the result.
1462 *----------------------------------------------------------------------------*/
1463 
1464 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
1465 {
1466     floatx80 z;
1467 
1468     z.low = zSig;
1469     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
1470     return z;
1471 
1472 }
1473 
1474 /*----------------------------------------------------------------------------
1475 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1476 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
1477 | and returns the proper extended double-precision floating-point value
1478 | corresponding to the abstract input.  Ordinarily, the abstract value is
1479 | rounded and packed into the extended double-precision format, with the
1480 | inexact exception raised if the abstract input cannot be represented
1481 | exactly.  However, if the abstract value is too large, the overflow and
1482 | inexact exceptions are raised and an infinity or maximal finite value is
1483 | returned.  If the abstract value is too small, the input value is rounded to
1484 | a subnormal number, and the underflow and inexact exceptions are raised if
1485 | the abstract input cannot be represented exactly as a subnormal extended
1486 | double-precision floating-point number.
1487 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
1488 | number of bits as single or double precision, respectively.  Otherwise, the
1489 | result is rounded to the full precision of the extended double-precision
1490 | format.
1491 |     The input significand must be normalized or smaller.  If the input
1492 | significand is not normalized, `zExp' must be 0; in that case, the result
1493 | returned is a subnormal number, and it must not require rounding.  The
1494 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
1495 | Floating-Point Arithmetic.
1496 *----------------------------------------------------------------------------*/
1497 
1498 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
1499                                      int32_t zExp, uint64_t zSig0, uint64_t zSig1,
1500                                      float_status *status)
1501 {
1502     int8_t roundingMode;
1503     flag roundNearestEven, increment, isTiny;
1504     int64_t roundIncrement, roundMask, roundBits;
1505 
1506     roundingMode = status->float_rounding_mode;
1507     roundNearestEven = ( roundingMode == float_round_nearest_even );
1508     if ( roundingPrecision == 80 ) goto precision80;
1509     if ( roundingPrecision == 64 ) {
1510         roundIncrement = LIT64( 0x0000000000000400 );
1511         roundMask = LIT64( 0x00000000000007FF );
1512     }
1513     else if ( roundingPrecision == 32 ) {
1514         roundIncrement = LIT64( 0x0000008000000000 );
1515         roundMask = LIT64( 0x000000FFFFFFFFFF );
1516     }
1517     else {
1518         goto precision80;
1519     }
1520     zSig0 |= ( zSig1 != 0 );
1521     switch (roundingMode) {
1522     case float_round_nearest_even:
1523     case float_round_ties_away:
1524         break;
1525     case float_round_to_zero:
1526         roundIncrement = 0;
1527         break;
1528     case float_round_up:
1529         roundIncrement = zSign ? 0 : roundMask;
1530         break;
1531     case float_round_down:
1532         roundIncrement = zSign ? roundMask : 0;
1533         break;
1534     default:
1535         abort();
1536     }
1537     roundBits = zSig0 & roundMask;
1538     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
1539         if (    ( 0x7FFE < zExp )
1540              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
1541            ) {
1542             goto overflow;
1543         }
1544         if ( zExp <= 0 ) {
1545             if (status->flush_to_zero) {
1546                 float_raise(float_flag_output_denormal, status);
1547                 return packFloatx80(zSign, 0, 0);
1548             }
1549             isTiny =
1550                    (status->float_detect_tininess
1551                     == float_tininess_before_rounding)
1552                 || ( zExp < 0 )
1553                 || ( zSig0 <= zSig0 + roundIncrement );
1554             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
1555             zExp = 0;
1556             roundBits = zSig0 & roundMask;
1557             if (isTiny && roundBits) {
1558                 float_raise(float_flag_underflow, status);
1559             }
1560             if (roundBits) {
1561                 status->float_exception_flags |= float_flag_inexact;
1562             }
1563             zSig0 += roundIncrement;
1564             if ( (int64_t) zSig0 < 0 ) zExp = 1;
1565             roundIncrement = roundMask + 1;
1566             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
1567                 roundMask |= roundIncrement;
1568             }
1569             zSig0 &= ~ roundMask;
1570             return packFloatx80( zSign, zExp, zSig0 );
1571         }
1572     }
1573     if (roundBits) {
1574         status->float_exception_flags |= float_flag_inexact;
1575     }
1576     zSig0 += roundIncrement;
1577     if ( zSig0 < roundIncrement ) {
1578         ++zExp;
1579         zSig0 = LIT64( 0x8000000000000000 );
1580     }
1581     roundIncrement = roundMask + 1;
1582     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
1583         roundMask |= roundIncrement;
1584     }
1585     zSig0 &= ~ roundMask;
1586     if ( zSig0 == 0 ) zExp = 0;
1587     return packFloatx80( zSign, zExp, zSig0 );
1588  precision80:
1589     switch (roundingMode) {
1590     case float_round_nearest_even:
1591     case float_round_ties_away:
1592         increment = ((int64_t)zSig1 < 0);
1593         break;
1594     case float_round_to_zero:
1595         increment = 0;
1596         break;
1597     case float_round_up:
1598         increment = !zSign && zSig1;
1599         break;
1600     case float_round_down:
1601         increment = zSign && zSig1;
1602         break;
1603     default:
1604         abort();
1605     }
1606     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
1607         if (    ( 0x7FFE < zExp )
1608              || (    ( zExp == 0x7FFE )
1609                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
1610                   && increment
1611                 )
1612            ) {
1613             roundMask = 0;
1614  overflow:
1615             float_raise(float_flag_overflow | float_flag_inexact, status);
1616             if (    ( roundingMode == float_round_to_zero )
1617                  || ( zSign && ( roundingMode == float_round_up ) )
1618                  || ( ! zSign && ( roundingMode == float_round_down ) )
1619                ) {
1620                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
1621             }
1622             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1623         }
1624         if ( zExp <= 0 ) {
1625             isTiny =
1626                    (status->float_detect_tininess
1627                     == float_tininess_before_rounding)
1628                 || ( zExp < 0 )
1629                 || ! increment
1630                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
1631             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
1632             zExp = 0;
1633             if (isTiny && zSig1) {
1634                 float_raise(float_flag_underflow, status);
1635             }
1636             if (zSig1) {
1637                 status->float_exception_flags |= float_flag_inexact;
1638             }
1639             switch (roundingMode) {
1640             case float_round_nearest_even:
1641             case float_round_ties_away:
1642                 increment = ((int64_t)zSig1 < 0);
1643                 break;
1644             case float_round_to_zero:
1645                 increment = 0;
1646                 break;
1647             case float_round_up:
1648                 increment = !zSign && zSig1;
1649                 break;
1650             case float_round_down:
1651                 increment = zSign && zSig1;
1652                 break;
1653             default:
1654                 abort();
1655             }
1656             if ( increment ) {
1657                 ++zSig0;
1658                 zSig0 &=
1659                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
1660                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
1661             }
1662             return packFloatx80( zSign, zExp, zSig0 );
1663         }
1664     }
1665     if (zSig1) {
1666         status->float_exception_flags |= float_flag_inexact;
1667     }
1668     if ( increment ) {
1669         ++zSig0;
1670         if ( zSig0 == 0 ) {
1671             ++zExp;
1672             zSig0 = LIT64( 0x8000000000000000 );
1673         }
1674         else {
1675             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
1676         }
1677     }
1678     else {
1679         if ( zSig0 == 0 ) zExp = 0;
1680     }
1681     return packFloatx80( zSign, zExp, zSig0 );
1682 
1683 }
1684 
1685 /*----------------------------------------------------------------------------
1686 | Takes an abstract floating-point value having sign `zSign', exponent
1687 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
1688 | and returns the proper extended double-precision floating-point value
1689 | corresponding to the abstract input.  This routine is just like
1690 | `roundAndPackFloatx80' except that the input significand does not have to be
1691 | normalized.
1692 *----------------------------------------------------------------------------*/
1693 
1694 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
1695                                               flag zSign, int32_t zExp,
1696                                               uint64_t zSig0, uint64_t zSig1,
1697                                               float_status *status)
1698 {
1699     int8_t shiftCount;
1700 
1701     if ( zSig0 == 0 ) {
1702         zSig0 = zSig1;
1703         zSig1 = 0;
1704         zExp -= 64;
1705     }
1706     shiftCount = countLeadingZeros64( zSig0 );
1707     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1708     zExp -= shiftCount;
1709     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
1710                                 zSig0, zSig1, status);
1711 
1712 }
1713 
1714 /*----------------------------------------------------------------------------
1715 | Returns the least-significant 64 fraction bits of the quadruple-precision
1716 | floating-point value `a'.
1717 *----------------------------------------------------------------------------*/
1718 
1719 static inline uint64_t extractFloat128Frac1( float128 a )
1720 {
1721 
1722     return a.low;
1723 
1724 }
1725 
1726 /*----------------------------------------------------------------------------
1727 | Returns the most-significant 48 fraction bits of the quadruple-precision
1728 | floating-point value `a'.
1729 *----------------------------------------------------------------------------*/
1730 
1731 static inline uint64_t extractFloat128Frac0( float128 a )
1732 {
1733 
1734     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
1735 
1736 }
1737 
1738 /*----------------------------------------------------------------------------
1739 | Returns the exponent bits of the quadruple-precision floating-point value
1740 | `a'.
1741 *----------------------------------------------------------------------------*/
1742 
1743 static inline int32_t extractFloat128Exp( float128 a )
1744 {
1745 
1746     return ( a.high>>48 ) & 0x7FFF;
1747 
1748 }
1749 
1750 /*----------------------------------------------------------------------------
1751 | Returns the sign bit of the quadruple-precision floating-point value `a'.
1752 *----------------------------------------------------------------------------*/
1753 
1754 static inline flag extractFloat128Sign( float128 a )
1755 {
1756 
1757     return a.high>>63;
1758 
1759 }
1760 
/*----------------------------------------------------------------------------
| Normalizes the subnormal quadruple-precision floating-point value whose
| denormalized significand is the concatenation of `aSig0' and `aSig1'.
| Three results are written through the pointer arguments: the normalized
| exponent goes to `*zExpPtr', the most significant 49 bits of the normalized
| significand go to `*zSig0Ptr', and the least significant 64 bits go to
| `*zSig1Ptr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t shift;

    /* Non-empty high word: an ordinary 128-bit left shift is enough. */
    if (aSig0 != 0) {
        shift = countLeadingZeros64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shift;
        return;
    }

    /* Only the low word carries bits.  `shift' is measured relative to the
     * high word, so a negative value means the leading bit of `aSig1' sits
     * above the target position and the word must be split across both
     * output words.
     */
    shift = countLeadingZeros64(aSig1) - 15;
    if (shift >= 0) {
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    } else {
        *zSig0Ptr = aSig1 >> (-shift);
        *zSig1Ptr = aSig1 << (shift & 63);
    }
    *zExpPtr = -shift - 63;
}
1801 
1802 /*----------------------------------------------------------------------------
1803 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1804 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1805 | floating-point value, returning the result.  After being shifted into the
1806 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1807 | added together to form the most significant 32 bits of the result.  This
1808 | means that any integer portion of `zSig0' will be added into the exponent.
1809 | Since a properly normalized significand will have an integer portion equal
1810 | to 1, the `zExp' input should be 1 less than the desired result exponent
1811 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1812 | significand.
1813 *----------------------------------------------------------------------------*/
1814 
1815 static inline float128
1816  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
1817 {
1818     float128 z;
1819 
1820     z.low = zSig1;
1821     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1822     return z;
1823 
1824 }
1825 
1826 /*----------------------------------------------------------------------------
1827 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1828 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1829 | and `zSig2', and returns the proper quadruple-precision floating-point value
1830 | corresponding to the abstract input.  Ordinarily, the abstract value is
1831 | simply rounded and packed into the quadruple-precision format, with the
1832 | inexact exception raised if the abstract input cannot be represented
1833 | exactly.  However, if the abstract value is too large, the overflow and
1834 | inexact exceptions are raised and an infinity or maximal finite value is
1835 | returned.  If the abstract value is too small, the input value is rounded to
1836 | a subnormal number, and the underflow and inexact exceptions are raised if
1837 | the abstract input cannot be represented exactly as a subnormal quadruple-
1838 | precision floating-point number.
1839 |     The input significand must be normalized or smaller.  If the input
1840 | significand is not normalized, `zExp' must be 0; in that case, the result
1841 | returned is a subnormal number, and it must not require rounding.  In the
1842 | usual case that the input significand is normalized, `zExp' must be 1 less
1843 | than the ``true'' floating-point exponent.  The handling of underflow and
1844 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1845 *----------------------------------------------------------------------------*/
1846 
1847 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
1848                                      uint64_t zSig0, uint64_t zSig1,
1849                                      uint64_t zSig2, float_status *status)
1850 {
1851     int8_t roundingMode;
1852     flag roundNearestEven, increment, isTiny;
1853 
1854     roundingMode = status->float_rounding_mode;
1855     roundNearestEven = ( roundingMode == float_round_nearest_even );
1856     switch (roundingMode) {
1857     case float_round_nearest_even:
1858     case float_round_ties_away:
1859         increment = ((int64_t)zSig2 < 0);
1860         break;
1861     case float_round_to_zero:
1862         increment = 0;
1863         break;
1864     case float_round_up:
1865         increment = !zSign && zSig2;
1866         break;
1867     case float_round_down:
1868         increment = zSign && zSig2;
1869         break;
1870     case float_round_to_odd:
1871         increment = !(zSig1 & 0x1) && zSig2;
1872         break;
1873     default:
1874         abort();
1875     }
1876     if ( 0x7FFD <= (uint32_t) zExp ) {
1877         if (    ( 0x7FFD < zExp )
1878              || (    ( zExp == 0x7FFD )
1879                   && eq128(
1880                          LIT64( 0x0001FFFFFFFFFFFF ),
1881                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1882                          zSig0,
1883                          zSig1
1884                      )
1885                   && increment
1886                 )
1887            ) {
1888             float_raise(float_flag_overflow | float_flag_inexact, status);
1889             if (    ( roundingMode == float_round_to_zero )
1890                  || ( zSign && ( roundingMode == float_round_up ) )
1891                  || ( ! zSign && ( roundingMode == float_round_down ) )
1892                  || (roundingMode == float_round_to_odd)
1893                ) {
1894                 return
1895                     packFloat128(
1896                         zSign,
1897                         0x7FFE,
1898                         LIT64( 0x0000FFFFFFFFFFFF ),
1899                         LIT64( 0xFFFFFFFFFFFFFFFF )
1900                     );
1901             }
1902             return packFloat128( zSign, 0x7FFF, 0, 0 );
1903         }
1904         if ( zExp < 0 ) {
1905             if (status->flush_to_zero) {
1906                 float_raise(float_flag_output_denormal, status);
1907                 return packFloat128(zSign, 0, 0, 0);
1908             }
1909             isTiny =
1910                    (status->float_detect_tininess
1911                     == float_tininess_before_rounding)
1912                 || ( zExp < -1 )
1913                 || ! increment
1914                 || lt128(
1915                        zSig0,
1916                        zSig1,
1917                        LIT64( 0x0001FFFFFFFFFFFF ),
1918                        LIT64( 0xFFFFFFFFFFFFFFFF )
1919                    );
1920             shift128ExtraRightJamming(
1921                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1922             zExp = 0;
1923             if (isTiny && zSig2) {
1924                 float_raise(float_flag_underflow, status);
1925             }
1926             switch (roundingMode) {
1927             case float_round_nearest_even:
1928             case float_round_ties_away:
1929                 increment = ((int64_t)zSig2 < 0);
1930                 break;
1931             case float_round_to_zero:
1932                 increment = 0;
1933                 break;
1934             case float_round_up:
1935                 increment = !zSign && zSig2;
1936                 break;
1937             case float_round_down:
1938                 increment = zSign && zSig2;
1939                 break;
1940             case float_round_to_odd:
1941                 increment = !(zSig1 & 0x1) && zSig2;
1942                 break;
1943             default:
1944                 abort();
1945             }
1946         }
1947     }
1948     if (zSig2) {
1949         status->float_exception_flags |= float_flag_inexact;
1950     }
1951     if ( increment ) {
1952         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1953         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1954     }
1955     else {
1956         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1957     }
1958     return packFloat128( zSign, zExp, zSig0, zSig1 );
1959 
1960 }
1961 
1962 /*----------------------------------------------------------------------------
1963 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1964 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1965 | returns the proper quadruple-precision floating-point value corresponding
1966 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1967 | except that the input significand has fewer bits and does not have to be
1968 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1969 | point exponent.
1970 *----------------------------------------------------------------------------*/
1971 
1972 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
1973                                               uint64_t zSig0, uint64_t zSig1,
1974                                               float_status *status)
1975 {
1976     int8_t shiftCount;
1977     uint64_t zSig2;
1978 
1979     if ( zSig0 == 0 ) {
1980         zSig0 = zSig1;
1981         zSig1 = 0;
1982         zExp -= 64;
1983     }
1984     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1985     if ( 0 <= shiftCount ) {
1986         zSig2 = 0;
1987         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1988     }
1989     else {
1990         shift128ExtraRightJamming(
1991             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1992     }
1993     zExp -= shiftCount;
1994     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
1995 
1996 }
1997 
1998 /*----------------------------------------------------------------------------
1999 | Returns the result of converting the 32-bit two's complement integer `a'
2000 | to the single-precision floating-point format.  The conversion is performed
2001 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2002 *----------------------------------------------------------------------------*/
2003 
2004 float32 int32_to_float32(int32_t a, float_status *status)
2005 {
2006     flag zSign;
2007 
2008     if ( a == 0 ) return float32_zero;
2009     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
2010     zSign = ( a < 0 );
2011     return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
2012 }
2013 
2014 /*----------------------------------------------------------------------------
2015 | Returns the result of converting the 32-bit two's complement integer `a'
2016 | to the double-precision floating-point format.  The conversion is performed
2017 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2018 *----------------------------------------------------------------------------*/
2019 
2020 float64 int32_to_float64(int32_t a, float_status *status)
2021 {
2022     flag zSign;
2023     uint32_t absA;
2024     int8_t shiftCount;
2025     uint64_t zSig;
2026 
2027     if ( a == 0 ) return float64_zero;
2028     zSign = ( a < 0 );
2029     absA = zSign ? - a : a;
2030     shiftCount = countLeadingZeros32( absA ) + 21;
2031     zSig = absA;
2032     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
2033 
2034 }
2035 
2036 /*----------------------------------------------------------------------------
2037 | Returns the result of converting the 32-bit two's complement integer `a'
2038 | to the extended double-precision floating-point format.  The conversion
2039 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2040 | Arithmetic.
2041 *----------------------------------------------------------------------------*/
2042 
2043 floatx80 int32_to_floatx80(int32_t a, float_status *status)
2044 {
2045     flag zSign;
2046     uint32_t absA;
2047     int8_t shiftCount;
2048     uint64_t zSig;
2049 
2050     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
2051     zSign = ( a < 0 );
2052     absA = zSign ? - a : a;
2053     shiftCount = countLeadingZeros32( absA ) + 32;
2054     zSig = absA;
2055     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
2056 
2057 }
2058 
2059 /*----------------------------------------------------------------------------
2060 | Returns the result of converting the 32-bit two's complement integer `a' to
2061 | the quadruple-precision floating-point format.  The conversion is performed
2062 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2063 *----------------------------------------------------------------------------*/
2064 
2065 float128 int32_to_float128(int32_t a, float_status *status)
2066 {
2067     flag zSign;
2068     uint32_t absA;
2069     int8_t shiftCount;
2070     uint64_t zSig0;
2071 
2072     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
2073     zSign = ( a < 0 );
2074     absA = zSign ? - a : a;
2075     shiftCount = countLeadingZeros32( absA ) + 17;
2076     zSig0 = absA;
2077     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
2078 
2079 }
2080 
2081 /*----------------------------------------------------------------------------
2082 | Returns the result of converting the 64-bit two's complement integer `a'
2083 | to the single-precision floating-point format.  The conversion is performed
2084 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2085 *----------------------------------------------------------------------------*/
2086 
2087 float32 int64_to_float32(int64_t a, float_status *status)
2088 {
2089     flag zSign;
2090     uint64_t absA;
2091     int8_t shiftCount;
2092 
2093     if ( a == 0 ) return float32_zero;
2094     zSign = ( a < 0 );
2095     absA = zSign ? - a : a;
2096     shiftCount = countLeadingZeros64( absA ) - 40;
2097     if ( 0 <= shiftCount ) {
2098         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
2099     }
2100     else {
2101         shiftCount += 7;
2102         if ( shiftCount < 0 ) {
2103             shift64RightJamming( absA, - shiftCount, &absA );
2104         }
2105         else {
2106             absA <<= shiftCount;
2107         }
2108         return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
2109     }
2110 
2111 }
2112 
2113 /*----------------------------------------------------------------------------
2114 | Returns the result of converting the 64-bit two's complement integer `a'
2115 | to the double-precision floating-point format.  The conversion is performed
2116 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2117 *----------------------------------------------------------------------------*/
2118 
2119 float64 int64_to_float64(int64_t a, float_status *status)
2120 {
2121     flag zSign;
2122 
2123     if ( a == 0 ) return float64_zero;
2124     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
2125         return packFloat64( 1, 0x43E, 0 );
2126     }
2127     zSign = ( a < 0 );
2128     return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
2129 }
2130 
2131 /*----------------------------------------------------------------------------
2132 | Returns the result of converting the 64-bit two's complement integer `a'
2133 | to the extended double-precision floating-point format.  The conversion
2134 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2135 | Arithmetic.
2136 *----------------------------------------------------------------------------*/
2137 
2138 floatx80 int64_to_floatx80(int64_t a, float_status *status)
2139 {
2140     flag zSign;
2141     uint64_t absA;
2142     int8_t shiftCount;
2143 
2144     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
2145     zSign = ( a < 0 );
2146     absA = zSign ? - a : a;
2147     shiftCount = countLeadingZeros64( absA );
2148     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
2149 
2150 }
2151 
2152 /*----------------------------------------------------------------------------
2153 | Returns the result of converting the 64-bit two's complement integer `a' to
2154 | the quadruple-precision floating-point format.  The conversion is performed
2155 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2156 *----------------------------------------------------------------------------*/
2157 
2158 float128 int64_to_float128(int64_t a, float_status *status)
2159 {
2160     flag zSign;
2161     uint64_t absA;
2162     int8_t shiftCount;
2163     int32_t zExp;
2164     uint64_t zSig0, zSig1;
2165 
2166     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
2167     zSign = ( a < 0 );
2168     absA = zSign ? - a : a;
2169     shiftCount = countLeadingZeros64( absA ) + 49;
2170     zExp = 0x406E - shiftCount;
2171     if ( 64 <= shiftCount ) {
2172         zSig1 = 0;
2173         zSig0 = absA;
2174         shiftCount -= 64;
2175     }
2176     else {
2177         zSig1 = absA;
2178         zSig0 = 0;
2179     }
2180     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2181     return packFloat128( zSign, zExp, zSig0, zSig1 );
2182 
2183 }
2184 
2185 /*----------------------------------------------------------------------------
2186 | Returns the result of converting the 64-bit unsigned integer `a'
2187 | to the single-precision floating-point format.  The conversion is performed
2188 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2189 *----------------------------------------------------------------------------*/
2190 
2191 float32 uint64_to_float32(uint64_t a, float_status *status)
2192 {
2193     int shiftcount;
2194 
2195     if (a == 0) {
2196         return float32_zero;
2197     }
2198 
2199     /* Determine (left) shift needed to put first set bit into bit posn 23
2200      * (since packFloat32() expects the binary point between bits 23 and 22);
2201      * this is the fast case for smallish numbers.
2202      */
2203     shiftcount = countLeadingZeros64(a) - 40;
2204     if (shiftcount >= 0) {
2205         return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
2206     }
2207     /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
2208      * expects the binary point between bits 30 and 29, hence the + 7.
2209      */
2210     shiftcount += 7;
2211     if (shiftcount < 0) {
2212         shift64RightJamming(a, -shiftcount, &a);
2213     } else {
2214         a <<= shiftcount;
2215     }
2216 
2217     return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
2218 }
2219 
2220 /*----------------------------------------------------------------------------
2221 | Returns the result of converting the 64-bit unsigned integer `a'
2222 | to the double-precision floating-point format.  The conversion is performed
2223 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2224 *----------------------------------------------------------------------------*/
2225 
2226 float64 uint64_to_float64(uint64_t a, float_status *status)
2227 {
2228     int exp = 0x43C;
2229     int shiftcount;
2230 
2231     if (a == 0) {
2232         return float64_zero;
2233     }
2234 
2235     shiftcount = countLeadingZeros64(a) - 1;
2236     if (shiftcount < 0) {
2237         shift64RightJamming(a, -shiftcount, &a);
2238     } else {
2239         a <<= shiftcount;
2240     }
2241     return roundAndPackFloat64(0, exp - shiftcount, a, status);
2242 }
2243 
2244 /*----------------------------------------------------------------------------
2245 | Returns the result of converting the 64-bit unsigned integer `a'
2246 | to the quadruple-precision floating-point format.  The conversion is performed
2247 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2248 *----------------------------------------------------------------------------*/
2249 
2250 float128 uint64_to_float128(uint64_t a, float_status *status)
2251 {
2252     if (a == 0) {
2253         return float128_zero;
2254     }
2255     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
2256 }
2257 
2258 /*----------------------------------------------------------------------------
2259 | Returns the result of converting the single-precision floating-point value
2260 | `a' to the 32-bit two's complement integer format.  The conversion is
2261 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2262 | Arithmetic---which means in particular that the conversion is rounded
2263 | according to the current rounding mode.  If `a' is a NaN, the largest
2264 | positive integer is returned.  Otherwise, if the conversion overflows, the
2265 | largest integer with the same sign as `a' is returned.
2266 *----------------------------------------------------------------------------*/
2267 
2268 int32_t float32_to_int32(float32 a, float_status *status)
2269 {
2270     flag aSign;
2271     int aExp;
2272     int shiftCount;
2273     uint32_t aSig;
2274     uint64_t aSig64;
2275 
2276     a = float32_squash_input_denormal(a, status);
2277     aSig = extractFloat32Frac( a );
2278     aExp = extractFloat32Exp( a );
2279     aSign = extractFloat32Sign( a );
2280     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
2281     if ( aExp ) aSig |= 0x00800000;
2282     shiftCount = 0xAF - aExp;
2283     aSig64 = aSig;
2284     aSig64 <<= 32;
2285     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
2286     return roundAndPackInt32(aSign, aSig64, status);
2287 
2288 }
2289 
2290 /*----------------------------------------------------------------------------
2291 | Returns the result of converting the single-precision floating-point value
2292 | `a' to the 32-bit two's complement integer format.  The conversion is
2293 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2294 | Arithmetic, except that the conversion is always rounded toward zero.
2295 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2296 | the conversion overflows, the largest integer with the same sign as `a' is
2297 | returned.
2298 *----------------------------------------------------------------------------*/
2299 
2300 int32_t float32_to_int32_round_to_zero(float32 a, float_status *status)
2301 {
2302     flag aSign;
2303     int aExp;
2304     int shiftCount;
2305     uint32_t aSig;
2306     int32_t z;
2307     a = float32_squash_input_denormal(a, status);
2308 
2309     aSig = extractFloat32Frac( a );
2310     aExp = extractFloat32Exp( a );
2311     aSign = extractFloat32Sign( a );
2312     shiftCount = aExp - 0x9E;
2313     if ( 0 <= shiftCount ) {
2314         if ( float32_val(a) != 0xCF000000 ) {
2315             float_raise(float_flag_invalid, status);
2316             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
2317         }
2318         return (int32_t) 0x80000000;
2319     }
2320     else if ( aExp <= 0x7E ) {
2321         if (aExp | aSig) {
2322             status->float_exception_flags |= float_flag_inexact;
2323         }
2324         return 0;
2325     }
2326     aSig = ( aSig | 0x00800000 )<<8;
2327     z = aSig>>( - shiftCount );
2328     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
2329         status->float_exception_flags |= float_flag_inexact;
2330     }
2331     if ( aSign ) z = - z;
2332     return z;
2333 
2334 }
2335 
2336 /*----------------------------------------------------------------------------
2337 | Returns the result of converting the single-precision floating-point value
2338 | `a' to the 16-bit two's complement integer format.  The conversion is
2339 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2340 | Arithmetic, except that the conversion is always rounded toward zero.
2341 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2342 | the conversion overflows, the largest integer with the same sign as `a' is
2343 | returned.
2344 *----------------------------------------------------------------------------*/
2345 
2346 int16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
2347 {
2348     flag aSign;
2349     int aExp;
2350     int shiftCount;
2351     uint32_t aSig;
2352     int32_t z;
2353 
2354     aSig = extractFloat32Frac( a );
2355     aExp = extractFloat32Exp( a );
2356     aSign = extractFloat32Sign( a );
2357     shiftCount = aExp - 0x8E;
2358     if ( 0 <= shiftCount ) {
2359         if ( float32_val(a) != 0xC7000000 ) {
2360             float_raise(float_flag_invalid, status);
2361             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
2362                 return 0x7FFF;
2363             }
2364         }
2365         return (int32_t) 0xffff8000;
2366     }
2367     else if ( aExp <= 0x7E ) {
2368         if ( aExp | aSig ) {
2369             status->float_exception_flags |= float_flag_inexact;
2370         }
2371         return 0;
2372     }
2373     shiftCount -= 0x10;
2374     aSig = ( aSig | 0x00800000 )<<8;
2375     z = aSig>>( - shiftCount );
2376     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
2377         status->float_exception_flags |= float_flag_inexact;
2378     }
2379     if ( aSign ) {
2380         z = - z;
2381     }
2382     return z;
2383 
2384 }
2385 
2386 /*----------------------------------------------------------------------------
2387 | Returns the result of converting the single-precision floating-point value
2388 | `a' to the 64-bit two's complement integer format.  The conversion is
2389 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2390 | Arithmetic---which means in particular that the conversion is rounded
2391 | according to the current rounding mode.  If `a' is a NaN, the largest
2392 | positive integer is returned.  Otherwise, if the conversion overflows, the
2393 | largest integer with the same sign as `a' is returned.
2394 *----------------------------------------------------------------------------*/
2395 
2396 int64_t float32_to_int64(float32 a, float_status *status)
2397 {
2398     flag aSign;
2399     int aExp;
2400     int shiftCount;
2401     uint32_t aSig;
2402     uint64_t aSig64, aSigExtra;
2403     a = float32_squash_input_denormal(a, status);
2404 
2405     aSig = extractFloat32Frac( a );
2406     aExp = extractFloat32Exp( a );
2407     aSign = extractFloat32Sign( a );
2408     shiftCount = 0xBE - aExp;
2409     if ( shiftCount < 0 ) {
2410         float_raise(float_flag_invalid, status);
2411         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
2412             return LIT64( 0x7FFFFFFFFFFFFFFF );
2413         }
2414         return (int64_t) LIT64( 0x8000000000000000 );
2415     }
2416     if ( aExp ) aSig |= 0x00800000;
2417     aSig64 = aSig;
2418     aSig64 <<= 40;
2419     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
2420     return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
2421 
2422 }
2423 
2424 /*----------------------------------------------------------------------------
2425 | Returns the result of converting the single-precision floating-point value
2426 | `a' to the 64-bit unsigned integer format.  The conversion is
2427 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2428 | Arithmetic---which means in particular that the conversion is rounded
2429 | according to the current rounding mode.  If `a' is a NaN, the largest
2430 | unsigned integer is returned.  Otherwise, if the conversion overflows, the
2431 | largest unsigned integer is returned.  If the 'a' is negative, the result
2432 | is rounded and zero is returned; values that do not round to zero will
2433 | raise the inexact exception flag.
2434 *----------------------------------------------------------------------------*/
2435 
2436 uint64_t float32_to_uint64(float32 a, float_status *status)
2437 {
2438     flag aSign;
2439     int aExp;
2440     int shiftCount;
2441     uint32_t aSig;
2442     uint64_t aSig64, aSigExtra;
2443     a = float32_squash_input_denormal(a, status);
2444 
2445     aSig = extractFloat32Frac(a);
2446     aExp = extractFloat32Exp(a);
2447     aSign = extractFloat32Sign(a);
2448     if ((aSign) && (aExp > 126)) {
2449         float_raise(float_flag_invalid, status);
2450         if (float32_is_any_nan(a)) {
2451             return LIT64(0xFFFFFFFFFFFFFFFF);
2452         } else {
2453             return 0;
2454         }
2455     }
2456     shiftCount = 0xBE - aExp;
2457     if (aExp) {
2458         aSig |= 0x00800000;
2459     }
2460     if (shiftCount < 0) {
2461         float_raise(float_flag_invalid, status);
2462         return LIT64(0xFFFFFFFFFFFFFFFF);
2463     }
2464 
2465     aSig64 = aSig;
2466     aSig64 <<= 40;
2467     shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
2468     return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
2469 }
2470 
2471 /*----------------------------------------------------------------------------
2472 | Returns the result of converting the single-precision floating-point value
2473 | `a' to the 64-bit unsigned integer format.  The conversion is
2474 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2475 | Arithmetic, except that the conversion is always rounded toward zero.  If
2476 | `a' is a NaN, the largest unsigned integer is returned.  Otherwise, if the
2477 | conversion overflows, the largest unsigned integer is returned.  If the
2478 | 'a' is negative, the result is rounded and zero is returned; values that do
2479 | not round to zero will raise the inexact flag.
2480 *----------------------------------------------------------------------------*/
2481 
2482 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status)
2483 {
2484     signed char current_rounding_mode = status->float_rounding_mode;
2485     set_float_rounding_mode(float_round_to_zero, status);
2486     int64_t v = float32_to_uint64(a, status);
2487     set_float_rounding_mode(current_rounding_mode, status);
2488     return v;
2489 }
2490 
2491 /*----------------------------------------------------------------------------
2492 | Returns the result of converting the single-precision floating-point value
2493 | `a' to the 64-bit two's complement integer format.  The conversion is
2494 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2495 | Arithmetic, except that the conversion is always rounded toward zero.  If
2496 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
2497 | conversion overflows, the largest integer with the same sign as `a' is
2498 | returned.
2499 *----------------------------------------------------------------------------*/
2500 
2501 int64_t float32_to_int64_round_to_zero(float32 a, float_status *status)
2502 {
2503     flag aSign;
2504     int aExp;
2505     int shiftCount;
2506     uint32_t aSig;
2507     uint64_t aSig64;
2508     int64_t z;
2509     a = float32_squash_input_denormal(a, status);
2510 
2511     aSig = extractFloat32Frac( a );
2512     aExp = extractFloat32Exp( a );
2513     aSign = extractFloat32Sign( a );
2514     shiftCount = aExp - 0xBE;
2515     if ( 0 <= shiftCount ) {
2516         if ( float32_val(a) != 0xDF000000 ) {
2517             float_raise(float_flag_invalid, status);
2518             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
2519                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2520             }
2521         }
2522         return (int64_t) LIT64( 0x8000000000000000 );
2523     }
2524     else if ( aExp <= 0x7E ) {
2525         if (aExp | aSig) {
2526             status->float_exception_flags |= float_flag_inexact;
2527         }
2528         return 0;
2529     }
2530     aSig64 = aSig | 0x00800000;
2531     aSig64 <<= 40;
2532     z = aSig64>>( - shiftCount );
2533     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
2534         status->float_exception_flags |= float_flag_inexact;
2535     }
2536     if ( aSign ) z = - z;
2537     return z;
2538 
2539 }
2540 
2541 /*----------------------------------------------------------------------------
2542 | Returns the result of converting the single-precision floating-point value
2543 | `a' to the double-precision floating-point format.  The conversion is
2544 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2545 | Arithmetic.
2546 *----------------------------------------------------------------------------*/
2547 
2548 float64 float32_to_float64(float32 a, float_status *status)
2549 {
2550     flag aSign;
2551     int aExp;
2552     uint32_t aSig;
2553     a = float32_squash_input_denormal(a, status);
2554 
2555     aSig = extractFloat32Frac( a );
2556     aExp = extractFloat32Exp( a );
2557     aSign = extractFloat32Sign( a );
2558     if ( aExp == 0xFF ) {
2559         if (aSig) {
2560             return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
2561         }
2562         return packFloat64( aSign, 0x7FF, 0 );
2563     }
2564     if ( aExp == 0 ) {
2565         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
2566         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2567         --aExp;
2568     }
2569     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
2570 
2571 }
2572 
2573 /*----------------------------------------------------------------------------
2574 | Returns the result of converting the single-precision floating-point value
2575 | `a' to the extended double-precision floating-point format.  The conversion
2576 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2577 | Arithmetic.
2578 *----------------------------------------------------------------------------*/
2579 
2580 floatx80 float32_to_floatx80(float32 a, float_status *status)
2581 {
2582     flag aSign;
2583     int aExp;
2584     uint32_t aSig;
2585 
2586     a = float32_squash_input_denormal(a, status);
2587     aSig = extractFloat32Frac( a );
2588     aExp = extractFloat32Exp( a );
2589     aSign = extractFloat32Sign( a );
2590     if ( aExp == 0xFF ) {
2591         if (aSig) {
2592             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
2593         }
2594         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2595     }
2596     if ( aExp == 0 ) {
2597         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2598         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2599     }
2600     aSig |= 0x00800000;
2601     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
2602 
2603 }
2604 
2605 /*----------------------------------------------------------------------------
2606 | Returns the result of converting the single-precision floating-point value
2607 | `a' to the double-precision floating-point format.  The conversion is
2608 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2609 | Arithmetic.
2610 *----------------------------------------------------------------------------*/
2611 
2612 float128 float32_to_float128(float32 a, float_status *status)
2613 {
2614     flag aSign;
2615     int aExp;
2616     uint32_t aSig;
2617 
2618     a = float32_squash_input_denormal(a, status);
2619     aSig = extractFloat32Frac( a );
2620     aExp = extractFloat32Exp( a );
2621     aSign = extractFloat32Sign( a );
2622     if ( aExp == 0xFF ) {
2623         if (aSig) {
2624             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
2625         }
2626         return packFloat128( aSign, 0x7FFF, 0, 0 );
2627     }
2628     if ( aExp == 0 ) {
2629         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2630         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2631         --aExp;
2632     }
2633     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
2634 
2635 }
2636 
2637 /*----------------------------------------------------------------------------
2638 | Rounds the single-precision floating-point value `a' to an integer, and
2639 | returns the result as a single-precision floating-point value.  The
2640 | operation is performed according to the IEC/IEEE Standard for Binary
2641 | Floating-Point Arithmetic.
2642 *----------------------------------------------------------------------------*/
2643 
2644 float32 float32_round_to_int(float32 a, float_status *status)
2645 {
2646     flag aSign;
2647     int aExp;
2648     uint32_t lastBitMask, roundBitsMask;
2649     uint32_t z;
2650     a = float32_squash_input_denormal(a, status);
2651 
2652     aExp = extractFloat32Exp( a );
2653     if ( 0x96 <= aExp ) {
2654         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
2655             return propagateFloat32NaN(a, a, status);
2656         }
2657         return a;
2658     }
2659     if ( aExp <= 0x7E ) {
2660         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
2661         status->float_exception_flags |= float_flag_inexact;
2662         aSign = extractFloat32Sign( a );
2663         switch (status->float_rounding_mode) {
2664          case float_round_nearest_even:
2665             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
2666                 return packFloat32( aSign, 0x7F, 0 );
2667             }
2668             break;
2669         case float_round_ties_away:
2670             if (aExp == 0x7E) {
2671                 return packFloat32(aSign, 0x7F, 0);
2672             }
2673             break;
2674          case float_round_down:
2675             return make_float32(aSign ? 0xBF800000 : 0);
2676          case float_round_up:
2677             return make_float32(aSign ? 0x80000000 : 0x3F800000);
2678         }
2679         return packFloat32( aSign, 0, 0 );
2680     }
2681     lastBitMask = 1;
2682     lastBitMask <<= 0x96 - aExp;
2683     roundBitsMask = lastBitMask - 1;
2684     z = float32_val(a);
2685     switch (status->float_rounding_mode) {
2686     case float_round_nearest_even:
2687         z += lastBitMask>>1;
2688         if ((z & roundBitsMask) == 0) {
2689             z &= ~lastBitMask;
2690         }
2691         break;
2692     case float_round_ties_away:
2693         z += lastBitMask >> 1;
2694         break;
2695     case float_round_to_zero:
2696         break;
2697     case float_round_up:
2698         if (!extractFloat32Sign(make_float32(z))) {
2699             z += roundBitsMask;
2700         }
2701         break;
2702     case float_round_down:
2703         if (extractFloat32Sign(make_float32(z))) {
2704             z += roundBitsMask;
2705         }
2706         break;
2707     default:
2708         abort();
2709     }
2710     z &= ~ roundBitsMask;
2711     if (z != float32_val(a)) {
2712         status->float_exception_flags |= float_flag_inexact;
2713     }
2714     return make_float32(z);
2715 
2716 }
2717 
2718 /*----------------------------------------------------------------------------
2719 | Returns the remainder of the single-precision floating-point value `a'
2720 | with respect to the corresponding value `b'.  The operation is performed
2721 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2722 *----------------------------------------------------------------------------*/
2723 
2724 float32 float32_rem(float32 a, float32 b, float_status *status)
2725 {
2726     flag aSign, zSign;
2727     int aExp, bExp, expDiff;
2728     uint32_t aSig, bSig;
2729     uint32_t q;
2730     uint64_t aSig64, bSig64, q64;
2731     uint32_t alternateASig;
2732     int32_t sigMean;
2733     a = float32_squash_input_denormal(a, status);
2734     b = float32_squash_input_denormal(b, status);
2735
2736     aSig = extractFloat32Frac( a );
2737     aExp = extractFloat32Exp( a );
2738     aSign = extractFloat32Sign( a );
2739     bSig = extractFloat32Frac( b );
2740     bExp = extractFloat32Exp( b );
         /* `a' is an infinity or a NaN: NaNs (in either operand) are
          * propagated, an infinite dividend is an invalid operation.
          */
2741     if ( aExp == 0xFF ) {
2742         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2743             return propagateFloat32NaN(a, b, status);
2744         }
2745         float_raise(float_flag_invalid, status);
2746         return float32_default_nan(status);
2747     }
         /* `b' is a NaN (propagate) or an infinity (the remainder is `a'). */
2748     if ( bExp == 0xFF ) {
2749         if (bSig) {
2750             return propagateFloat32NaN(a, b, status);
2751         }
2752         return a;
2753     }
         /* A zero divisor is an invalid operation; a subnormal divisor is
          * normalized first.
          */
2754     if ( bExp == 0 ) {
2755         if ( bSig == 0 ) {
2756             float_raise(float_flag_invalid, status);
2757             return float32_default_nan(status);
2758         }
2759         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2760     }
         /* A zero dividend is returned unchanged; a subnormal one is
          * normalized first.
          */
2761     if ( aExp == 0 ) {
2762         if ( aSig == 0 ) return a;
2763         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2764     }
2765     expDiff = aExp - bExp;
         /* Make the implicit leading one explicit in both significands. */
2766     aSig |= 0x00800000;
2767     bSig |= 0x00800000;
         /* Small exponent difference: the quotient is formed with a single
          * 64-by-32-bit division.
          */
2768     if ( expDiff < 32 ) {
2769         aSig <<= 8;
2770         bSig <<= 8;
2771         if ( expDiff < 0 ) {
                 /* Exponent at least two below the divisor's: `a' is
                  * returned unchanged as the remainder.
                  */
2772             if ( expDiff < -1 ) return a;
2773             aSig >>= 1;
2774         }
2775         q = ( bSig <= aSig );
2776         if ( q ) aSig -= bSig;
2777         if ( 0 < expDiff ) {
2778             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2779             q >>= 32 - expDiff;
2780             bSig >>= 2;
2781             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2782         }
2783         else {
2784             aSig >>= 2;
2785             bSig >>= 2;
2786         }
2787     }
         /* Large exponent difference: reduce the dividend in steps of 62
          * exponent units using estimated 64-bit quotients (each estimate
          * is lowered by two before use), then finish with a last partial
          * step.
          */
2788     else {
2789         if ( bSig <= aSig ) aSig -= bSig;
2790         aSig64 = ( (uint64_t) aSig )<<40;
2791         bSig64 = ( (uint64_t) bSig )<<40;
2792         expDiff -= 64;
2793         while ( 0 < expDiff ) {
2794             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2795             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2796             aSig64 = - ( ( bSig * q64 )<<38 );
2797             expDiff -= 62;
2798         }
2799         expDiff += 64;
2800         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2801         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2802         q = q64>>( 64 - expDiff );
2803         bSig <<= 6;
2804         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2805     }
         /* Keep subtracting the divisor until the remainder turns negative,
          * remembering the last value before each subtraction in
          * alternateASig and counting the subtractions in q.
          */
2806     do {
2807         alternateASig = aSig;
2808         ++q;
2809         aSig -= bSig;
2810     } while ( 0 <= (int32_t) aSig );
         /* Choose between the negative remainder and the saved one before
          * it using the sign of their sum; when the sum is zero, the low
          * bit of q decides.
          */
2811     sigMean = aSig + alternateASig;
2812     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2813         aSig = alternateASig;
2814     }
         /* A negative remainder flips the result's sign relative to `a'. */
2815     zSign = ( (int32_t) aSig < 0 );
2816     if ( zSign ) aSig = - aSig;
2817     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
2818 }
2819 
2820 /*----------------------------------------------------------------------------
2821 | Returns the result of multiplying the single-precision floating-point values
2822 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2823 | multiplication.  The operation is performed according to the IEC/IEEE
2824 | Standard for Binary Floating-Point Arithmetic 754-2008.
2825 | The flags argument allows the caller to select negation of the
2826 | addend, the intermediate product, or the final result. (The difference
2827 | between this and having the caller do a separate negation is that negating
2828 | externally will flip the sign bit on NaNs.)
2829 *----------------------------------------------------------------------------*/
2830 
2831 float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
2832                        float_status *status)
2833 {
2834     flag aSign, bSign, cSign, zSign;
2835     int aExp, bExp, cExp, pExp, zExp, expDiff;
2836     uint32_t aSig, bSig, cSig;
2837     flag pInf, pZero, pSign;
2838     uint64_t pSig64, cSig64, zSig64;
2839     uint32_t pSig;
2840     int shiftcount;
2841     flag signflip, infzero;
2842
2843     a = float32_squash_input_denormal(a, status);
2844     b = float32_squash_input_denormal(b, status);
2845     c = float32_squash_input_denormal(c, status);
2846     aSig = extractFloat32Frac(a);
2847     aExp = extractFloat32Exp(a);
2848     aSign = extractFloat32Sign(a);
2849     bSig = extractFloat32Frac(b);
2850     bExp = extractFloat32Exp(b);
2851     bSign = extractFloat32Sign(b);
2852     cSig = extractFloat32Frac(c);
2853     cExp = extractFloat32Exp(c);
2854     cSign = extractFloat32Sign(c);
2855
         /* infzero: one multiplicand is a zero and the other an infinity. */
2856     infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2857                (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2858
2859     /* It is implementation-defined whether the cases of (0,inf,qnan)
2860      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2861      * they return if they do), so we have to hand this information
2862      * off to the target-specific pick-a-NaN routine.
2863      */
2864     if (((aExp == 0xff) && aSig) ||
2865         ((bExp == 0xff) && bSig) ||
2866         ((cExp == 0xff) && cSig)) {
2867         return propagateFloat32MulAddNaN(a, b, c, infzero, status);
2868     }
2869
         /* No NaN operand: zero times infinity is an invalid operation. */
2870     if (infzero) {
2871         float_raise(float_flag_invalid, status);
2872         return float32_default_nan(status);
2873     }
2874
2875     if (flags & float_muladd_negate_c) {
2876         cSign ^= 1;
2877     }
2878
         /* signflip is XORed into the sign of every result returned below. */
2879     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2880
2881     /* Work out the sign and type of the product */
2882     pSign = aSign ^ bSign;
2883     if (flags & float_muladd_negate_product) {
2884         pSign ^= 1;
2885     }
2886     pInf = (aExp == 0xff) || (bExp == 0xff);
2887     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2888
         /* c is an infinity (NaNs were dealt with above). */
2889     if (cExp == 0xff) {
2890         if (pInf && (pSign ^ cSign)) {
2891             /* addition of opposite-signed infinities => InvalidOperation */
2892             float_raise(float_flag_invalid, status);
2893             return float32_default_nan(status);
2894         }
2895         /* Otherwise generate an infinity of the same sign */
2896         return packFloat32(cSign ^ signflip, 0xff, 0);
2897     }
2898
         /* Infinite product added to a finite c. */
2899     if (pInf) {
2900         return packFloat32(pSign ^ signflip, 0xff, 0);
2901     }
2902
2903     if (pZero) {
2904         if (cExp == 0) {
2905             if (cSig == 0) {
2906                 /* Adding two exact zeroes */
2907                 if (pSign == cSign) {
2908                     zSign = pSign;
2909                 } else if (status->float_rounding_mode == float_round_down) {
2910                     zSign = 1;
2911                 } else {
2912                     zSign = 0;
2913                 }
2914                 return packFloat32(zSign ^ signflip, 0, 0);
2915             }
2916             /* Exact zero plus a denorm */
2917             if (status->flush_to_zero) {
2918                 float_raise(float_flag_output_denormal, status);
2919                 return packFloat32(cSign ^ signflip, 0, 0);
2920             }
2921         }
2922         /* Zero plus something non-zero : just return the something */
2923         if (flags & float_muladd_halve_result) {
2924             if (cExp == 0) {
2925                 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2926             }
2927             /* Subtract one to halve, and one again because roundAndPackFloat32
2928              * wants one less than the true exponent.
2929              */
2930             cExp -= 2;
2931             cSig = (cSig | 0x00800000) << 7;
2932             return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
2933         }
2934         return packFloat32(cSign ^ signflip, cExp, cSig);
2935     }
2936
         /* Both multiplicands are non-zero here; normalize any subnormal. */
2937     if (aExp == 0) {
2938         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2939     }
2940     if (bExp == 0) {
2941         normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2942     }
2943
2944     /* Calculate the actual result a * b + c */
2945
2946     /* Multiply first; this is easy. */
2947     /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2948      * because we want the true exponent, not the "one-less-than"
2949      * flavour that roundAndPackFloat32() takes.
2950      */
2951     pExp = aExp + bExp - 0x7e;
2952     aSig = (aSig | 0x00800000) << 7;
2953     bSig = (bSig | 0x00800000) << 8;
2954     pSig64 = (uint64_t)aSig * bSig;
         /* If bit 62 of the product is clear, shift left by one so that the
          * leading one lands there, and adjust the exponent to match.
          */
2955     if ((int64_t)(pSig64 << 1) >= 0) {
2956         pSig64 <<= 1;
2957         pExp--;
2958     }
2959
2960     zSign = pSign ^ signflip;
2961
2962     /* Now pSig64 is the significand of the multiply, with the explicit bit in
2963      * position 62.
2964      */
2965     if (cExp == 0) {
2966         if (!cSig) {
2967             /* Throw out the special case of c being an exact zero now */
2968             shift64RightJamming(pSig64, 32, &pSig64);
2969             pSig = pSig64;
2970             if (flags & float_muladd_halve_result) {
2971                 pExp--;
2972             }
2973             return roundAndPackFloat32(zSign, pExp - 1,
2974                                        pSig, status);
2975         }
2976         normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2977     }
2978
         /* Put c's significand in the same layout as the product: explicit
          * bit in position 62.
          */
2979     cSig64 = (uint64_t)cSig << (62 - 23);
2980     cSig64 |= LIT64(0x4000000000000000);
2981     expDiff = pExp - cExp;
2982
2983     if (pSign == cSign) {
2984         /* Addition */
2985         if (expDiff > 0) {
2986             /* scale c to match p */
2987             shift64RightJamming(cSig64, expDiff, &cSig64);
2988             zExp = pExp;
2989         } else if (expDiff < 0) {
2990             /* scale p to match c */
2991             shift64RightJamming(pSig64, -expDiff, &pSig64);
2992             zExp = cExp;
2993         } else {
2994             /* no scaling needed */
2995             zExp = cExp;
2996         }
2997         /* Add significands and make sure explicit bit ends up in posn 62 */
2998         zSig64 = pSig64 + cSig64;
2999         if ((int64_t)zSig64 < 0) {
3000             shift64RightJamming(zSig64, 1, &zSig64);
3001         } else {
3002             zExp--;
3003         }
3004     } else {
3005         /* Subtraction */
             /* The larger magnitude is the minuend; when that is c, the
              * result sign is flipped.
              */
3006         if (expDiff > 0) {
3007             shift64RightJamming(cSig64, expDiff, &cSig64);
3008             zSig64 = pSig64 - cSig64;
3009             zExp = pExp;
3010         } else if (expDiff < 0) {
3011             shift64RightJamming(pSig64, -expDiff, &pSig64);
3012             zSig64 = cSig64 - pSig64;
3013             zExp = cExp;
3014             zSign ^= 1;
3015         } else {
3016             zExp = pExp;
3017             if (cSig64 < pSig64) {
3018                 zSig64 = pSig64 - cSig64;
3019             } else if (pSig64 < cSig64) {
3020                 zSig64 = cSig64 - pSig64;
3021                 zSign ^= 1;
3022             } else {
3023                 /* Exact zero */
3024                 zSign = signflip;
3025                 if (status->float_rounding_mode == float_round_down) {
3026                     zSign ^= 1;
3027                 }
3028                 return packFloat32(zSign, 0, 0);
3029             }
3030         }
3031         --zExp;
3032         /* Normalize to put the explicit bit back into bit 62. */
3033         shiftcount = countLeadingZeros64(zSig64) - 1;
3034         zSig64 <<= shiftcount;
3035         zExp -= shiftcount;
3036     }
         /* Halving the result is a decrement of the exponent. */
3037     if (flags & float_muladd_halve_result) {
3038         zExp--;
3039     }
3040
         /* Bring the 64-bit significand down to the 32 bits that
          * roundAndPackFloat32() takes.
          */
3041     shift64RightJamming(zSig64, 32, &zSig64);
3042     return roundAndPackFloat32(zSign, zExp, zSig64, status);
3043 }
3044 
3045 
3046 /*----------------------------------------------------------------------------
3047 | Returns the square root of the single-precision floating-point value `a'.
3048 | The operation is performed according to the IEC/IEEE Standard for Binary
3049 | Floating-Point Arithmetic.
3050 *----------------------------------------------------------------------------*/
3051 
3052 float32 float32_sqrt(float32 a, float_status *status)
3053 {
3054     flag aSign;
3055     int aExp, zExp;
3056     uint32_t aSig, zSig;
3057     uint64_t rem, term;
3058     a = float32_squash_input_denormal(a, status);
3059
3060     aSig = extractFloat32Frac( a );
3061     aExp = extractFloat32Exp( a );
3062     aSign = extractFloat32Sign( a );
         /* NaNs are propagated, +infinity is returned unchanged and
          * -infinity is an invalid operation.
          */
3063     if ( aExp == 0xFF ) {
3064         if (aSig) {
3065             return propagateFloat32NaN(a, float32_zero, status);
3066         }
3067         if ( ! aSign ) return a;
3068         float_raise(float_flag_invalid, status);
3069         return float32_default_nan(status);
3070     }
         /* Negative operands are invalid, except -0 which is returned as is. */
3071     if ( aSign ) {
3072         if ( ( aExp | aSig ) == 0 ) return a;
3073         float_raise(float_flag_invalid, status);
3074         return float32_default_nan(status);
3075     }
3076     if ( aExp == 0 ) {
3077         if ( aSig == 0 ) return float32_zero;
3078         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3079     }
         /* Halve the unbiased exponent, then re-bias it with 0x7E. */
3080     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
3081     aSig = ( aSig | 0x00800000 )<<8;
3082     zSig = estimateSqrt32( aExp, aSig ) + 2;
         /* Only when the low seven bits of the estimate are small is the
          * exact remainder computed and the estimate corrected.
          * NOTE(review): presumably other estimates are far enough from a
          * rounding boundary for the estimator's error not to matter --
          * confirm against estimateSqrt32()'s error bound.
          */
3083     if ( ( zSig & 0x7F ) <= 5 ) {
3084         if ( zSig < 2 ) {
3085             zSig = 0x7FFFFFFF;
3086             goto roundAndPack;
3087         }
3088         aSig >>= aExp & 1;
3089         term = ( (uint64_t) zSig ) * zSig;
3090         rem = ( ( (uint64_t) aSig )<<32 ) - term;
             /* Step the root down until the remainder is non-negative;
              * lowering zSig by one raises the remainder by 2*zSig + 1
              * (with zSig already decremented).
              */
3091         while ( (int64_t) rem < 0 ) {
3092             --zSig;
3093             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
3094         }
             /* A non-zero remainder is recorded in the lowest bit. */
3095         zSig |= ( rem != 0 );
3096     }
3097     shift32RightJamming( zSig, 1, &zSig );
3098  roundAndPack:
3099     return roundAndPackFloat32(0, zExp, zSig, status);
3100
3101 }
3102 
3103 /*----------------------------------------------------------------------------
3104 | Returns the binary exponential of the single-precision floating-point value
3105 | `a'. The operation is performed according to the IEC/IEEE Standard for
3106 | Binary Floating-Point Arithmetic.
3107 |
3108 | Uses the following identities:
3109 |
3110 | 1. -------------------------------------------------------------------------
3111 |      x    x*ln(2)
3112 |     2  = e
3113 |
3114 | 2. -------------------------------------------------------------------------
3115 |                      2     3     4     5           n
3116 |      x        x     x     x     x     x           x
3117 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3118 |               1!    2!    3!    4!    5!          n!
3119 *----------------------------------------------------------------------------*/
3120 
3121 static const float64 float32_exp2_coefficients[15] =
3122 {
3123     const_float64( 0x3ff0000000000000ll ), /*  1 */
3124     const_float64( 0x3fe0000000000000ll ), /*  2 */
3125     const_float64( 0x3fc5555555555555ll ), /*  3 */
3126     const_float64( 0x3fa5555555555555ll ), /*  4 */
3127     const_float64( 0x3f81111111111111ll ), /*  5 */
3128     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
3129     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
3130     const_float64( 0x3efa01a01a01a01all ), /*  8 */
3131     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
3132     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3133     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3134     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3135     const_float64( 0x3de6124613a86d09ll ), /* 13 */
3136     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3137     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3138 };
3139 
3140 float32 float32_exp2(float32 a, float_status *status)
3141 {
3142     flag aSign;
3143     int aExp;
3144     uint32_t aSig;
3145     float64 r, x, xn;
3146     int i;
3147     a = float32_squash_input_denormal(a, status);
3148 
3149     aSig = extractFloat32Frac( a );
3150     aExp = extractFloat32Exp( a );
3151     aSign = extractFloat32Sign( a );
3152 
3153     if ( aExp == 0xFF) {
3154         if (aSig) {
3155             return propagateFloat32NaN(a, float32_zero, status);
3156         }
3157         return (aSign) ? float32_zero : a;
3158     }
3159     if (aExp == 0) {
3160         if (aSig == 0) return float32_one;
3161     }
3162 
3163     float_raise(float_flag_inexact, status);
3164 
3165     /* ******************************* */
3166     /* using float64 for approximation */
3167     /* ******************************* */
3168     x = float32_to_float64(a, status);
3169     x = float64_mul(x, float64_ln2, status);
3170 
3171     xn = x;
3172     r = float64_one;
3173     for (i = 0 ; i < 15 ; i++) {
3174         float64 f;
3175 
3176         f = float64_mul(xn, float32_exp2_coefficients[i], status);
3177         r = float64_add(r, f, status);
3178 
3179         xn = float64_mul(xn, x, status);
3180     }
3181 
3182     return float64_to_float32(r, status);
3183 }
3184 
3185 /*----------------------------------------------------------------------------
3186 | Returns the binary log of the single-precision floating-point value `a'.
3187 | The operation is performed according to the IEC/IEEE Standard for Binary
3188 | Floating-Point Arithmetic.
3189 *----------------------------------------------------------------------------*/
3190 float32 float32_log2(float32 a, float_status *status)
3191 {
3192     flag aSign, zSign;
3193     int aExp;
3194     uint32_t aSig, zSig, i;
3195 
3196     a = float32_squash_input_denormal(a, status);
3197     aSig = extractFloat32Frac( a );
3198     aExp = extractFloat32Exp( a );
3199     aSign = extractFloat32Sign( a );
3200 
3201     if ( aExp == 0 ) {
3202         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3203         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3204     }
3205     if ( aSign ) {
3206         float_raise(float_flag_invalid, status);
3207         return float32_default_nan(status);
3208     }
3209     if ( aExp == 0xFF ) {
3210         if (aSig) {
3211             return propagateFloat32NaN(a, float32_zero, status);
3212         }
3213         return a;
3214     }
3215 
3216     aExp -= 0x7F;
3217     aSig |= 0x00800000;
3218     zSign = aExp < 0;
3219     zSig = aExp << 23;
3220 
3221     for (i = 1 << 22; i > 0; i >>= 1) {
3222         aSig = ( (uint64_t)aSig * aSig ) >> 23;
3223         if ( aSig & 0x01000000 ) {
3224             aSig >>= 1;
3225             zSig |= i;
3226         }
3227     }
3228 
3229     if ( zSign )
3230         zSig = -zSig;
3231 
3232     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3233 }
3234 
3235 /*----------------------------------------------------------------------------
3236 | Returns 1 if the single-precision floating-point value `a' is equal to
3237 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3238 | raised if either operand is a NaN.  Otherwise, the comparison is performed
3239 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3240 *----------------------------------------------------------------------------*/
3241 
3242 int float32_eq(float32 a, float32 b, float_status *status)
3243 {
3244     uint32_t av, bv;
3245     a = float32_squash_input_denormal(a, status);
3246     b = float32_squash_input_denormal(b, status);
3247 
3248     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3249          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3250        ) {
3251         float_raise(float_flag_invalid, status);
3252         return 0;
3253     }
3254     av = float32_val(a);
3255     bv = float32_val(b);
3256     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3257 }
3258 
3259 /*----------------------------------------------------------------------------
3260 | Returns 1 if the single-precision floating-point value `a' is less than
3261 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
3262 | exception is raised if either operand is a NaN.  The comparison is performed
3263 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3264 *----------------------------------------------------------------------------*/
3265 
3266 int float32_le(float32 a, float32 b, float_status *status)
3267 {
3268     flag aSign, bSign;
3269     uint32_t av, bv;
3270     a = float32_squash_input_denormal(a, status);
3271     b = float32_squash_input_denormal(b, status);
3272 
3273     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3274          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3275        ) {
3276         float_raise(float_flag_invalid, status);
3277         return 0;
3278     }
3279     aSign = extractFloat32Sign( a );
3280     bSign = extractFloat32Sign( b );
3281     av = float32_val(a);
3282     bv = float32_val(b);
3283     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3284     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3285 
3286 }
3287 
3288 /*----------------------------------------------------------------------------
3289 | Returns 1 if the single-precision floating-point value `a' is less than
3290 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3291 | raised if either operand is a NaN.  The comparison is performed according
3292 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3293 *----------------------------------------------------------------------------*/
3294 
3295 int float32_lt(float32 a, float32 b, float_status *status)
3296 {
3297     flag aSign, bSign;
3298     uint32_t av, bv;
3299     a = float32_squash_input_denormal(a, status);
3300     b = float32_squash_input_denormal(b, status);
3301 
3302     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3303          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3304        ) {
3305         float_raise(float_flag_invalid, status);
3306         return 0;
3307     }
3308     aSign = extractFloat32Sign( a );
3309     bSign = extractFloat32Sign( b );
3310     av = float32_val(a);
3311     bv = float32_val(b);
3312     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3313     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3314 
3315 }
3316 
3317 /*----------------------------------------------------------------------------
3318 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3319 | be compared, and 0 otherwise.  The invalid exception is raised if either
3320 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
3321 | Standard for Binary Floating-Point Arithmetic.
3322 *----------------------------------------------------------------------------*/
3323 
3324 int float32_unordered(float32 a, float32 b, float_status *status)
3325 {
3326     a = float32_squash_input_denormal(a, status);
3327     b = float32_squash_input_denormal(b, status);
3328 
3329     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3330          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3331        ) {
3332         float_raise(float_flag_invalid, status);
3333         return 1;
3334     }
3335     return 0;
3336 }
3337 
3338 /*----------------------------------------------------------------------------
3339 | Returns 1 if the single-precision floating-point value `a' is equal to
3340 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3341 | exception.  The comparison is performed according to the IEC/IEEE Standard
3342 | for Binary Floating-Point Arithmetic.
3343 *----------------------------------------------------------------------------*/
3344 
3345 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3346 {
3347     a = float32_squash_input_denormal(a, status);
3348     b = float32_squash_input_denormal(b, status);
3349 
3350     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3351          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3352        ) {
3353         if (float32_is_signaling_nan(a, status)
3354          || float32_is_signaling_nan(b, status)) {
3355             float_raise(float_flag_invalid, status);
3356         }
3357         return 0;
3358     }
3359     return ( float32_val(a) == float32_val(b) ) ||
3360             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3361 }
3362 
3363 /*----------------------------------------------------------------------------
3364 | Returns 1 if the single-precision floating-point value `a' is less than or
3365 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3366 | cause an exception.  Otherwise, the comparison is performed according to the
3367 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3368 *----------------------------------------------------------------------------*/
3369 
3370 int float32_le_quiet(float32 a, float32 b, float_status *status)
3371 {
3372     flag aSign, bSign;
3373     uint32_t av, bv;
3374     a = float32_squash_input_denormal(a, status);
3375     b = float32_squash_input_denormal(b, status);
3376 
3377     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3378          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3379        ) {
3380         if (float32_is_signaling_nan(a, status)
3381          || float32_is_signaling_nan(b, status)) {
3382             float_raise(float_flag_invalid, status);
3383         }
3384         return 0;
3385     }
3386     aSign = extractFloat32Sign( a );
3387     bSign = extractFloat32Sign( b );
3388     av = float32_val(a);
3389     bv = float32_val(b);
3390     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3391     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3392 
3393 }
3394 
3395 /*----------------------------------------------------------------------------
3396 | Returns 1 if the single-precision floating-point value `a' is less than
3397 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3398 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3399 | Standard for Binary Floating-Point Arithmetic.
3400 *----------------------------------------------------------------------------*/
3401 
3402 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3403 {
3404     flag aSign, bSign;
3405     uint32_t av, bv;
3406     a = float32_squash_input_denormal(a, status);
3407     b = float32_squash_input_denormal(b, status);
3408 
3409     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3410          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3411        ) {
3412         if (float32_is_signaling_nan(a, status)
3413          || float32_is_signaling_nan(b, status)) {
3414             float_raise(float_flag_invalid, status);
3415         }
3416         return 0;
3417     }
3418     aSign = extractFloat32Sign( a );
3419     bSign = extractFloat32Sign( b );
3420     av = float32_val(a);
3421     bv = float32_val(b);
3422     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3423     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3424 
3425 }
3426 
3427 /*----------------------------------------------------------------------------
3428 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3429 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3430 | comparison is performed according to the IEC/IEEE Standard for Binary
3431 | Floating-Point Arithmetic.
3432 *----------------------------------------------------------------------------*/
3433 
3434 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3435 {
3436     a = float32_squash_input_denormal(a, status);
3437     b = float32_squash_input_denormal(b, status);
3438 
3439     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3440          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3441        ) {
3442         if (float32_is_signaling_nan(a, status)
3443          || float32_is_signaling_nan(b, status)) {
3444             float_raise(float_flag_invalid, status);
3445         }
3446         return 1;
3447     }
3448     return 0;
3449 }
3450 
3451 /*----------------------------------------------------------------------------
3452 | Returns the result of converting the double-precision floating-point value
3453 | `a' to the 32-bit two's complement integer format.  The conversion is
3454 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3455 | Arithmetic---which means in particular that the conversion is rounded
3456 | according to the current rounding mode.  If `a' is a NaN, the largest
3457 | positive integer is returned.  Otherwise, if the conversion overflows, the
3458 | largest integer with the same sign as `a' is returned.
3459 *----------------------------------------------------------------------------*/
3460 
3461 int32_t float64_to_int32(float64 a, float_status *status)
3462 {
3463     flag aSign;
3464     int aExp;
3465     int shiftCount;
3466     uint64_t aSig;
3467     a = float64_squash_input_denormal(a, status);
3468 
3469     aSig = extractFloat64Frac( a );
3470     aExp = extractFloat64Exp( a );
3471     aSign = extractFloat64Sign( a );
3472     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3473     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3474     shiftCount = 0x42C - aExp;
3475     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3476     return roundAndPackInt32(aSign, aSig, status);
3477 
3478 }
3479 
3480 /*----------------------------------------------------------------------------
3481 | Returns the result of converting the double-precision floating-point value
3482 | `a' to the 32-bit two's complement integer format.  The conversion is
3483 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3484 | Arithmetic, except that the conversion is always rounded toward zero.
3485 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3486 | the conversion overflows, the largest integer with the same sign as `a' is
3487 | returned.
3488 *----------------------------------------------------------------------------*/
3489 
3490 int32_t float64_to_int32_round_to_zero(float64 a, float_status *status)
3491 {
3492     flag aSign;
3493     int aExp;
3494     int shiftCount;
3495     uint64_t aSig, savedASig;
3496     int32_t z;
3497     a = float64_squash_input_denormal(a, status);
3498 
3499     aSig = extractFloat64Frac( a );
3500     aExp = extractFloat64Exp( a );
3501     aSign = extractFloat64Sign( a );
3502     if ( 0x41E < aExp ) {
3503         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3504         goto invalid;
3505     }
3506     else if ( aExp < 0x3FF ) {
3507         if (aExp || aSig) {
3508             status->float_exception_flags |= float_flag_inexact;
3509         }
3510         return 0;
3511     }
3512     aSig |= LIT64( 0x0010000000000000 );
3513     shiftCount = 0x433 - aExp;
3514     savedASig = aSig;
3515     aSig >>= shiftCount;
3516     z = aSig;
3517     if ( aSign ) z = - z;
3518     if ( ( z < 0 ) ^ aSign ) {
3519  invalid:
3520         float_raise(float_flag_invalid, status);
3521         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3522     }
3523     if ( ( aSig<<shiftCount ) != savedASig ) {
3524         status->float_exception_flags |= float_flag_inexact;
3525     }
3526     return z;
3527 
3528 }
3529 
3530 /*----------------------------------------------------------------------------
3531 | Returns the result of converting the double-precision floating-point value
3532 | `a' to the 16-bit two's complement integer format.  The conversion is
3533 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3534 | Arithmetic, except that the conversion is always rounded toward zero.
3535 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3536 | the conversion overflows, the largest integer with the same sign as `a' is
3537 | returned.
3538 *----------------------------------------------------------------------------*/
3539 
3540 int16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
3541 {
3542     flag aSign;
3543     int aExp;
3544     int shiftCount;
3545     uint64_t aSig, savedASig;
3546     int32_t z;
3547 
3548     aSig = extractFloat64Frac( a );
3549     aExp = extractFloat64Exp( a );
3550     aSign = extractFloat64Sign( a );
3551     if ( 0x40E < aExp ) {
3552         if ( ( aExp == 0x7FF ) && aSig ) {
3553             aSign = 0;
3554         }
3555         goto invalid;
3556     }
3557     else if ( aExp < 0x3FF ) {
3558         if ( aExp || aSig ) {
3559             status->float_exception_flags |= float_flag_inexact;
3560         }
3561         return 0;
3562     }
3563     aSig |= LIT64( 0x0010000000000000 );
3564     shiftCount = 0x433 - aExp;
3565     savedASig = aSig;
3566     aSig >>= shiftCount;
3567     z = aSig;
3568     if ( aSign ) {
3569         z = - z;
3570     }
3571     if ( ( (int16_t)z < 0 ) ^ aSign ) {
3572  invalid:
3573         float_raise(float_flag_invalid, status);
3574         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3575     }
3576     if ( ( aSig<<shiftCount ) != savedASig ) {
3577         status->float_exception_flags |= float_flag_inexact;
3578     }
3579     return z;
3580 }
3581 
3582 /*----------------------------------------------------------------------------
3583 | Returns the result of converting the double-precision floating-point value
3584 | `a' to the 64-bit two's complement integer format.  The conversion is
3585 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3586 | Arithmetic---which means in particular that the conversion is rounded
3587 | according to the current rounding mode.  If `a' is a NaN, the largest
3588 | positive integer is returned.  Otherwise, if the conversion overflows, the
3589 | largest integer with the same sign as `a' is returned.
3590 *----------------------------------------------------------------------------*/
3591 
3592 int64_t float64_to_int64(float64 a, float_status *status)
3593 {
3594     flag aSign;
3595     int aExp;
3596     int shiftCount;
3597     uint64_t aSig, aSigExtra;
3598     a = float64_squash_input_denormal(a, status);
3599 
3600     aSig = extractFloat64Frac( a );
3601     aExp = extractFloat64Exp( a );
3602     aSign = extractFloat64Sign( a );
3603     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3604     shiftCount = 0x433 - aExp;
3605     if ( shiftCount <= 0 ) {
3606         if ( 0x43E < aExp ) {
3607             float_raise(float_flag_invalid, status);
3608             if (    ! aSign
3609                  || (    ( aExp == 0x7FF )
3610                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
3611                ) {
3612                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3613             }
3614             return (int64_t) LIT64( 0x8000000000000000 );
3615         }
3616         aSigExtra = 0;
3617         aSig <<= - shiftCount;
3618     }
3619     else {
3620         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3621     }
3622     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
3623 
3624 }
3625 
3626 /*----------------------------------------------------------------------------
3627 | Returns the result of converting the double-precision floating-point value
3628 | `a' to the 64-bit two's complement integer format.  The conversion is
3629 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3630 | Arithmetic, except that the conversion is always rounded toward zero.
3631 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3632 | the conversion overflows, the largest integer with the same sign as `a' is
3633 | returned.
3634 *----------------------------------------------------------------------------*/
3635 
3636 int64_t float64_to_int64_round_to_zero(float64 a, float_status *status)
3637 {
3638     flag aSign;
3639     int aExp;
3640     int shiftCount;
3641     uint64_t aSig;
3642     int64_t z;
3643     a = float64_squash_input_denormal(a, status);
3644 
3645     aSig = extractFloat64Frac( a );
3646     aExp = extractFloat64Exp( a );
3647     aSign = extractFloat64Sign( a );
3648     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3649     shiftCount = aExp - 0x433;
3650     if ( 0 <= shiftCount ) {
3651         if ( 0x43E <= aExp ) {
3652             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3653                 float_raise(float_flag_invalid, status);
3654                 if (    ! aSign
3655                      || (    ( aExp == 0x7FF )
3656                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
3657                    ) {
3658                     return LIT64( 0x7FFFFFFFFFFFFFFF );
3659                 }
3660             }
3661             return (int64_t) LIT64( 0x8000000000000000 );
3662         }
3663         z = aSig<<shiftCount;
3664     }
3665     else {
3666         if ( aExp < 0x3FE ) {
3667             if (aExp | aSig) {
3668                 status->float_exception_flags |= float_flag_inexact;
3669             }
3670             return 0;
3671         }
3672         z = aSig>>( - shiftCount );
3673         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3674             status->float_exception_flags |= float_flag_inexact;
3675         }
3676     }
3677     if ( aSign ) z = - z;
3678     return z;
3679 
3680 }
3681 
3682 /*----------------------------------------------------------------------------
3683 | Returns the result of converting the double-precision floating-point value
3684 | `a' to the single-precision floating-point format.  The conversion is
3685 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3686 | Arithmetic.
3687 *----------------------------------------------------------------------------*/
3688 
3689 float32 float64_to_float32(float64 a, float_status *status)
3690 {
3691     flag aSign;
3692     int aExp;
3693     uint64_t aSig;
3694     uint32_t zSig;
3695     a = float64_squash_input_denormal(a, status);
3696 
3697     aSig = extractFloat64Frac( a );
3698     aExp = extractFloat64Exp( a );
3699     aSign = extractFloat64Sign( a );
3700     if ( aExp == 0x7FF ) {
3701         if (aSig) {
3702             return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3703         }
3704         return packFloat32( aSign, 0xFF, 0 );
3705     }
3706     shift64RightJamming( aSig, 22, &aSig );
3707     zSig = aSig;
3708     if ( aExp || zSig ) {
3709         zSig |= 0x40000000;
3710         aExp -= 0x381;
3711     }
3712     return roundAndPackFloat32(aSign, aExp, zSig, status);
3713 
3714 }
3715 
3716 
3717 /*----------------------------------------------------------------------------
3718 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3719 | half-precision floating-point value, returning the result.  After being
3720 | shifted into the proper positions, the three fields are simply added
3721 | together to form the result.  This means that any integer portion of `zSig'
3722 | will be added into the exponent.  Since a properly normalized significand
3723 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3724 | than the desired result exponent whenever `zSig' is a complete, normalized
3725 | significand.
3726 *----------------------------------------------------------------------------*/
3727 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3728 {
3729     return make_float16(
3730         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3731 }
3732 
3733 /*----------------------------------------------------------------------------
3734 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3735 | and significand `zSig', and returns the proper half-precision floating-
3736 | point value corresponding to the abstract input.  Ordinarily, the abstract
3737 | value is simply rounded and packed into the half-precision format, with
3738 | the inexact exception raised if the abstract input cannot be represented
3739 | exactly.  However, if the abstract value is too large, the overflow and
3740 | inexact exceptions are raised and an infinity or maximal finite value is
3741 | returned.  If the abstract value is too small, the input value is rounded to
3742 | a subnormal number, and the underflow and inexact exceptions are raised if
3743 | the abstract input cannot be represented exactly as a subnormal half-
3744 | precision floating-point number.
3745 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3746 | ARM-style "alternative representation", which omits the NaN and Inf
3747 | encodings in order to raise the maximum representable exponent by one.
3748 |     The input significand `zSig' has its binary point between bits 22
3749 | and 23, which is 13 bits to the left of the usual location.  This shifted
3750 | significand must be normalized or smaller.  If `zSig' is not normalized,
3751 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3752 | and it must not require rounding.  In the usual case that `zSig' is
3753 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3754 | Note the slightly odd position of the binary point in zSig compared with the
3755 | other roundAndPackFloat functions. This should probably be fixed if we
3756 | need to implement more float16 routines than just conversion.
3757 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3758 | Binary Floating-Point Arithmetic.
3759 *----------------------------------------------------------------------------*/
3760 
3761 static float16 roundAndPackFloat16(flag zSign, int zExp,
3762                                    uint32_t zSig, flag ieee,
3763                                    float_status *status)
3764 {
3765     int maxexp = ieee ? 29 : 30;
3766     uint32_t mask;
3767     uint32_t increment;
3768     bool rounding_bumps_exp;
3769     bool is_tiny = false;
3770 
3771     /* Calculate the mask of bits of the mantissa which are not
3772      * representable in half-precision and will be lost.
3773      */
3774     if (zExp < 1) {
3775         /* Will be denormal in halfprec */
3776         mask = 0x00ffffff;
3777         if (zExp >= -11) {
3778             mask >>= 11 + zExp;
3779         }
3780     } else {
3781         /* Normal number in halfprec */
3782         mask = 0x00001fff;
3783     }
3784 
3785     switch (status->float_rounding_mode) {
3786     case float_round_nearest_even:
3787         increment = (mask + 1) >> 1;
3788         if ((zSig & mask) == increment) {
3789             increment = zSig & (increment << 1);
3790         }
3791         break;
3792     case float_round_ties_away:
3793         increment = (mask + 1) >> 1;
3794         break;
3795     case float_round_up:
3796         increment = zSign ? 0 : mask;
3797         break;
3798     case float_round_down:
3799         increment = zSign ? mask : 0;
3800         break;
3801     default: /* round_to_zero */
3802         increment = 0;
3803         break;
3804     }
3805 
3806     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3807 
3808     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3809         if (ieee) {
3810             float_raise(float_flag_overflow | float_flag_inexact, status);
3811             return packFloat16(zSign, 0x1f, 0);
3812         } else {
3813             float_raise(float_flag_invalid, status);
3814             return packFloat16(zSign, 0x1f, 0x3ff);
3815         }
3816     }
3817 
3818     if (zExp < 0) {
3819         /* Note that flush-to-zero does not affect half-precision results */
3820         is_tiny =
3821             (status->float_detect_tininess == float_tininess_before_rounding)
3822             || (zExp < -1)
3823             || (!rounding_bumps_exp);
3824     }
3825     if (zSig & mask) {
3826         float_raise(float_flag_inexact, status);
3827         if (is_tiny) {
3828             float_raise(float_flag_underflow, status);
3829         }
3830     }
3831 
3832     zSig += increment;
3833     if (rounding_bumps_exp) {
3834         zSig >>= 1;
3835         zExp++;
3836     }
3837 
3838     if (zExp < -10) {
3839         return packFloat16(zSign, 0, 0);
3840     }
3841     if (zExp < 0) {
3842         zSig >>= -zExp;
3843         zExp = 0;
3844     }
3845     return packFloat16(zSign, zExp, zSig >> 13);
3846 }
3847 
3848 /*----------------------------------------------------------------------------
3849 | If `a' is denormal and we are in flush-to-zero mode then set the
3850 | input-denormal exception and return zero. Otherwise just return the value.
3851 *----------------------------------------------------------------------------*/
3852 float16 float16_squash_input_denormal(float16 a, float_status *status)
3853 {
3854     if (status->flush_inputs_to_zero) {
3855         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3856             float_raise(float_flag_input_denormal, status);
3857             return make_float16(float16_val(a) & 0x8000);
3858         }
3859     }
3860     return a;
3861 }
3862 
/*
 * Normalizes the subnormal half-precision significand `aSig'.  The value is
 * shifted left by (leading zero count - 21), which places its leading set
 * bit at bit 10.  The shifted significand is stored through `zSigPtr' and
 * the exponent 1 - shift through `zExpPtr'.
 */
static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
                                      uint32_t *zSigPtr)
{
    int8_t leftShift = countLeadingZeros32(aSig) - 21;

    *zSigPtr = aSig << leftShift;
    *zExpPtr = 1 - leftShift;
}
3870 
3871 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3872    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3873 
3874 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3875 {
3876     flag aSign;
3877     int aExp;
3878     uint32_t aSig;
3879 
3880     aSign = extractFloat16Sign(a);
3881     aExp = extractFloat16Exp(a);
3882     aSig = extractFloat16Frac(a);
3883 
3884     if (aExp == 0x1f && ieee) {
3885         if (aSig) {
3886             return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3887         }
3888         return packFloat32(aSign, 0xff, 0);
3889     }
3890     if (aExp == 0) {
3891         if (aSig == 0) {
3892             return packFloat32(aSign, 0, 0);
3893         }
3894 
3895         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3896         aExp--;
3897     }
3898     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3899 }
3900 
3901 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3902 {
3903     flag aSign;
3904     int aExp;
3905     uint32_t aSig;
3906 
3907     a = float32_squash_input_denormal(a, status);
3908 
3909     aSig = extractFloat32Frac( a );
3910     aExp = extractFloat32Exp( a );
3911     aSign = extractFloat32Sign( a );
3912     if ( aExp == 0xFF ) {
3913         if (aSig) {
3914             /* Input is a NaN */
3915             if (!ieee) {
3916                 float_raise(float_flag_invalid, status);
3917                 return packFloat16(aSign, 0, 0);
3918             }
3919             return commonNaNToFloat16(
3920                 float32ToCommonNaN(a, status), status);
3921         }
3922         /* Infinity */
3923         if (!ieee) {
3924             float_raise(float_flag_invalid, status);
3925             return packFloat16(aSign, 0x1f, 0x3ff);
3926         }
3927         return packFloat16(aSign, 0x1f, 0);
3928     }
3929     if (aExp == 0 && aSig == 0) {
3930         return packFloat16(aSign, 0, 0);
3931     }
3932     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3933      * even if the input is denormal; however this is harmless because
3934      * the largest possible single-precision denormal is still smaller
3935      * than the smallest representable half-precision denormal, and so we
3936      * will end up ignoring aSig and returning via the "always return zero"
3937      * codepath.
3938      */
3939     aSig |= 0x00800000;
3940     aExp -= 0x71;
3941 
3942     return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3943 }
3944 
3945 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3946 {
3947     flag aSign;
3948     int aExp;
3949     uint32_t aSig;
3950 
3951     aSign = extractFloat16Sign(a);
3952     aExp = extractFloat16Exp(a);
3953     aSig = extractFloat16Frac(a);
3954 
3955     if (aExp == 0x1f && ieee) {
3956         if (aSig) {
3957             return commonNaNToFloat64(
3958                 float16ToCommonNaN(a, status), status);
3959         }
3960         return packFloat64(aSign, 0x7ff, 0);
3961     }
3962     if (aExp == 0) {
3963         if (aSig == 0) {
3964             return packFloat64(aSign, 0, 0);
3965         }
3966 
3967         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3968         aExp--;
3969     }
3970     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3971 }
3972 
3973 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3974 {
3975     flag aSign;
3976     int aExp;
3977     uint64_t aSig;
3978     uint32_t zSig;
3979 
3980     a = float64_squash_input_denormal(a, status);
3981 
3982     aSig = extractFloat64Frac(a);
3983     aExp = extractFloat64Exp(a);
3984     aSign = extractFloat64Sign(a);
3985     if (aExp == 0x7FF) {
3986         if (aSig) {
3987             /* Input is a NaN */
3988             if (!ieee) {
3989                 float_raise(float_flag_invalid, status);
3990                 return packFloat16(aSign, 0, 0);
3991             }
3992             return commonNaNToFloat16(
3993                 float64ToCommonNaN(a, status), status);
3994         }
3995         /* Infinity */
3996         if (!ieee) {
3997             float_raise(float_flag_invalid, status);
3998             return packFloat16(aSign, 0x1f, 0x3ff);
3999         }
4000         return packFloat16(aSign, 0x1f, 0);
4001     }
4002     shift64RightJamming(aSig, 29, &aSig);
4003     zSig = aSig;
4004     if (aExp == 0 && zSig == 0) {
4005         return packFloat16(aSign, 0, 0);
4006     }
4007     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4008      * even if the input is denormal; however this is harmless because
4009      * the largest possible single-precision denormal is still smaller
4010      * than the smallest representable half-precision denormal, and so we
4011      * will end up ignoring aSig and returning via the "always return zero"
4012      * codepath.
4013      */
4014     zSig |= 0x00800000;
4015     aExp -= 0x3F1;
4016 
4017     return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
4018 }
4019 
4020 /*----------------------------------------------------------------------------
4021 | Returns the result of converting the double-precision floating-point value
4022 | `a' to the extended double-precision floating-point format.  The conversion
4023 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4024 | Arithmetic.
4025 *----------------------------------------------------------------------------*/
4026 
4027 floatx80 float64_to_floatx80(float64 a, float_status *status)
4028 {
4029     flag aSign;
4030     int aExp;
4031     uint64_t aSig;
4032 
4033     a = float64_squash_input_denormal(a, status);
4034     aSig = extractFloat64Frac( a );
4035     aExp = extractFloat64Exp( a );
4036     aSign = extractFloat64Sign( a );
4037     if ( aExp == 0x7FF ) {
4038         if (aSig) {
4039             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4040         }
4041         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4042     }
4043     if ( aExp == 0 ) {
4044         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4045         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4046     }
4047     return
4048         packFloatx80(
4049             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4050 
4051 }
4052 
4053 /*----------------------------------------------------------------------------
4054 | Returns the result of converting the double-precision floating-point value
4055 | `a' to the quadruple-precision floating-point format.  The conversion is
4056 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4057 | Arithmetic.
4058 *----------------------------------------------------------------------------*/
4059 
4060 float128 float64_to_float128(float64 a, float_status *status)
4061 {
4062     flag aSign;
4063     int aExp;
4064     uint64_t aSig, zSig0, zSig1;
4065 
4066     a = float64_squash_input_denormal(a, status);
4067     aSig = extractFloat64Frac( a );
4068     aExp = extractFloat64Exp( a );
4069     aSign = extractFloat64Sign( a );
4070     if ( aExp == 0x7FF ) {
4071         if (aSig) {
4072             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4073         }
4074         return packFloat128( aSign, 0x7FFF, 0, 0 );
4075     }
4076     if ( aExp == 0 ) {
4077         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4078         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4079         --aExp;
4080     }
4081     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4082     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4083 
4084 }
4085 
4086 /*----------------------------------------------------------------------------
4087 | Rounds the double-precision floating-point value `a' to an integer, and
4088 | returns the result as a double-precision floating-point value.  The
4089 | operation is performed according to the IEC/IEEE Standard for Binary
4090 | Floating-Point Arithmetic.
4091 *----------------------------------------------------------------------------*/
4092 
4093 float64 float64_round_to_int(float64 a, float_status *status)
4094 {
4095     flag aSign;
4096     int aExp;
4097     uint64_t lastBitMask, roundBitsMask;
4098     uint64_t z;
4099     a = float64_squash_input_denormal(a, status);
4100 
4101     aExp = extractFloat64Exp( a );
4102     if ( 0x433 <= aExp ) {
4103         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
4104             return propagateFloat64NaN(a, a, status);
4105         }
4106         return a;
4107     }
4108     if ( aExp < 0x3FF ) {
4109         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
4110         status->float_exception_flags |= float_flag_inexact;
4111         aSign = extractFloat64Sign( a );
4112         switch (status->float_rounding_mode) {
4113          case float_round_nearest_even:
4114             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
4115                 return packFloat64( aSign, 0x3FF, 0 );
4116             }
4117             break;
4118         case float_round_ties_away:
4119             if (aExp == 0x3FE) {
4120                 return packFloat64(aSign, 0x3ff, 0);
4121             }
4122             break;
4123          case float_round_down:
4124             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
4125          case float_round_up:
4126             return make_float64(
4127             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
4128         }
4129         return packFloat64( aSign, 0, 0 );
4130     }
4131     lastBitMask = 1;
4132     lastBitMask <<= 0x433 - aExp;
4133     roundBitsMask = lastBitMask - 1;
4134     z = float64_val(a);
4135     switch (status->float_rounding_mode) {
4136     case float_round_nearest_even:
4137         z += lastBitMask >> 1;
4138         if ((z & roundBitsMask) == 0) {
4139             z &= ~lastBitMask;
4140         }
4141         break;
4142     case float_round_ties_away:
4143         z += lastBitMask >> 1;
4144         break;
4145     case float_round_to_zero:
4146         break;
4147     case float_round_up:
4148         if (!extractFloat64Sign(make_float64(z))) {
4149             z += roundBitsMask;
4150         }
4151         break;
4152     case float_round_down:
4153         if (extractFloat64Sign(make_float64(z))) {
4154             z += roundBitsMask;
4155         }
4156         break;
4157     default:
4158         abort();
4159     }
4160     z &= ~ roundBitsMask;
4161     if (z != float64_val(a)) {
4162         status->float_exception_flags |= float_flag_inexact;
4163     }
4164     return make_float64(z);
4165 
4166 }
4167 
4168 float64 float64_trunc_to_int(float64 a, float_status *status)
4169 {
4170     int oldmode;
4171     float64 res;
4172     oldmode = status->float_rounding_mode;
4173     status->float_rounding_mode = float_round_to_zero;
4174     res = float64_round_to_int(a, status);
4175     status->float_rounding_mode = oldmode;
4176     return res;
4177 }
4178 
4179 
4180 /*----------------------------------------------------------------------------
4181 | Returns the remainder of the double-precision floating-point value `a'
4182 | with respect to the corresponding value `b'.  The operation is performed
4183 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4184 *----------------------------------------------------------------------------*/
4185 
float64 float64_rem(float64 a, float64 b, float_status *status)
{
    flag aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    /* Note that the sign of b is never extracted: the sign of the result
     * is derived from the sign of a and from zSign below.
     */
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    if ( aExp == 0x7FF ) {
        /* a is Inf or NaN: propagate any NaN operand, otherwise the
         * remainder of an infinity is an invalid operation.
         */
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        /* Finite a, infinite b: a is returned unchanged */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* Remainder with respect to zero is an invalid operation */
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        /* A zero dividend is returned unchanged */
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Add the implicit bit and left-justify both significands in 64 bits */
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
    if ( expDiff < 0 ) {
        /* |a| is below |b|/2, so a itself is the remainder */
        if ( expDiff < -1 ) return a;
        aSig >>= 1;
    }
    /* First quotient bit: subtract b once if it fits */
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    expDiff -= 64;
    /* Reduce the remaining exponent difference 62 bits per iteration.
     * The quotient estimate is lowered by 2 (clamped at zero);
     * NOTE(review): presumably so that it never exceeds the true
     * quotient -- confirm against estimateDiv128To64.
     */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /* Final partial step: only the top expDiff quotient bits are kept */
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /* Keep subtracting b until the remainder goes negative.  Afterwards
     * alternateASig is the last non-negative remainder and aSig the first
     * negative one; q counts the subtractions.
     */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    /* Pick whichever candidate is smaller in magnitude.  On an exact tie
     * (sigMean == 0) the non-negative candidate is taken when q is odd,
     * i.e. the one whose quotient (q - 1) is even.
     */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* A negative remainder flips the result sign relative to a */
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);

}
4267 
4268 /*----------------------------------------------------------------------------
4269 | Returns the result of multiplying the double-precision floating-point values
4270 | `a' and `b' then adding 'c', with no intermediate rounding step after the
4271 | multiplication.  The operation is performed according to the IEC/IEEE
4272 | Standard for Binary Floating-Point Arithmetic 754-2008.
4273 | The flags argument allows the caller to select negation of the
4274 | addend, the intermediate product, or the final result. (The difference
4275 | between this and having the caller do a separate negation is that negating
4276 | externally will flip the sign bit on NaNs.)
4277 *----------------------------------------------------------------------------*/
4278 
float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
                       float_status *status)
{
    flag aSign, bSign, cSign, zSign;
    int aExp, bExp, cExp, pExp, zExp, expDiff;
    uint64_t aSig, bSig, cSig;
    flag pInf, pZero, pSign;
    uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
    int shiftcount;
    flag signflip, infzero;

    /* `flags' bits tested in this function:
     *   float_muladd_negate_c       - flip the sign of the addend
     *   float_muladd_negate_product - flip the sign of a * b
     *   float_muladd_negate_result  - flip the sign of the final result
     *   float_muladd_halve_result   - decrement the result exponent by one
     *                                 before the final rounding
     */
    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    c = float64_squash_input_denormal(c, status);
    aSig = extractFloat64Frac(a);
    aExp = extractFloat64Exp(a);
    aSign = extractFloat64Sign(a);
    bSig = extractFloat64Frac(b);
    bExp = extractFloat64Exp(b);
    bSign = extractFloat64Sign(b);
    cSig = extractFloat64Frac(c);
    cExp = extractFloat64Exp(c);
    cSign = extractFloat64Sign(c);

    /* True when the product is zero times infinity, in either order */
    infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
               (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (((aExp == 0x7ff) && aSig) ||
        ((bExp == 0x7ff) && bSig) ||
        ((cExp == 0x7ff) && cSig)) {
        return propagateFloat64MulAddNaN(a, b, c, infzero, status);
    }

    /* No NaN operand: zero times infinity is an invalid operation */
    if (infzero) {
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }

    /* The negation flags are applied only now, after NaN handling */
    if (flags & float_muladd_negate_c) {
        cSign ^= 1;
    }

    signflip = (flags & float_muladd_negate_result) ? 1 : 0;

    /* Work out the sign and type of the product */
    pSign = aSign ^ bSign;
    if (flags & float_muladd_negate_product) {
        pSign ^= 1;
    }
    pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);

    if (cExp == 0x7ff) {
        if (pInf && (pSign ^ cSign)) {
            /* addition of opposite-signed infinities => InvalidOperation */
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        /* Otherwise generate an infinity of the same sign */
        return packFloat64(cSign ^ signflip, 0x7ff, 0);
    }

    /* Infinite product with a finite addend */
    if (pInf) {
        return packFloat64(pSign ^ signflip, 0x7ff, 0);
    }

    if (pZero) {
        if (cExp == 0) {
            if (cSig == 0) {
                /* Adding two exact zeroes */
                if (pSign == cSign) {
                    zSign = pSign;
                } else if (status->float_rounding_mode == float_round_down) {
                    zSign = 1;
                } else {
                    zSign = 0;
                }
                return packFloat64(zSign ^ signflip, 0, 0);
            }
            /* Exact zero plus a denorm */
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(cSign ^ signflip, 0, 0);
            }
        }
        /* Zero plus something non-zero : just return the something */
        if (flags & float_muladd_halve_result) {
            if (cExp == 0) {
                normalizeFloat64Subnormal(cSig, &cExp, &cSig);
            }
            /* Subtract one to halve, and one again because roundAndPackFloat64
             * wants one less than the true exponent.
             */
            cExp -= 2;
            cSig = (cSig | 0x0010000000000000ULL) << 10;
            return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
        }
        return packFloat64(cSign ^ signflip, cExp, cSig);
    }

    /* Neither factor is zero here, so a zero exponent means a denormal */
    if (aExp == 0) {
        normalizeFloat64Subnormal(aSig, &aExp, &aSig);
    }
    if (bExp == 0) {
        normalizeFloat64Subnormal(bSig, &bExp, &bSig);
    }

    /* Calculate the actual result a * b + c */

    /* Multiply first; this is easy. */
    /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
     * because we want the true exponent, not the "one-less-than"
     * flavour that roundAndPackFloat64() takes.
     */
    pExp = aExp + bExp - 0x3fe;
    /* aSig gets its explicit bit at position 62 and bSig at position 63,
     * so the leading bit of the 128-bit product is at position 125 or 126.
     */
    aSig = (aSig | LIT64(0x0010000000000000))<<10;
    bSig = (bSig | LIT64(0x0010000000000000))<<11;
    mul64To128(aSig, bSig, &pSig0, &pSig1);
    if ((int64_t)(pSig0 << 1) >= 0) {
        /* Bit 126 is clear: shift left by one so the leading bit is there */
        shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
        pExp--;
    }

    zSign = pSign ^ signflip;

    /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
     * bit in position 126.
     */
    if (cExp == 0) {
        if (!cSig) {
            /* Throw out the special case of c being an exact zero now */
            /* The product is reduced to 64 bits with a jamming shift, so a
             * non-zero low half is not lost.
             */
            shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
            if (flags & float_muladd_halve_result) {
                pExp--;
            }
            return roundAndPackFloat64(zSign, pExp - 1,
                                       pSig1, status);
        }
        normalizeFloat64Subnormal(cSig, &cExp, &cSig);
    }

    /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
     * significand of the addend, with the explicit bit in position 126.
     */
    cSig0 = cSig << (126 - 64 - 52);
    cSig1 = 0;
    cSig0 |= LIT64(0x4000000000000000);
    expDiff = pExp - cExp;

    if (pSign == cSign) {
        /* Addition */
        if (expDiff > 0) {
            /* scale c to match p */
            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
            zExp = pExp;
        } else if (expDiff < 0) {
            /* scale p to match c */
            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
            zExp = cExp;
        } else {
            /* no scaling needed */
            zExp = cExp;
        }
        /* Add significands and make sure explicit bit ends up in posn 126 */
        add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
        if ((int64_t)zSig0 < 0) {
            /* The sum carried into bit 127: shift it back down */
            shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
        } else {
            /* No carry: only convert zExp to the "one-less-than" form */
            zExp--;
        }
        shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
        if (flags & float_muladd_halve_result) {
            zExp--;
        }
        return roundAndPackFloat64(zSign, zExp, zSig1, status);
    } else {
        /* Subtraction */
        /* The smaller magnitude is always subtracted from the larger;
         * zSign is flipped whenever the addend is the larger one.
         */
        if (expDiff > 0) {
            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
            sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
            zExp = pExp;
        } else if (expDiff < 0) {
            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
            sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
            zExp = cExp;
            zSign ^= 1;
        } else {
            zExp = pExp;
            if (lt128(cSig0, cSig1, pSig0, pSig1)) {
                sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
            } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
                sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
                zSign ^= 1;
            } else {
                /* Exact zero */
                /* The zero is negative only when rounding down, before the
                 * negate-result flip is applied.
                 */
                zSign = signflip;
                if (status->float_rounding_mode == float_round_down) {
                    zSign ^= 1;
                }
                return packFloat64(zSign, 0, 0);
            }
        }
        --zExp;
        /* Do the equivalent of normalizeRoundAndPackFloat64() but
         * starting with the significand in a pair of uint64_t.
         */
        if (zSig0) {
            /* Leading bit is in the high word: move it to bit 62 and fold
             * whatever remains in the low word into a sticky bit.
             */
            shiftcount = countLeadingZeros64(zSig0) - 1;
            shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
            if (zSig1) {
                zSig0 |= 1;
            }
            zExp -= shiftcount;
        } else {
            /* High word is zero: the low word holds the whole significand */
            shiftcount = countLeadingZeros64(zSig1);
            if (shiftcount == 0) {
                /* Leading bit at 63: shift right by one, keeping the
                 * shifted-out bit as sticky.
                 */
                zSig0 = (zSig1 >> 1) | (zSig1 & 1);
                zExp -= 63;
            } else {
                shiftcount--;
                zSig0 = zSig1 << shiftcount;
                zExp -= (shiftcount + 64);
            }
        }
        if (flags & float_muladd_halve_result) {
            zExp--;
        }
        return roundAndPackFloat64(zSign, zExp, zSig0, status);
    }
}
4514 
4515 /*----------------------------------------------------------------------------
4516 | Returns the square root of the double-precision floating-point value `a'.
4517 | The operation is performed according to the IEC/IEEE Standard for Binary
4518 | Floating-Point Arithmetic.
4519 *----------------------------------------------------------------------------*/
4520 
float64 float64_sqrt(float64 a, float_status *status)
{
    flag aSign;
    int aExp, zExp;
    uint64_t aSig, zSig, doubleZSig;
    uint64_t rem0, rem1, term0, term1;
    a = float64_squash_input_denormal(a, status);

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        /* NaNs are propagated; +Inf is returned as is; -Inf is invalid */
        if (aSig) {
            return propagateFloat64NaN(a, a, status);
        }
        if ( ! aSign ) return a;
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( aSign ) {
        /* -0 is returned unchanged; any other negative input is invalid */
        if ( ( aExp | aSig ) == 0 ) return a;
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return float64_zero;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    /* Halve the unbiased exponent and rebias it with 0x3FE */
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
    aSig |= LIT64( 0x0010000000000000 );
    /* Start from a 32-bit estimate of the root ... */
    zSig = estimateSqrt32( aExp, aSig>>21 );
    /* ... the significand is shifted one bit less when aExp is odd ... */
    aSig <<= 9 - ( aExp & 1 );
    /* ... and the estimate is refined to 64 bits with one division */
    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
    /* The exact remainder is computed only when the low nine bits of the
     * estimate are at most 5.  NOTE(review): presumably the estimate's
     * error cannot affect rounding otherwise -- confirm against the
     * error bounds of the estimate helpers.
     */
    if ( ( zSig & 0x1FF ) <= 5 ) {
        doubleZSig = zSig<<1;
        mul64To128( zSig, zSig, &term0, &term1 );
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
        /* Step the estimate down until the remainder is non-negative */
        while ( (int64_t) rem0 < 0 ) {
            --zSig;
            doubleZSig -= 2;
            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
        }
        /* A non-zero remainder sets the sticky bit */
        zSig |= ( ( rem0 | rem1 ) != 0 );
    }
    return roundAndPackFloat64(0, zExp, zSig, status);

}
4568 
4569 /*----------------------------------------------------------------------------
4570 | Returns the binary log of the double-precision floating-point value `a'.
4571 | The operation is performed according to the IEC/IEEE Standard for Binary
4572 | Floating-Point Arithmetic.
4573 *----------------------------------------------------------------------------*/
4574 float64 float64_log2(float64 a, float_status *status)
4575 {
4576     flag aSign, zSign;
4577     int aExp;
4578     uint64_t aSig, aSig0, aSig1, zSig, i;
4579     a = float64_squash_input_denormal(a, status);
4580 
4581     aSig = extractFloat64Frac( a );
4582     aExp = extractFloat64Exp( a );
4583     aSign = extractFloat64Sign( a );
4584 
4585     if ( aExp == 0 ) {
4586         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4587         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4588     }
4589     if ( aSign ) {
4590         float_raise(float_flag_invalid, status);
4591         return float64_default_nan(status);
4592     }
4593     if ( aExp == 0x7FF ) {
4594         if (aSig) {
4595             return propagateFloat64NaN(a, float64_zero, status);
4596         }
4597         return a;
4598     }
4599 
4600     aExp -= 0x3FF;
4601     aSig |= LIT64( 0x0010000000000000 );
4602     zSign = aExp < 0;
4603     zSig = (uint64_t)aExp << 52;
4604     for (i = 1LL << 51; i > 0; i >>= 1) {
4605         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4606         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4607         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4608             aSig >>= 1;
4609             zSig |= i;
4610         }
4611     }
4612 
4613     if ( zSign )
4614         zSig = -zSig;
4615     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4616 }
4617 
4618 /*----------------------------------------------------------------------------
4619 | Returns 1 if the double-precision floating-point value `a' is equal to the
4620 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4621 | if either operand is a NaN.  Otherwise, the comparison is performed
4622 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4623 *----------------------------------------------------------------------------*/
4624 
4625 int float64_eq(float64 a, float64 b, float_status *status)
4626 {
4627     uint64_t av, bv;
4628     a = float64_squash_input_denormal(a, status);
4629     b = float64_squash_input_denormal(b, status);
4630 
4631     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4632          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4633        ) {
4634         float_raise(float_flag_invalid, status);
4635         return 0;
4636     }
4637     av = float64_val(a);
4638     bv = float64_val(b);
4639     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4640 
4641 }
4642 
4643 /*----------------------------------------------------------------------------
4644 | Returns 1 if the double-precision floating-point value `a' is less than or
4645 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4646 | exception is raised if either operand is a NaN.  The comparison is performed
4647 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4648 *----------------------------------------------------------------------------*/
4649 
4650 int float64_le(float64 a, float64 b, float_status *status)
4651 {
4652     flag aSign, bSign;
4653     uint64_t av, bv;
4654     a = float64_squash_input_denormal(a, status);
4655     b = float64_squash_input_denormal(b, status);
4656 
4657     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4658          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4659        ) {
4660         float_raise(float_flag_invalid, status);
4661         return 0;
4662     }
4663     aSign = extractFloat64Sign( a );
4664     bSign = extractFloat64Sign( b );
4665     av = float64_val(a);
4666     bv = float64_val(b);
4667     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4668     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4669 
4670 }
4671 
4672 /*----------------------------------------------------------------------------
4673 | Returns 1 if the double-precision floating-point value `a' is less than
4674 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4675 | raised if either operand is a NaN.  The comparison is performed according
4676 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4677 *----------------------------------------------------------------------------*/
4678 
4679 int float64_lt(float64 a, float64 b, float_status *status)
4680 {
4681     flag aSign, bSign;
4682     uint64_t av, bv;
4683 
4684     a = float64_squash_input_denormal(a, status);
4685     b = float64_squash_input_denormal(b, status);
4686     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4687          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4688        ) {
4689         float_raise(float_flag_invalid, status);
4690         return 0;
4691     }
4692     aSign = extractFloat64Sign( a );
4693     bSign = extractFloat64Sign( b );
4694     av = float64_val(a);
4695     bv = float64_val(b);
4696     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4697     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4698 
4699 }
4700 
4701 /*----------------------------------------------------------------------------
4702 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4703 | be compared, and 0 otherwise.  The invalid exception is raised if either
4704 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4705 | Standard for Binary Floating-Point Arithmetic.
4706 *----------------------------------------------------------------------------*/
4707 
4708 int float64_unordered(float64 a, float64 b, float_status *status)
4709 {
4710     a = float64_squash_input_denormal(a, status);
4711     b = float64_squash_input_denormal(b, status);
4712 
4713     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4714          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4715        ) {
4716         float_raise(float_flag_invalid, status);
4717         return 1;
4718     }
4719     return 0;
4720 }
4721 
4722 /*----------------------------------------------------------------------------
4723 | Returns 1 if the double-precision floating-point value `a' is equal to the
4724 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
| exception.  The comparison is performed according to the IEC/IEEE Standard
4726 | for Binary Floating-Point Arithmetic.
4727 *----------------------------------------------------------------------------*/
4728 
4729 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4730 {
4731     uint64_t av, bv;
4732     a = float64_squash_input_denormal(a, status);
4733     b = float64_squash_input_denormal(b, status);
4734 
4735     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4736          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4737        ) {
4738         if (float64_is_signaling_nan(a, status)
4739          || float64_is_signaling_nan(b, status)) {
4740             float_raise(float_flag_invalid, status);
4741         }
4742         return 0;
4743     }
4744     av = float64_val(a);
4745     bv = float64_val(b);
4746     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4747 
4748 }
4749 
4750 /*----------------------------------------------------------------------------
4751 | Returns 1 if the double-precision floating-point value `a' is less than or
4752 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4753 | cause an exception.  Otherwise, the comparison is performed according to the
4754 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4755 *----------------------------------------------------------------------------*/
4756 
4757 int float64_le_quiet(float64 a, float64 b, float_status *status)
4758 {
4759     flag aSign, bSign;
4760     uint64_t av, bv;
4761     a = float64_squash_input_denormal(a, status);
4762     b = float64_squash_input_denormal(b, status);
4763 
4764     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4765          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4766        ) {
4767         if (float64_is_signaling_nan(a, status)
4768          || float64_is_signaling_nan(b, status)) {
4769             float_raise(float_flag_invalid, status);
4770         }
4771         return 0;
4772     }
4773     aSign = extractFloat64Sign( a );
4774     bSign = extractFloat64Sign( b );
4775     av = float64_val(a);
4776     bv = float64_val(b);
4777     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4778     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4779 
4780 }
4781 
4782 /*----------------------------------------------------------------------------
4783 | Returns 1 if the double-precision floating-point value `a' is less than
4784 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4785 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4786 | Standard for Binary Floating-Point Arithmetic.
4787 *----------------------------------------------------------------------------*/
4788 
4789 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4790 {
4791     flag aSign, bSign;
4792     uint64_t av, bv;
4793     a = float64_squash_input_denormal(a, status);
4794     b = float64_squash_input_denormal(b, status);
4795 
4796     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4797          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4798        ) {
4799         if (float64_is_signaling_nan(a, status)
4800          || float64_is_signaling_nan(b, status)) {
4801             float_raise(float_flag_invalid, status);
4802         }
4803         return 0;
4804     }
4805     aSign = extractFloat64Sign( a );
4806     bSign = extractFloat64Sign( b );
4807     av = float64_val(a);
4808     bv = float64_val(b);
4809     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4810     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4811 
4812 }
4813 
4814 /*----------------------------------------------------------------------------
4815 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4816 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4817 | comparison is performed according to the IEC/IEEE Standard for Binary
4818 | Floating-Point Arithmetic.
4819 *----------------------------------------------------------------------------*/
4820 
4821 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4822 {
4823     a = float64_squash_input_denormal(a, status);
4824     b = float64_squash_input_denormal(b, status);
4825 
4826     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4827          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4828        ) {
4829         if (float64_is_signaling_nan(a, status)
4830          || float64_is_signaling_nan(b, status)) {
4831             float_raise(float_flag_invalid, status);
4832         }
4833         return 1;
4834     }
4835     return 0;
4836 }
4837 
4838 /*----------------------------------------------------------------------------
4839 | Returns the result of converting the extended double-precision floating-
4840 | point value `a' to the 32-bit two's complement integer format.  The
4841 | conversion is performed according to the IEC/IEEE Standard for Binary
4842 | Floating-Point Arithmetic---which means in particular that the conversion
4843 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4844 | largest positive integer is returned.  Otherwise, if the conversion
4845 | overflows, the largest integer with the same sign as `a' is returned.
4846 *----------------------------------------------------------------------------*/
4847 
4848 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4849 {
4850     flag aSign;
4851     int32_t aExp, shiftCount;
4852     uint64_t aSig;
4853 
4854     if (floatx80_invalid_encoding(a)) {
4855         float_raise(float_flag_invalid, status);
4856         return 1 << 31;
4857     }
4858     aSig = extractFloatx80Frac( a );
4859     aExp = extractFloatx80Exp( a );
4860     aSign = extractFloatx80Sign( a );
4861     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4862     shiftCount = 0x4037 - aExp;
4863     if ( shiftCount <= 0 ) shiftCount = 1;
4864     shift64RightJamming( aSig, shiftCount, &aSig );
4865     return roundAndPackInt32(aSign, aSig, status);
4866 
4867 }
4868 
4869 /*----------------------------------------------------------------------------
4870 | Returns the result of converting the extended double-precision floating-
4871 | point value `a' to the 32-bit two's complement integer format.  The
4872 | conversion is performed according to the IEC/IEEE Standard for Binary
4873 | Floating-Point Arithmetic, except that the conversion is always rounded
4874 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4875 | Otherwise, if the conversion overflows, the largest integer with the same
4876 | sign as `a' is returned.
4877 *----------------------------------------------------------------------------*/
4878 
4879 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4880 {
4881     flag aSign;
4882     int32_t aExp, shiftCount;
4883     uint64_t aSig, savedASig;
4884     int32_t z;
4885 
4886     if (floatx80_invalid_encoding(a)) {
4887         float_raise(float_flag_invalid, status);
4888         return 1 << 31;
4889     }
4890     aSig = extractFloatx80Frac( a );
4891     aExp = extractFloatx80Exp( a );
4892     aSign = extractFloatx80Sign( a );
4893     if ( 0x401E < aExp ) {
4894         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4895         goto invalid;
4896     }
4897     else if ( aExp < 0x3FFF ) {
4898         if (aExp || aSig) {
4899             status->float_exception_flags |= float_flag_inexact;
4900         }
4901         return 0;
4902     }
4903     shiftCount = 0x403E - aExp;
4904     savedASig = aSig;
4905     aSig >>= shiftCount;
4906     z = aSig;
4907     if ( aSign ) z = - z;
4908     if ( ( z < 0 ) ^ aSign ) {
4909  invalid:
4910         float_raise(float_flag_invalid, status);
4911         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4912     }
4913     if ( ( aSig<<shiftCount ) != savedASig ) {
4914         status->float_exception_flags |= float_flag_inexact;
4915     }
4916     return z;
4917 
4918 }
4919 
4920 /*----------------------------------------------------------------------------
4921 | Returns the result of converting the extended double-precision floating-
4922 | point value `a' to the 64-bit two's complement integer format.  The
4923 | conversion is performed according to the IEC/IEEE Standard for Binary
4924 | Floating-Point Arithmetic---which means in particular that the conversion
4925 | is rounded according to the current rounding mode.  If `a' is a NaN,
4926 | the largest positive integer is returned.  Otherwise, if the conversion
4927 | overflows, the largest integer with the same sign as `a' is returned.
4928 *----------------------------------------------------------------------------*/
4929 
4930 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4931 {
4932     flag aSign;
4933     int32_t aExp, shiftCount;
4934     uint64_t aSig, aSigExtra;
4935 
4936     if (floatx80_invalid_encoding(a)) {
4937         float_raise(float_flag_invalid, status);
4938         return 1ULL << 63;
4939     }
4940     aSig = extractFloatx80Frac( a );
4941     aExp = extractFloatx80Exp( a );
4942     aSign = extractFloatx80Sign( a );
4943     shiftCount = 0x403E - aExp;
4944     if ( shiftCount <= 0 ) {
4945         if ( shiftCount ) {
4946             float_raise(float_flag_invalid, status);
4947             if (    ! aSign
4948                  || (    ( aExp == 0x7FFF )
4949                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
4950                ) {
4951                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4952             }
4953             return (int64_t) LIT64( 0x8000000000000000 );
4954         }
4955         aSigExtra = 0;
4956     }
4957     else {
4958         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4959     }
4960     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4961 
4962 }
4963 
4964 /*----------------------------------------------------------------------------
4965 | Returns the result of converting the extended double-precision floating-
4966 | point value `a' to the 64-bit two's complement integer format.  The
4967 | conversion is performed according to the IEC/IEEE Standard for Binary
4968 | Floating-Point Arithmetic, except that the conversion is always rounded
4969 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4970 | Otherwise, if the conversion overflows, the largest integer with the same
4971 | sign as `a' is returned.
4972 *----------------------------------------------------------------------------*/
4973 
4974 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4975 {
4976     flag aSign;
4977     int32_t aExp, shiftCount;
4978     uint64_t aSig;
4979     int64_t z;
4980 
4981     if (floatx80_invalid_encoding(a)) {
4982         float_raise(float_flag_invalid, status);
4983         return 1ULL << 63;
4984     }
4985     aSig = extractFloatx80Frac( a );
4986     aExp = extractFloatx80Exp( a );
4987     aSign = extractFloatx80Sign( a );
4988     shiftCount = aExp - 0x403E;
4989     if ( 0 <= shiftCount ) {
4990         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4991         if ( ( a.high != 0xC03E ) || aSig ) {
4992             float_raise(float_flag_invalid, status);
4993             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4994                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4995             }
4996         }
4997         return (int64_t) LIT64( 0x8000000000000000 );
4998     }
4999     else if ( aExp < 0x3FFF ) {
5000         if (aExp | aSig) {
5001             status->float_exception_flags |= float_flag_inexact;
5002         }
5003         return 0;
5004     }
5005     z = aSig>>( - shiftCount );
5006     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5007         status->float_exception_flags |= float_flag_inexact;
5008     }
5009     if ( aSign ) z = - z;
5010     return z;
5011 
5012 }
5013 
5014 /*----------------------------------------------------------------------------
5015 | Returns the result of converting the extended double-precision floating-
5016 | point value `a' to the single-precision floating-point format.  The
5017 | conversion is performed according to the IEC/IEEE Standard for Binary
5018 | Floating-Point Arithmetic.
5019 *----------------------------------------------------------------------------*/
5020 
5021 float32 floatx80_to_float32(floatx80 a, float_status *status)
5022 {
5023     flag aSign;
5024     int32_t aExp;
5025     uint64_t aSig;
5026 
5027     if (floatx80_invalid_encoding(a)) {
5028         float_raise(float_flag_invalid, status);
5029         return float32_default_nan(status);
5030     }
5031     aSig = extractFloatx80Frac( a );
5032     aExp = extractFloatx80Exp( a );
5033     aSign = extractFloatx80Sign( a );
5034     if ( aExp == 0x7FFF ) {
5035         if ( (uint64_t) ( aSig<<1 ) ) {
5036             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5037         }
5038         return packFloat32( aSign, 0xFF, 0 );
5039     }
5040     shift64RightJamming( aSig, 33, &aSig );
5041     if ( aExp || aSig ) aExp -= 0x3F81;
5042     return roundAndPackFloat32(aSign, aExp, aSig, status);
5043 
5044 }
5045 
5046 /*----------------------------------------------------------------------------
5047 | Returns the result of converting the extended double-precision floating-
5048 | point value `a' to the double-precision floating-point format.  The
5049 | conversion is performed according to the IEC/IEEE Standard for Binary
5050 | Floating-Point Arithmetic.
5051 *----------------------------------------------------------------------------*/
5052 
5053 float64 floatx80_to_float64(floatx80 a, float_status *status)
5054 {
5055     flag aSign;
5056     int32_t aExp;
5057     uint64_t aSig, zSig;
5058 
5059     if (floatx80_invalid_encoding(a)) {
5060         float_raise(float_flag_invalid, status);
5061         return float64_default_nan(status);
5062     }
5063     aSig = extractFloatx80Frac( a );
5064     aExp = extractFloatx80Exp( a );
5065     aSign = extractFloatx80Sign( a );
5066     if ( aExp == 0x7FFF ) {
5067         if ( (uint64_t) ( aSig<<1 ) ) {
5068             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5069         }
5070         return packFloat64( aSign, 0x7FF, 0 );
5071     }
5072     shift64RightJamming( aSig, 1, &zSig );
5073     if ( aExp || aSig ) aExp -= 0x3C01;
5074     return roundAndPackFloat64(aSign, aExp, zSig, status);
5075 
5076 }
5077 
5078 /*----------------------------------------------------------------------------
5079 | Returns the result of converting the extended double-precision floating-
5080 | point value `a' to the quadruple-precision floating-point format.  The
5081 | conversion is performed according to the IEC/IEEE Standard for Binary
5082 | Floating-Point Arithmetic.
5083 *----------------------------------------------------------------------------*/
5084 
5085 float128 floatx80_to_float128(floatx80 a, float_status *status)
5086 {
5087     flag aSign;
5088     int aExp;
5089     uint64_t aSig, zSig0, zSig1;
5090 
5091     if (floatx80_invalid_encoding(a)) {
5092         float_raise(float_flag_invalid, status);
5093         return float128_default_nan(status);
5094     }
5095     aSig = extractFloatx80Frac( a );
5096     aExp = extractFloatx80Exp( a );
5097     aSign = extractFloatx80Sign( a );
5098     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5099         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5100     }
5101     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5102     return packFloat128( aSign, aExp, zSig0, zSig1 );
5103 
5104 }
5105 
5106 /*----------------------------------------------------------------------------
5107 | Rounds the extended double-precision floating-point value `a'
5108 | to the precision provided by floatx80_rounding_precision and returns the
5109 | result as an extended double-precision floating-point value.
5110 | The operation is performed according to the IEC/IEEE Standard for Binary
5111 | Floating-Point Arithmetic.
5112 *----------------------------------------------------------------------------*/
5113 
5114 floatx80 floatx80_round(floatx80 a, float_status *status)
5115 {
5116     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5117                                 extractFloatx80Sign(a),
5118                                 extractFloatx80Exp(a),
5119                                 extractFloatx80Frac(a), 0, status);
5120 }
5121 
5122 /*----------------------------------------------------------------------------
5123 | Rounds the extended double-precision floating-point value `a' to an integer,
5124 | and returns the result as an extended double-precision floating-point
5125 | value.  The operation is performed according to the IEC/IEEE Standard for
5126 | Binary Floating-Point Arithmetic.
5127 *----------------------------------------------------------------------------*/
5128 
5129 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5130 {
5131     flag aSign;
5132     int32_t aExp;
5133     uint64_t lastBitMask, roundBitsMask;
5134     floatx80 z;
5135 
5136     if (floatx80_invalid_encoding(a)) {
5137         float_raise(float_flag_invalid, status);
5138         return floatx80_default_nan(status);
5139     }
5140     aExp = extractFloatx80Exp( a );
5141     if ( 0x403E <= aExp ) {
5142         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5143             return propagateFloatx80NaN(a, a, status);
5144         }
5145         return a;
5146     }
5147     if ( aExp < 0x3FFF ) {
5148         if (    ( aExp == 0 )
5149              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5150             return a;
5151         }
5152         status->float_exception_flags |= float_flag_inexact;
5153         aSign = extractFloatx80Sign( a );
5154         switch (status->float_rounding_mode) {
5155          case float_round_nearest_even:
5156             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5157                ) {
5158                 return
5159                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5160             }
5161             break;
5162         case float_round_ties_away:
5163             if (aExp == 0x3FFE) {
5164                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5165             }
5166             break;
5167          case float_round_down:
5168             return
5169                   aSign ?
5170                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5171                 : packFloatx80( 0, 0, 0 );
5172          case float_round_up:
5173             return
5174                   aSign ? packFloatx80( 1, 0, 0 )
5175                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5176         }
5177         return packFloatx80( aSign, 0, 0 );
5178     }
5179     lastBitMask = 1;
5180     lastBitMask <<= 0x403E - aExp;
5181     roundBitsMask = lastBitMask - 1;
5182     z = a;
5183     switch (status->float_rounding_mode) {
5184     case float_round_nearest_even:
5185         z.low += lastBitMask>>1;
5186         if ((z.low & roundBitsMask) == 0) {
5187             z.low &= ~lastBitMask;
5188         }
5189         break;
5190     case float_round_ties_away:
5191         z.low += lastBitMask >> 1;
5192         break;
5193     case float_round_to_zero:
5194         break;
5195     case float_round_up:
5196         if (!extractFloatx80Sign(z)) {
5197             z.low += roundBitsMask;
5198         }
5199         break;
5200     case float_round_down:
5201         if (extractFloatx80Sign(z)) {
5202             z.low += roundBitsMask;
5203         }
5204         break;
5205     default:
5206         abort();
5207     }
5208     z.low &= ~ roundBitsMask;
5209     if ( z.low == 0 ) {
5210         ++z.high;
5211         z.low = LIT64( 0x8000000000000000 );
5212     }
5213     if (z.low != a.low) {
5214         status->float_exception_flags |= float_flag_inexact;
5215     }
5216     return z;
5217 
5218 }
5219 
5220 /*----------------------------------------------------------------------------
5221 | Returns the result of adding the absolute values of the extended double-
5222 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5223 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5224 | The addition is performed according to the IEC/IEEE Standard for Binary
5225 | Floating-Point Arithmetic.
5226 *----------------------------------------------------------------------------*/
5227 
5228 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5229                                 float_status *status)
5230 {
         /*
          * Adds the magnitudes of `a' and `b' and applies sign `zSign'.
          * The operand with the smaller exponent has its significand shifted
          * right to align with the other; the bits shifted out are kept in
          * zSig1 and passed on to roundAndPackFloatx80().
          */
5231     int32_t aExp, bExp, zExp;
5232     uint64_t aSig, bSig, zSig0, zSig1;
5233     int32_t expDiff;
5234 
5235     aSig = extractFloatx80Frac( a );
5236     aExp = extractFloatx80Exp( a );
5237     bSig = extractFloatx80Frac( b );
5238     bExp = extractFloatx80Exp( b );
5239     expDiff = aExp - bExp;
5240     if ( 0 < expDiff ) {
             /* a has the larger exponent. */
5241         if ( aExp == 0x7FFF ) {
                 /* Bits below the integer bit set: a is a NaN; else infinity. */
5242             if ((uint64_t)(aSig << 1)) {
5243                 return propagateFloatx80NaN(a, b, status);
5244             }
5245             return a;
5246         }
             /* NOTE(review): presumably a biased exponent of 0 is treated as 1
              * (denormal), hence one less shift - confirm. */
5247         if ( bExp == 0 ) --expDiff;
5248         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5249         zExp = aExp;
5250     }
5251     else if ( expDiff < 0 ) {
             /* b has the larger exponent. */
5252         if ( bExp == 0x7FFF ) {
5253             if ((uint64_t)(bSig << 1)) {
5254                 return propagateFloatx80NaN(a, b, status);
5255             }
                 /* b is an infinity: the result is an infinity with sign zSign. */
5256             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5257         }
5258         if ( aExp == 0 ) ++expDiff;
5259         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5260         zExp = bExp;
5261     }
5262     else {
             /* Equal exponents: no alignment shift is needed. */
5263         if ( aExp == 0x7FFF ) {
5264             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5265                 return propagateFloatx80NaN(a, b, status);
5266             }
5267             return a;
5268         }
5269         zSig1 = 0;
5270         zSig0 = aSig + bSig;
5271         if ( aExp == 0 ) {
                 /* NOTE(review): zSig0 is 0 here when both inputs are zero, and
                  * the sum can wrap if an input has its top bit set; confirm
                  * normalizeFloatx80Subnormal() and callers cover both cases. */
5272             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5273             goto roundAndPack;
5274         }
5275         zExp = aExp;
             /* Skips the second addition and the top-bit test below. */
5276         goto shiftRight1;
5277     }
5278     zSig0 = aSig + bSig;
         /* Top bit still set: no renormalizing shift is needed. */
5279     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5280  shiftRight1:
         /* Shift the sum right one bit, set the top bit, bump the exponent. */
5281     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5282     zSig0 |= LIT64( 0x8000000000000000 );
5283     ++zExp;
5284  roundAndPack:
5285     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5286                                 zSign, zExp, zSig0, zSig1, status);
5287 }
5288 
5289 /*----------------------------------------------------------------------------
5290 | Returns the result of subtracting the absolute values of the extended
5291 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5292 | difference is negated before being returned.  `zSign' is ignored if the
5293 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5294 | Standard for Binary Floating-Point Arithmetic.
5295 *----------------------------------------------------------------------------*/
5296 
5297 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5298                                 float_status *status)
5299 {
         /*
          * Subtracts the magnitude of `b' from that of `a'.  The result takes
          * sign `zSign' when |a| is the larger operand and the opposite sign
          * when |b| is.  Control flow is label-driven: bExpBigger falls
          * through into bBigger, and aExpBigger falls through into aBigger.
          */
5300     int32_t aExp, bExp, zExp;
5301     uint64_t aSig, bSig, zSig0, zSig1;
5302     int32_t expDiff;
5303 
5304     aSig = extractFloatx80Frac( a );
5305     aExp = extractFloatx80Exp( a );
5306     bSig = extractFloatx80Frac( b );
5307     bExp = extractFloatx80Exp( b );
5308     expDiff = aExp - bExp;
5309     if ( 0 < expDiff ) goto aExpBigger;
5310     if ( expDiff < 0 ) goto bExpBigger;
         /* Equal exponents from here on. */
5311     if ( aExp == 0x7FFF ) {
5312         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5313             return propagateFloatx80NaN(a, b, status);
5314         }
             /* Both are infinities: raise invalid, return the default NaN. */
5315         float_raise(float_flag_invalid, status);
5316         return floatx80_default_nan(status);
5317     }
         /* Exponent 0 is replaced by 1 before it is used as the result exponent. */
5318     if ( aExp == 0 ) {
5319         aExp = 1;
5320         bExp = 1;
5321     }
5322     zSig1 = 0;
5323     if ( bSig < aSig ) goto aBigger;
5324     if ( aSig < bSig ) goto bBigger;
         /* Equal magnitudes: the zero is negative only when rounding down. */
5325     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5326  bExpBigger:
5327     if ( bExp == 0x7FFF ) {
5328         if ((uint64_t)(bSig << 1)) {
5329             return propagateFloatx80NaN(a, b, status);
5330         }
             /* b is an infinity: result is an infinity with the flipped sign. */
5331         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5332     }
         /* NOTE(review): presumably a biased exponent of 0 is treated as 1
          * (denormal), hence one less shift - confirm. */
5333     if ( aExp == 0 ) ++expDiff;
         /* Align a with b; shifted-out bits are kept in zSig1. */
5334     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5335  bBigger:
         /* 128-bit b - a; the sign flips because |b| is the larger operand. */
5336     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5337     zExp = bExp;
5338     zSign ^= 1;
5339     goto normalizeRoundAndPack;
5340  aExpBigger:
5341     if ( aExp == 0x7FFF ) {
5342         if ((uint64_t)(aSig << 1)) {
5343             return propagateFloatx80NaN(a, b, status);
5344         }
             /* a is an infinity and b is finite: a is returned unchanged. */
5345         return a;
5346     }
5347     if ( bExp == 0 ) --expDiff;
         /* Align b with a; shifted-out bits are kept in zSig1. */
5348     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5349  aBigger:
         /* 128-bit a - b; the sign stays zSign. */
5350     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5351     zExp = aExp;
5352  normalizeRoundAndPack:
5353     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5354                                          zSign, zExp, zSig0, zSig1, status);
5355 }
5356 
5357 /*----------------------------------------------------------------------------
5358 | Returns the result of adding the extended double-precision floating-point
5359 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5360 | Standard for Binary Floating-Point Arithmetic.
5361 *----------------------------------------------------------------------------*/
5362 
5363 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5364 {
5365     flag aSign, bSign;
5366 
5367     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5368         float_raise(float_flag_invalid, status);
5369         return floatx80_default_nan(status);
5370     }
5371     aSign = extractFloatx80Sign( a );
5372     bSign = extractFloatx80Sign( b );
5373     if ( aSign == bSign ) {
5374         return addFloatx80Sigs(a, b, aSign, status);
5375     }
5376     else {
5377         return subFloatx80Sigs(a, b, aSign, status);
5378     }
5379 
5380 }
5381 
5382 /*----------------------------------------------------------------------------
5383 | Returns the result of subtracting the extended double-precision floating-
5384 | point values `a' and `b'.  The operation is performed according to the
5385 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5386 *----------------------------------------------------------------------------*/
5387 
5388 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5389 {
5390     flag aSign, bSign;
5391 
5392     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5393         float_raise(float_flag_invalid, status);
5394         return floatx80_default_nan(status);
5395     }
5396     aSign = extractFloatx80Sign( a );
5397     bSign = extractFloatx80Sign( b );
5398     if ( aSign == bSign ) {
5399         return subFloatx80Sigs(a, b, aSign, status);
5400     }
5401     else {
5402         return addFloatx80Sigs(a, b, aSign, status);
5403     }
5404 
5405 }
5406 
5407 /*----------------------------------------------------------------------------
5408 | Returns the result of multiplying the extended double-precision floating-
5409 | point values `a' and `b'.  The operation is performed according to the
5410 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5411 *----------------------------------------------------------------------------*/
5412 
5413 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5414 {
5415     flag aSign, bSign, zSign;
5416     int32_t aExp, bExp, zExp;
5417     uint64_t aSig, bSig, zSig0, zSig1;
5418 
5419     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5420         float_raise(float_flag_invalid, status);
5421         return floatx80_default_nan(status);
5422     }
5423     aSig = extractFloatx80Frac( a );
5424     aExp = extractFloatx80Exp( a );
5425     aSign = extractFloatx80Sign( a );
5426     bSig = extractFloatx80Frac( b );
5427     bExp = extractFloatx80Exp( b );
5428     bSign = extractFloatx80Sign( b );
5429     zSign = aSign ^ bSign;
5430     if ( aExp == 0x7FFF ) {
5431         if (    (uint64_t) ( aSig<<1 )
5432              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5433             return propagateFloatx80NaN(a, b, status);
5434         }
5435         if ( ( bExp | bSig ) == 0 ) goto invalid;
5436         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5437     }
5438     if ( bExp == 0x7FFF ) {
5439         if ((uint64_t)(bSig << 1)) {
5440             return propagateFloatx80NaN(a, b, status);
5441         }
5442         if ( ( aExp | aSig ) == 0 ) {
5443  invalid:
5444             float_raise(float_flag_invalid, status);
5445             return floatx80_default_nan(status);
5446         }
5447         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5448     }
5449     if ( aExp == 0 ) {
5450         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5451         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5452     }
5453     if ( bExp == 0 ) {
5454         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5455         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5456     }
5457     zExp = aExp + bExp - 0x3FFE;
5458     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5459     if ( 0 < (int64_t) zSig0 ) {
5460         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5461         --zExp;
5462     }
5463     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5464                                 zSign, zExp, zSig0, zSig1, status);
5465 }
5466 
5467 /*----------------------------------------------------------------------------
5468 | Returns the result of dividing the extended double-precision floating-point
5469 | value `a' by the corresponding value `b'.  The operation is performed
5470 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5471 *----------------------------------------------------------------------------*/
5472 
5473 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5474 {
         /*
          * a / b.  Special cases are handled first (invalid encodings, NaNs,
          * infinities, zero divisor, zero dividend); then the quotient is
          * built as two 64-bit words, zSig0 and zSig1, each taken from an
          * estimate that is corrected while the remainder is negative.
          */
5475     flag aSign, bSign, zSign;
5476     int32_t aExp, bExp, zExp;
5477     uint64_t aSig, bSig, zSig0, zSig1;
5478     uint64_t rem0, rem1, rem2, term0, term1, term2;
5479 
5480     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5481         float_raise(float_flag_invalid, status);
5482         return floatx80_default_nan(status);
5483     }
5484     aSig = extractFloatx80Frac( a );
5485     aExp = extractFloatx80Exp( a );
5486     aSign = extractFloatx80Sign( a );
5487     bSig = extractFloatx80Frac( b );
5488     bExp = extractFloatx80Exp( b );
5489     bSign = extractFloatx80Sign( b );
5490     zSign = aSign ^ bSign;
5491     if ( aExp == 0x7FFF ) {
             /* a is a NaN or an infinity. */
5492         if ((uint64_t)(aSig << 1)) {
5493             return propagateFloatx80NaN(a, b, status);
5494         }
5495         if ( bExp == 0x7FFF ) {
5496             if ((uint64_t)(bSig << 1)) {
5497                 return propagateFloatx80NaN(a, b, status);
5498             }
                 /* infinity / infinity */
5499             goto invalid;
5500         }
             /* infinity / finite: signed infinity. */
5501         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5502     }
5503     if ( bExp == 0x7FFF ) {
5504         if ((uint64_t)(bSig << 1)) {
5505             return propagateFloatx80NaN(a, b, status);
5506         }
             /* finite / infinity: signed zero. */
5507         return packFloatx80( zSign, 0, 0 );
5508     }
5509     if ( bExp == 0 ) {
5510         if ( bSig == 0 ) {
                 /* Zero divisor: 0/0 is invalid; otherwise divide-by-zero. */
5511             if ( ( aExp | aSig ) == 0 ) {
5512  invalid:
5513                 float_raise(float_flag_invalid, status);
5514                 return floatx80_default_nan(status);
5515             }
5516             float_raise(float_flag_divbyzero, status);
5517             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5518         }
5519         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5520     }
5521     if ( aExp == 0 ) {
             /* Zero dividend with a nonzero divisor: signed zero. */
5522         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5523         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5524     }
5525     zExp = aExp - bExp + 0x3FFE;
5526     rem1 = 0;
         /* NOTE(review): presumably halving the dividend when it is not below
          * the divisor keeps the quotient within 64 bits - confirm. */
5527     if ( bSig <= aSig ) {
5528         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5529         ++zExp;
5530     }
         /* High quotient word: estimate, then lower it while the remainder
          * (aSig:rem1 - bSig * zSig0) is negative. */
5531     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5532     mul64To128( bSig, zSig0, &term0, &term1 );
5533     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5534     while ( (int64_t) rem0 < 0 ) {
5535         --zSig0;
5536         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5537     }
         /* Low quotient word, estimated from the remaining remainder. */
5538     zSig1 = estimateDiv128To64( rem1, 0, bSig );
         /* The exact correction runs only when the estimate, ignoring its top
          * bit, is small.  NOTE(review): presumably larger estimates cannot
          * affect rounding - confirm against estimateDiv128To64()'s error. */
5539     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5540         mul64To128( bSig, zSig1, &term1, &term2 );
5541         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5542         while ( (int64_t) rem1 < 0 ) {
5543             --zSig1;
5544             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5545         }
             /* Sticky bit: set if any remainder is left. */
5546         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5547     }
5548     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5549                                 zSign, zExp, zSig0, zSig1, status);
5550 }
5551 
5552 /*----------------------------------------------------------------------------
5553 | Returns the remainder of the extended double-precision floating-point value
5554 | `a' with respect to the corresponding value `b'.  The operation is performed
5555 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5556 *----------------------------------------------------------------------------*/
5557 
5558 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5559 {
5560     flag aSign, zSign;
5561     int32_t aExp, bExp, expDiff;
5562     uint64_t aSig0, aSig1, bSig;
5563     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5564 
5565     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5566         float_raise(float_flag_invalid, status);
5567         return floatx80_default_nan(status);
5568     }
5569     aSig0 = extractFloatx80Frac( a );
5570     aExp = extractFloatx80Exp( a );
5571     aSign = extractFloatx80Sign( a );
5572     bSig = extractFloatx80Frac( b );
5573     bExp = extractFloatx80Exp( b );
5574     if ( aExp == 0x7FFF ) {
5575         if (    (uint64_t) ( aSig0<<1 )
5576              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5577             return propagateFloatx80NaN(a, b, status);
5578         }
5579         goto invalid;
5580     }
5581     if ( bExp == 0x7FFF ) {
5582         if ((uint64_t)(bSig << 1)) {
5583             return propagateFloatx80NaN(a, b, status);
5584         }
5585         return a;
5586     }
5587     if ( bExp == 0 ) {
5588         if ( bSig == 0 ) {
5589  invalid:
5590             float_raise(float_flag_invalid, status);
5591             return floatx80_default_nan(status);
5592         }
5593         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5594     }
5595     if ( aExp == 0 ) {
5596         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5597         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5598     }
5599     bSig |= LIT64( 0x8000000000000000 );
5600     zSign = aSign;
5601     expDiff = aExp - bExp;
5602     aSig1 = 0;
5603     if ( expDiff < 0 ) {
5604         if ( expDiff < -1 ) return a;
5605         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5606         expDiff = 0;
5607     }
5608     q = ( bSig <= aSig0 );
5609     if ( q ) aSig0 -= bSig;
5610     expDiff -= 64;
5611     while ( 0 < expDiff ) {
5612         q = estimateDiv128To64( aSig0, aSig1, bSig );
5613         q = ( 2 < q ) ? q - 2 : 0;
5614         mul64To128( bSig, q, &term0, &term1 );
5615         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5616         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5617         expDiff -= 62;
5618     }
5619     expDiff += 64;
5620     if ( 0 < expDiff ) {
5621         q = estimateDiv128To64( aSig0, aSig1, bSig );
5622         q = ( 2 < q ) ? q - 2 : 0;
5623         q >>= 64 - expDiff;
5624         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5625         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5626         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5627         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5628             ++q;
5629             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5630         }
5631     }
5632     else {
5633         term1 = 0;
5634         term0 = bSig;
5635     }
5636     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5637     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5638          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5639               && ( q & 1 ) )
5640        ) {
5641         aSig0 = alternateASig0;
5642         aSig1 = alternateASig1;
5643         zSign = ! zSign;
5644     }
5645     return
5646         normalizeRoundAndPackFloatx80(
5647             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5648 
5649 }
5650 
5651 /*----------------------------------------------------------------------------
5652 | Returns the square root of the extended double-precision floating-point
5653 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5654 | for Binary Floating-Point Arithmetic.
5655 *----------------------------------------------------------------------------*/
5656 
5657 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5658 {
5659     flag aSign;
5660     int32_t aExp, zExp;
5661     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5662     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5663 
5664     if (floatx80_invalid_encoding(a)) {
5665         float_raise(float_flag_invalid, status);
5666         return floatx80_default_nan(status);
5667     }
5668     aSig0 = extractFloatx80Frac( a );
5669     aExp = extractFloatx80Exp( a );
5670     aSign = extractFloatx80Sign( a );
5671     if ( aExp == 0x7FFF ) {
5672         if ((uint64_t)(aSig0 << 1)) {
5673             return propagateFloatx80NaN(a, a, status);
5674         }
5675         if ( ! aSign ) return a;
5676         goto invalid;
5677     }
5678     if ( aSign ) {
5679         if ( ( aExp | aSig0 ) == 0 ) return a;
5680  invalid:
5681         float_raise(float_flag_invalid, status);
5682         return floatx80_default_nan(status);
5683     }
5684     if ( aExp == 0 ) {
5685         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5686         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5687     }
5688     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5689     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5690     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5691     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5692     doubleZSig0 = zSig0<<1;
5693     mul64To128( zSig0, zSig0, &term0, &term1 );
5694     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5695     while ( (int64_t) rem0 < 0 ) {
5696         --zSig0;
5697         doubleZSig0 -= 2;
5698         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5699     }
5700     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5701     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5702         if ( zSig1 == 0 ) zSig1 = 1;
5703         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5704         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5705         mul64To128( zSig1, zSig1, &term2, &term3 );
5706         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5707         while ( (int64_t) rem1 < 0 ) {
5708             --zSig1;
5709             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5710             term3 |= 1;
5711             term2 |= doubleZSig0;
5712             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5713         }
5714         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5715     }
5716     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5717     zSig0 |= doubleZSig0;
5718     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5719                                 0, zExp, zSig0, zSig1, status);
5720 }
5721 
5722 /*----------------------------------------------------------------------------
5723 | Returns 1 if the extended double-precision floating-point value `a' is equal
5724 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5725 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5726 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5727 *----------------------------------------------------------------------------*/
5728 
5729 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5730 {
5731 
5732     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5733         || (extractFloatx80Exp(a) == 0x7FFF
5734             && (uint64_t) (extractFloatx80Frac(a) << 1))
5735         || (extractFloatx80Exp(b) == 0x7FFF
5736             && (uint64_t) (extractFloatx80Frac(b) << 1))
5737        ) {
5738         float_raise(float_flag_invalid, status);
5739         return 0;
5740     }
5741     return
5742            ( a.low == b.low )
5743         && (    ( a.high == b.high )
5744              || (    ( a.low == 0 )
5745                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5746            );
5747 
5748 }
5749 
5750 /*----------------------------------------------------------------------------
5751 | Returns 1 if the extended double-precision floating-point value `a' is
5752 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5753 | invalid exception is raised if either operand is a NaN.  The comparison is
5754 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5755 | Arithmetic.
5756 *----------------------------------------------------------------------------*/
5757 
5758 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5759 {
5760     flag aSign, bSign;
5761 
5762     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5763         || (extractFloatx80Exp(a) == 0x7FFF
5764             && (uint64_t) (extractFloatx80Frac(a) << 1))
5765         || (extractFloatx80Exp(b) == 0x7FFF
5766             && (uint64_t) (extractFloatx80Frac(b) << 1))
5767        ) {
5768         float_raise(float_flag_invalid, status);
5769         return 0;
5770     }
5771     aSign = extractFloatx80Sign( a );
5772     bSign = extractFloatx80Sign( b );
5773     if ( aSign != bSign ) {
5774         return
5775                aSign
5776             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5777                  == 0 );
5778     }
5779     return
5780           aSign ? le128( b.high, b.low, a.high, a.low )
5781         : le128( a.high, a.low, b.high, b.low );
5782 
5783 }
5784 
5785 /*----------------------------------------------------------------------------
5786 | Returns 1 if the extended double-precision floating-point value `a' is
5787 | less than the corresponding value `b', and 0 otherwise.  The invalid
5788 | exception is raised if either operand is a NaN.  The comparison is performed
5789 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5790 *----------------------------------------------------------------------------*/
5791 
5792 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5793 {
5794     flag aSign, bSign;
5795 
5796     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5797         || (extractFloatx80Exp(a) == 0x7FFF
5798             && (uint64_t) (extractFloatx80Frac(a) << 1))
5799         || (extractFloatx80Exp(b) == 0x7FFF
5800             && (uint64_t) (extractFloatx80Frac(b) << 1))
5801        ) {
5802         float_raise(float_flag_invalid, status);
5803         return 0;
5804     }
5805     aSign = extractFloatx80Sign( a );
5806     bSign = extractFloatx80Sign( b );
5807     if ( aSign != bSign ) {
5808         return
5809                aSign
5810             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5811                  != 0 );
5812     }
5813     return
5814           aSign ? lt128( b.high, b.low, a.high, a.low )
5815         : lt128( a.high, a.low, b.high, b.low );
5816 
5817 }
5818 
5819 /*----------------------------------------------------------------------------
5820 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5821 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5822 | either operand is a NaN.   The comparison is performed according to the
5823 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5824 *----------------------------------------------------------------------------*/
5825 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5826 {
5827     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5828         || (extractFloatx80Exp(a) == 0x7FFF
5829             && (uint64_t) (extractFloatx80Frac(a) << 1))
5830         || (extractFloatx80Exp(b) == 0x7FFF
5831             && (uint64_t) (extractFloatx80Frac(b) << 1))
5832        ) {
5833         float_raise(float_flag_invalid, status);
5834         return 1;
5835     }
5836     return 0;
5837 }
5838 
5839 /*----------------------------------------------------------------------------
5840 | Returns 1 if the extended double-precision floating-point value `a' is
5841 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5842 | cause an exception.  The comparison is performed according to the IEC/IEEE
5843 | Standard for Binary Floating-Point Arithmetic.
5844 *----------------------------------------------------------------------------*/
5845 
5846 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5847 {
5848 
5849     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5850         float_raise(float_flag_invalid, status);
5851         return 0;
5852     }
5853     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5854               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5855          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5856               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5857        ) {
5858         if (floatx80_is_signaling_nan(a, status)
5859          || floatx80_is_signaling_nan(b, status)) {
5860             float_raise(float_flag_invalid, status);
5861         }
5862         return 0;
5863     }
5864     return
5865            ( a.low == b.low )
5866         && (    ( a.high == b.high )
5867              || (    ( a.low == 0 )
5868                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5869            );
5870 
5871 }
5872 
5873 /*----------------------------------------------------------------------------
5874 | Returns 1 if the extended double-precision floating-point value `a' is less
5875 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5876 | do not cause an exception.  Otherwise, the comparison is performed according
5877 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5878 *----------------------------------------------------------------------------*/
5879 
5880 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5881 {
5882     flag aSign, bSign;
5883 
5884     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5885         float_raise(float_flag_invalid, status);
5886         return 0;
5887     }
5888     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5889               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5890          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5891               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5892        ) {
5893         if (floatx80_is_signaling_nan(a, status)
5894          || floatx80_is_signaling_nan(b, status)) {
5895             float_raise(float_flag_invalid, status);
5896         }
5897         return 0;
5898     }
5899     aSign = extractFloatx80Sign( a );
5900     bSign = extractFloatx80Sign( b );
5901     if ( aSign != bSign ) {
5902         return
5903                aSign
5904             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5905                  == 0 );
5906     }
5907     return
5908           aSign ? le128( b.high, b.low, a.high, a.low )
5909         : le128( a.high, a.low, b.high, b.low );
5910 
5911 }
5912 
5913 /*----------------------------------------------------------------------------
5914 | Returns 1 if the extended double-precision floating-point value `a' is less
5915 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5916 | an exception.  Otherwise, the comparison is performed according to the
5917 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5918 *----------------------------------------------------------------------------*/
5919 
5920 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5921 {
5922     flag aSign, bSign;
5923 
5924     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5925         float_raise(float_flag_invalid, status);
5926         return 0;
5927     }
5928     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5929               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5930          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5931               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5932        ) {
5933         if (floatx80_is_signaling_nan(a, status)
5934          || floatx80_is_signaling_nan(b, status)) {
5935             float_raise(float_flag_invalid, status);
5936         }
5937         return 0;
5938     }
5939     aSign = extractFloatx80Sign( a );
5940     bSign = extractFloatx80Sign( b );
5941     if ( aSign != bSign ) {
5942         return
5943                aSign
5944             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5945                  != 0 );
5946     }
5947     return
5948           aSign ? lt128( b.high, b.low, a.high, a.low )
5949         : lt128( a.high, a.low, b.high, b.low );
5950 
5951 }
5952 
5953 /*----------------------------------------------------------------------------
5954 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5955 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5956 | The comparison is performed according to the IEC/IEEE Standard for Binary
5957 | Floating-Point Arithmetic.
5958 *----------------------------------------------------------------------------*/
5959 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5960 {
5961     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5962         float_raise(float_flag_invalid, status);
5963         return 1;
5964     }
5965     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5966               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5967          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5968               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5969        ) {
5970         if (floatx80_is_signaling_nan(a, status)
5971          || floatx80_is_signaling_nan(b, status)) {
5972             float_raise(float_flag_invalid, status);
5973         }
5974         return 1;
5975     }
5976     return 0;
5977 }
5978 
5979 /*----------------------------------------------------------------------------
5980 | Returns the result of converting the quadruple-precision floating-point
5981 | value `a' to the 32-bit two's complement integer format.  The conversion
5982 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5983 | Arithmetic---which means in particular that the conversion is rounded
5984 | according to the current rounding mode.  If `a' is a NaN, the largest
5985 | positive integer is returned.  Otherwise, if the conversion overflows, the
5986 | largest integer with the same sign as `a' is returned.
5987 *----------------------------------------------------------------------------*/
5988 
5989 int32_t float128_to_int32(float128 a, float_status *status)
5990 {
5991     flag aSign;
5992     int32_t aExp, shiftCount;
5993     uint64_t aSig0, aSig1;
5994 
5995     aSig1 = extractFloat128Frac1( a );
5996     aSig0 = extractFloat128Frac0( a );
5997     aExp = extractFloat128Exp( a );
5998     aSign = extractFloat128Sign( a );
5999     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6000     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6001     aSig0 |= ( aSig1 != 0 );
6002     shiftCount = 0x4028 - aExp;
6003     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6004     return roundAndPackInt32(aSign, aSig0, status);
6005 
6006 }
6007 
6008 /*----------------------------------------------------------------------------
6009 | Returns the result of converting the quadruple-precision floating-point
6010 | value `a' to the 32-bit two's complement integer format.  The conversion
6011 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6012 | Arithmetic, except that the conversion is always rounded toward zero.  If
6013 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6014 | conversion overflows, the largest integer with the same sign as `a' is
6015 | returned.
6016 *----------------------------------------------------------------------------*/
6017 
6018 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6019 {
6020     flag aSign;
6021     int32_t aExp, shiftCount;
6022     uint64_t aSig0, aSig1, savedASig;
6023     int32_t z;
6024 
6025     aSig1 = extractFloat128Frac1( a );
6026     aSig0 = extractFloat128Frac0( a );
6027     aExp = extractFloat128Exp( a );
6028     aSign = extractFloat128Sign( a );
6029     aSig0 |= ( aSig1 != 0 );
6030     if ( 0x401E < aExp ) {
6031         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6032         goto invalid;
6033     }
6034     else if ( aExp < 0x3FFF ) {
6035         if (aExp || aSig0) {
6036             status->float_exception_flags |= float_flag_inexact;
6037         }
6038         return 0;
6039     }
6040     aSig0 |= LIT64( 0x0001000000000000 );
6041     shiftCount = 0x402F - aExp;
6042     savedASig = aSig0;
6043     aSig0 >>= shiftCount;
6044     z = aSig0;
6045     if ( aSign ) z = - z;
6046     if ( ( z < 0 ) ^ aSign ) {
6047  invalid:
6048         float_raise(float_flag_invalid, status);
6049         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
6050     }
6051     if ( ( aSig0<<shiftCount ) != savedASig ) {
6052         status->float_exception_flags |= float_flag_inexact;
6053     }
6054     return z;
6055 
6056 }
6057 
6058 /*----------------------------------------------------------------------------
6059 | Returns the result of converting the quadruple-precision floating-point
6060 | value `a' to the 64-bit two's complement integer format.  The conversion
6061 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6062 | Arithmetic---which means in particular that the conversion is rounded
6063 | according to the current rounding mode.  If `a' is a NaN, the largest
6064 | positive integer is returned.  Otherwise, if the conversion overflows, the
6065 | largest integer with the same sign as `a' is returned.
6066 *----------------------------------------------------------------------------*/
6067 
6068 int64_t float128_to_int64(float128 a, float_status *status)
6069 {
6070     flag aSign;
6071     int32_t aExp, shiftCount;
6072     uint64_t aSig0, aSig1;
6073 
6074     aSig1 = extractFloat128Frac1( a );
6075     aSig0 = extractFloat128Frac0( a );
6076     aExp = extractFloat128Exp( a );
6077     aSign = extractFloat128Sign( a );
6078     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6079     shiftCount = 0x402F - aExp;
6080     if ( shiftCount <= 0 ) {
6081         if ( 0x403E < aExp ) {
6082             float_raise(float_flag_invalid, status);
6083             if (    ! aSign
6084                  || (    ( aExp == 0x7FFF )
6085                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
6086                     )
6087                ) {
6088                 return LIT64( 0x7FFFFFFFFFFFFFFF );
6089             }
6090             return (int64_t) LIT64( 0x8000000000000000 );
6091         }
6092         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6093     }
6094     else {
6095         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6096     }
6097     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6098 
6099 }
6100 
6101 /*----------------------------------------------------------------------------
6102 | Returns the result of converting the quadruple-precision floating-point
6103 | value `a' to the 64-bit two's complement integer format.  The conversion
6104 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6105 | Arithmetic, except that the conversion is always rounded toward zero.
6106 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6107 | the conversion overflows, the largest integer with the same sign as `a' is
6108 | returned.
6109 *----------------------------------------------------------------------------*/
6110 
6111 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6112 {
6113     flag aSign;
6114     int32_t aExp, shiftCount;
6115     uint64_t aSig0, aSig1;
6116     int64_t z;
6117 
6118     aSig1 = extractFloat128Frac1( a );
6119     aSig0 = extractFloat128Frac0( a );
6120     aExp = extractFloat128Exp( a );
6121     aSign = extractFloat128Sign( a );
6122     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6123     shiftCount = aExp - 0x402F;
6124     if ( 0 < shiftCount ) {
6125         if ( 0x403E <= aExp ) {
6126             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
6127             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
6128                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
6129                 if (aSig1) {
6130                     status->float_exception_flags |= float_flag_inexact;
6131                 }
6132             }
6133             else {
6134                 float_raise(float_flag_invalid, status);
6135                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6136                     return LIT64( 0x7FFFFFFFFFFFFFFF );
6137                 }
6138             }
6139             return (int64_t) LIT64( 0x8000000000000000 );
6140         }
6141         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6142         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6143             status->float_exception_flags |= float_flag_inexact;
6144         }
6145     }
6146     else {
6147         if ( aExp < 0x3FFF ) {
6148             if ( aExp | aSig0 | aSig1 ) {
6149                 status->float_exception_flags |= float_flag_inexact;
6150             }
6151             return 0;
6152         }
6153         z = aSig0>>( - shiftCount );
6154         if (    aSig1
6155              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6156             status->float_exception_flags |= float_flag_inexact;
6157         }
6158     }
6159     if ( aSign ) z = - z;
6160     return z;
6161 
6162 }
6163 
6164 /*----------------------------------------------------------------------------
6165 | Returns the result of converting the quadruple-precision floating-point value
6166 | `a' to the 64-bit unsigned integer format.  The conversion is
6167 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6168 | Arithmetic---which means in particular that the conversion is rounded
6169 | according to the current rounding mode.  If `a' is a NaN, the largest
6170 | positive integer is returned.  If the conversion overflows, the
6171 | largest unsigned integer is returned.  If 'a' is negative, the value is
6172 | rounded and zero is returned; negative values that do not round to zero
6173 | will raise the inexact exception.
6174 *----------------------------------------------------------------------------*/
6175 
6176 uint64_t float128_to_uint64(float128 a, float_status *status)
6177 {
6178     flag aSign;
6179     int aExp;
6180     int shiftCount;
6181     uint64_t aSig0, aSig1;
6182 
6183     aSig0 = extractFloat128Frac0(a);
6184     aSig1 = extractFloat128Frac1(a);
6185     aExp = extractFloat128Exp(a);
6186     aSign = extractFloat128Sign(a);
6187     if (aSign && (aExp > 0x3FFE)) {
6188         float_raise(float_flag_invalid, status);
6189         if (float128_is_any_nan(a)) {
6190             return LIT64(0xFFFFFFFFFFFFFFFF);
6191         } else {
6192             return 0;
6193         }
6194     }
6195     if (aExp) {
6196         aSig0 |= LIT64(0x0001000000000000);
6197     }
6198     shiftCount = 0x402F - aExp;
6199     if (shiftCount <= 0) {
6200         if (0x403E < aExp) {
6201             float_raise(float_flag_invalid, status);
6202             return LIT64(0xFFFFFFFFFFFFFFFF);
6203         }
6204         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6205     } else {
6206         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6207     }
6208     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6209 }
6210 
6211 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6212 {
6213     uint64_t v;
6214     signed char current_rounding_mode = status->float_rounding_mode;
6215 
6216     set_float_rounding_mode(float_round_to_zero, status);
6217     v = float128_to_uint64(a, status);
6218     set_float_rounding_mode(current_rounding_mode, status);
6219 
6220     return v;
6221 }
6222 
6223 /*----------------------------------------------------------------------------
6224 | Returns the result of converting the quadruple-precision floating-point
6225 | value `a' to the 32-bit unsigned integer format.  The conversion
6226 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6227 | Arithmetic except that the conversion is always rounded toward zero.
6228 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6229 | if the conversion overflows, the largest unsigned integer is returned.
6230 | If 'a' is negative, the value is rounded and zero is returned; negative
6231 | values that do not round to zero will raise the inexact exception.
6232 *----------------------------------------------------------------------------*/
6233 
6234 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6235 {
6236     uint64_t v;
6237     uint32_t res;
6238     int old_exc_flags = get_float_exception_flags(status);
6239 
6240     v = float128_to_uint64_round_to_zero(a, status);
6241     if (v > 0xffffffff) {
6242         res = 0xffffffff;
6243     } else {
6244         return v;
6245     }
6246     set_float_exception_flags(old_exc_flags, status);
6247     float_raise(float_flag_invalid, status);
6248     return res;
6249 }
6250 
6251 /*----------------------------------------------------------------------------
6252 | Returns the result of converting the quadruple-precision floating-point
6253 | value `a' to the single-precision floating-point format.  The conversion
6254 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6255 | Arithmetic.
6256 *----------------------------------------------------------------------------*/
6257 
6258 float32 float128_to_float32(float128 a, float_status *status)
6259 {
6260     flag aSign;
6261     int32_t aExp;
6262     uint64_t aSig0, aSig1;
6263     uint32_t zSig;
6264 
6265     aSig1 = extractFloat128Frac1( a );
6266     aSig0 = extractFloat128Frac0( a );
6267     aExp = extractFloat128Exp( a );
6268     aSign = extractFloat128Sign( a );
6269     if ( aExp == 0x7FFF ) {
6270         if ( aSig0 | aSig1 ) {
6271             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6272         }
6273         return packFloat32( aSign, 0xFF, 0 );
6274     }
6275     aSig0 |= ( aSig1 != 0 );
6276     shift64RightJamming( aSig0, 18, &aSig0 );
6277     zSig = aSig0;
6278     if ( aExp || zSig ) {
6279         zSig |= 0x40000000;
6280         aExp -= 0x3F81;
6281     }
6282     return roundAndPackFloat32(aSign, aExp, zSig, status);
6283 
6284 }
6285 
6286 /*----------------------------------------------------------------------------
6287 | Returns the result of converting the quadruple-precision floating-point
6288 | value `a' to the double-precision floating-point format.  The conversion
6289 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6290 | Arithmetic.
6291 *----------------------------------------------------------------------------*/
6292 
6293 float64 float128_to_float64(float128 a, float_status *status)
6294 {
6295     flag aSign;
6296     int32_t aExp;
6297     uint64_t aSig0, aSig1;
6298 
6299     aSig1 = extractFloat128Frac1( a );
6300     aSig0 = extractFloat128Frac0( a );
6301     aExp = extractFloat128Exp( a );
6302     aSign = extractFloat128Sign( a );
6303     if ( aExp == 0x7FFF ) {
6304         if ( aSig0 | aSig1 ) {
6305             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6306         }
6307         return packFloat64( aSign, 0x7FF, 0 );
6308     }
6309     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6310     aSig0 |= ( aSig1 != 0 );
6311     if ( aExp || aSig0 ) {
6312         aSig0 |= LIT64( 0x4000000000000000 );
6313         aExp -= 0x3C01;
6314     }
6315     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6316 
6317 }
6318 
6319 /*----------------------------------------------------------------------------
6320 | Returns the result of converting the quadruple-precision floating-point
6321 | value `a' to the extended double-precision floating-point format.  The
6322 | conversion is performed according to the IEC/IEEE Standard for Binary
6323 | Floating-Point Arithmetic.
6324 *----------------------------------------------------------------------------*/
6325 
6326 floatx80 float128_to_floatx80(float128 a, float_status *status)
6327 {
6328     flag aSign;
6329     int32_t aExp;
6330     uint64_t aSig0, aSig1;
6331 
6332     aSig1 = extractFloat128Frac1( a );
6333     aSig0 = extractFloat128Frac0( a );
6334     aExp = extractFloat128Exp( a );
6335     aSign = extractFloat128Sign( a );
6336     if ( aExp == 0x7FFF ) {
6337         if ( aSig0 | aSig1 ) {
6338             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6339         }
6340         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6341     }
6342     if ( aExp == 0 ) {
6343         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6344         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6345     }
6346     else {
6347         aSig0 |= LIT64( 0x0001000000000000 );
6348     }
6349     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6350     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6351 
6352 }
6353 
6354 /*----------------------------------------------------------------------------
6355 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6356 | returns the result as a quadruple-precision floating-point value.  The
6357 | operation is performed according to the IEC/IEEE Standard for Binary
6358 | Floating-Point Arithmetic.
6359 *----------------------------------------------------------------------------*/
6360 
6361 float128 float128_round_to_int(float128 a, float_status *status)
6362 {
6363     flag aSign;
6364     int32_t aExp;
6365     uint64_t lastBitMask, roundBitsMask;
6366     float128 z;
6367 
6368     aExp = extractFloat128Exp( a );
6369     if ( 0x402F <= aExp ) {
6370         if ( 0x406F <= aExp ) {
6371             if (    ( aExp == 0x7FFF )
6372                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6373                ) {
6374                 return propagateFloat128NaN(a, a, status);
6375             }
6376             return a;
6377         }
6378         lastBitMask = 1;
6379         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6380         roundBitsMask = lastBitMask - 1;
6381         z = a;
6382         switch (status->float_rounding_mode) {
6383         case float_round_nearest_even:
6384             if ( lastBitMask ) {
6385                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6386                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6387             }
6388             else {
6389                 if ( (int64_t) z.low < 0 ) {
6390                     ++z.high;
6391                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6392                 }
6393             }
6394             break;
6395         case float_round_ties_away:
6396             if (lastBitMask) {
6397                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6398             } else {
6399                 if ((int64_t) z.low < 0) {
6400                     ++z.high;
6401                 }
6402             }
6403             break;
6404         case float_round_to_zero:
6405             break;
6406         case float_round_up:
6407             if (!extractFloat128Sign(z)) {
6408                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6409             }
6410             break;
6411         case float_round_down:
6412             if (extractFloat128Sign(z)) {
6413                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6414             }
6415             break;
6416         default:
6417             abort();
6418         }
6419         z.low &= ~ roundBitsMask;
6420     }
6421     else {
6422         if ( aExp < 0x3FFF ) {
6423             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6424             status->float_exception_flags |= float_flag_inexact;
6425             aSign = extractFloat128Sign( a );
6426             switch (status->float_rounding_mode) {
6427              case float_round_nearest_even:
6428                 if (    ( aExp == 0x3FFE )
6429                      && (   extractFloat128Frac0( a )
6430                           | extractFloat128Frac1( a ) )
6431                    ) {
6432                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6433                 }
6434                 break;
6435             case float_round_ties_away:
6436                 if (aExp == 0x3FFE) {
6437                     return packFloat128(aSign, 0x3FFF, 0, 0);
6438                 }
6439                 break;
6440              case float_round_down:
6441                 return
6442                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6443                     : packFloat128( 0, 0, 0, 0 );
6444              case float_round_up:
6445                 return
6446                       aSign ? packFloat128( 1, 0, 0, 0 )
6447                     : packFloat128( 0, 0x3FFF, 0, 0 );
6448             }
6449             return packFloat128( aSign, 0, 0, 0 );
6450         }
6451         lastBitMask = 1;
6452         lastBitMask <<= 0x402F - aExp;
6453         roundBitsMask = lastBitMask - 1;
6454         z.low = 0;
6455         z.high = a.high;
6456         switch (status->float_rounding_mode) {
6457         case float_round_nearest_even:
6458             z.high += lastBitMask>>1;
6459             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6460                 z.high &= ~ lastBitMask;
6461             }
6462             break;
6463         case float_round_ties_away:
6464             z.high += lastBitMask>>1;
6465             break;
6466         case float_round_to_zero:
6467             break;
6468         case float_round_up:
6469             if (!extractFloat128Sign(z)) {
6470                 z.high |= ( a.low != 0 );
6471                 z.high += roundBitsMask;
6472             }
6473             break;
6474         case float_round_down:
6475             if (extractFloat128Sign(z)) {
6476                 z.high |= (a.low != 0);
6477                 z.high += roundBitsMask;
6478             }
6479             break;
6480         default:
6481             abort();
6482         }
6483         z.high &= ~ roundBitsMask;
6484     }
6485     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6486         status->float_exception_flags |= float_flag_inexact;
6487     }
6488     return z;
6489 
6490 }
6491 
6492 /*----------------------------------------------------------------------------
6493 | Returns the result of adding the absolute values of the quadruple-precision
6494 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6495 | before being returned.  `zSign' is ignored if the result is a NaN.
6496 | The addition is performed according to the IEC/IEEE Standard for Binary
6497 | Floating-Point Arithmetic.
6498 *----------------------------------------------------------------------------*/
6499 
6500 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6501                                 float_status *status)
6502 {
6503     int32_t aExp, bExp, zExp;
6504     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6505     int32_t expDiff;
6506 
6507     aSig1 = extractFloat128Frac1( a );
6508     aSig0 = extractFloat128Frac0( a );
6509     aExp = extractFloat128Exp( a );
6510     bSig1 = extractFloat128Frac1( b );
6511     bSig0 = extractFloat128Frac0( b );
6512     bExp = extractFloat128Exp( b );
6513     expDiff = aExp - bExp;
6514     if ( 0 < expDiff ) {
6515         if ( aExp == 0x7FFF ) {
6516             if (aSig0 | aSig1) {
6517                 return propagateFloat128NaN(a, b, status);
6518             }
6519             return a;
6520         }
6521         if ( bExp == 0 ) {
6522             --expDiff;
6523         }
6524         else {
6525             bSig0 |= LIT64( 0x0001000000000000 );
6526         }
6527         shift128ExtraRightJamming(
6528             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6529         zExp = aExp;
6530     }
6531     else if ( expDiff < 0 ) {
6532         if ( bExp == 0x7FFF ) {
6533             if (bSig0 | bSig1) {
6534                 return propagateFloat128NaN(a, b, status);
6535             }
6536             return packFloat128( zSign, 0x7FFF, 0, 0 );
6537         }
6538         if ( aExp == 0 ) {
6539             ++expDiff;
6540         }
6541         else {
6542             aSig0 |= LIT64( 0x0001000000000000 );
6543         }
6544         shift128ExtraRightJamming(
6545             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6546         zExp = bExp;
6547     }
6548     else {
6549         if ( aExp == 0x7FFF ) {
6550             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6551                 return propagateFloat128NaN(a, b, status);
6552             }
6553             return a;
6554         }
6555         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6556         if ( aExp == 0 ) {
6557             if (status->flush_to_zero) {
6558                 if (zSig0 | zSig1) {
6559                     float_raise(float_flag_output_denormal, status);
6560                 }
6561                 return packFloat128(zSign, 0, 0, 0);
6562             }
6563             return packFloat128( zSign, 0, zSig0, zSig1 );
6564         }
6565         zSig2 = 0;
6566         zSig0 |= LIT64( 0x0002000000000000 );
6567         zExp = aExp;
6568         goto shiftRight1;
6569     }
6570     aSig0 |= LIT64( 0x0001000000000000 );
6571     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6572     --zExp;
6573     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6574     ++zExp;
6575  shiftRight1:
6576     shift128ExtraRightJamming(
6577         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6578  roundAndPack:
6579     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6580 
6581 }
6582 
6583 /*----------------------------------------------------------------------------
6584 | Returns the result of subtracting the absolute values of the quadruple-
6585 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6586 | difference is negated before being returned.  `zSign' is ignored if the
6587 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6588 | Standard for Binary Floating-Point Arithmetic.
6589 *----------------------------------------------------------------------------*/
6590 
6591 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6592                                 float_status *status)
6593 {
6594     int32_t aExp, bExp, zExp;
6595     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6596     int32_t expDiff;
6597 
6598     aSig1 = extractFloat128Frac1( a );
6599     aSig0 = extractFloat128Frac0( a );
6600     aExp = extractFloat128Exp( a );
6601     bSig1 = extractFloat128Frac1( b );
6602     bSig0 = extractFloat128Frac0( b );
6603     bExp = extractFloat128Exp( b );
6604     expDiff = aExp - bExp;
6605     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6606     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6607     if ( 0 < expDiff ) goto aExpBigger;
6608     if ( expDiff < 0 ) goto bExpBigger;
6609     if ( aExp == 0x7FFF ) {
6610         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6611             return propagateFloat128NaN(a, b, status);
6612         }
6613         float_raise(float_flag_invalid, status);
6614         return float128_default_nan(status);
6615     }
6616     if ( aExp == 0 ) {
6617         aExp = 1;
6618         bExp = 1;
6619     }
6620     if ( bSig0 < aSig0 ) goto aBigger;
6621     if ( aSig0 < bSig0 ) goto bBigger;
6622     if ( bSig1 < aSig1 ) goto aBigger;
6623     if ( aSig1 < bSig1 ) goto bBigger;
6624     return packFloat128(status->float_rounding_mode == float_round_down,
6625                         0, 0, 0);
6626  bExpBigger:
6627     if ( bExp == 0x7FFF ) {
6628         if (bSig0 | bSig1) {
6629             return propagateFloat128NaN(a, b, status);
6630         }
6631         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6632     }
6633     if ( aExp == 0 ) {
6634         ++expDiff;
6635     }
6636     else {
6637         aSig0 |= LIT64( 0x4000000000000000 );
6638     }
6639     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6640     bSig0 |= LIT64( 0x4000000000000000 );
6641  bBigger:
6642     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6643     zExp = bExp;
6644     zSign ^= 1;
6645     goto normalizeRoundAndPack;
6646  aExpBigger:
6647     if ( aExp == 0x7FFF ) {
6648         if (aSig0 | aSig1) {
6649             return propagateFloat128NaN(a, b, status);
6650         }
6651         return a;
6652     }
6653     if ( bExp == 0 ) {
6654         --expDiff;
6655     }
6656     else {
6657         bSig0 |= LIT64( 0x4000000000000000 );
6658     }
6659     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6660     aSig0 |= LIT64( 0x4000000000000000 );
6661  aBigger:
6662     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6663     zExp = aExp;
6664  normalizeRoundAndPack:
6665     --zExp;
6666     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6667                                          status);
6668 
6669 }
6670 
6671 /*----------------------------------------------------------------------------
6672 | Returns the result of adding the quadruple-precision floating-point values
6673 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6674 | for Binary Floating-Point Arithmetic.
6675 *----------------------------------------------------------------------------*/
6676 
6677 float128 float128_add(float128 a, float128 b, float_status *status)
6678 {
6679     flag aSign, bSign;
6680 
6681     aSign = extractFloat128Sign( a );
6682     bSign = extractFloat128Sign( b );
6683     if ( aSign == bSign ) {
6684         return addFloat128Sigs(a, b, aSign, status);
6685     }
6686     else {
6687         return subFloat128Sigs(a, b, aSign, status);
6688     }
6689 
6690 }
6691 
6692 /*----------------------------------------------------------------------------
6693 | Returns the result of subtracting the quadruple-precision floating-point
6694 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6695 | Standard for Binary Floating-Point Arithmetic.
6696 *----------------------------------------------------------------------------*/
6697 
6698 float128 float128_sub(float128 a, float128 b, float_status *status)
6699 {
6700     flag aSign, bSign;
6701 
6702     aSign = extractFloat128Sign( a );
6703     bSign = extractFloat128Sign( b );
6704     if ( aSign == bSign ) {
6705         return subFloat128Sigs(a, b, aSign, status);
6706     }
6707     else {
6708         return addFloat128Sigs(a, b, aSign, status);
6709     }
6710 
6711 }
6712 
6713 /*----------------------------------------------------------------------------
6714 | Returns the result of multiplying the quadruple-precision floating-point
6715 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6716 | Standard for Binary Floating-Point Arithmetic.
6717 *----------------------------------------------------------------------------*/
6718 
6719 float128 float128_mul(float128 a, float128 b, float_status *status)
6720 {
6721     flag aSign, bSign, zSign;
6722     int32_t aExp, bExp, zExp;
6723     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6724 
6725     aSig1 = extractFloat128Frac1( a );
6726     aSig0 = extractFloat128Frac0( a );
6727     aExp = extractFloat128Exp( a );
6728     aSign = extractFloat128Sign( a );
6729     bSig1 = extractFloat128Frac1( b );
6730     bSig0 = extractFloat128Frac0( b );
6731     bExp = extractFloat128Exp( b );
6732     bSign = extractFloat128Sign( b );
6733     zSign = aSign ^ bSign;
6734     if ( aExp == 0x7FFF ) {
6735         if (    ( aSig0 | aSig1 )
6736              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6737             return propagateFloat128NaN(a, b, status);
6738         }
6739         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6740         return packFloat128( zSign, 0x7FFF, 0, 0 );
6741     }
6742     if ( bExp == 0x7FFF ) {
6743         if (bSig0 | bSig1) {
6744             return propagateFloat128NaN(a, b, status);
6745         }
6746         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6747  invalid:
6748             float_raise(float_flag_invalid, status);
6749             return float128_default_nan(status);
6750         }
6751         return packFloat128( zSign, 0x7FFF, 0, 0 );
6752     }
6753     if ( aExp == 0 ) {
6754         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6755         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6756     }
6757     if ( bExp == 0 ) {
6758         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6759         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6760     }
6761     zExp = aExp + bExp - 0x4000;
6762     aSig0 |= LIT64( 0x0001000000000000 );
6763     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6764     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6765     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6766     zSig2 |= ( zSig3 != 0 );
6767     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6768         shift128ExtraRightJamming(
6769             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6770         ++zExp;
6771     }
6772     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6773 
6774 }
6775 
6776 /*----------------------------------------------------------------------------
6777 | Returns the result of dividing the quadruple-precision floating-point value
6778 | `a' by the corresponding value `b'.  The operation is performed according to
6779 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6780 *----------------------------------------------------------------------------*/
6781 
6782 float128 float128_div(float128 a, float128 b, float_status *status)
6783 {
6784     flag aSign, bSign, zSign;
6785     int32_t aExp, bExp, zExp;
6786     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6787     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6788 
6789     aSig1 = extractFloat128Frac1( a );
6790     aSig0 = extractFloat128Frac0( a );
6791     aExp = extractFloat128Exp( a );
6792     aSign = extractFloat128Sign( a );
6793     bSig1 = extractFloat128Frac1( b );
6794     bSig0 = extractFloat128Frac0( b );
6795     bExp = extractFloat128Exp( b );
6796     bSign = extractFloat128Sign( b );
6797     zSign = aSign ^ bSign;
6798     if ( aExp == 0x7FFF ) {
6799         if (aSig0 | aSig1) {
6800             return propagateFloat128NaN(a, b, status);
6801         }
6802         if ( bExp == 0x7FFF ) {
6803             if (bSig0 | bSig1) {
6804                 return propagateFloat128NaN(a, b, status);
6805             }
6806             goto invalid;
6807         }
6808         return packFloat128( zSign, 0x7FFF, 0, 0 );
6809     }
6810     if ( bExp == 0x7FFF ) {
6811         if (bSig0 | bSig1) {
6812             return propagateFloat128NaN(a, b, status);
6813         }
6814         return packFloat128( zSign, 0, 0, 0 );
6815     }
6816     if ( bExp == 0 ) {
6817         if ( ( bSig0 | bSig1 ) == 0 ) {
6818             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6819  invalid:
6820                 float_raise(float_flag_invalid, status);
6821                 return float128_default_nan(status);
6822             }
6823             float_raise(float_flag_divbyzero, status);
6824             return packFloat128( zSign, 0x7FFF, 0, 0 );
6825         }
6826         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6827     }
6828     if ( aExp == 0 ) {
6829         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6830         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6831     }
6832     zExp = aExp - bExp + 0x3FFD;
6833     shortShift128Left(
6834         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6835     shortShift128Left(
6836         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6837     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6838         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6839         ++zExp;
6840     }
6841     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6842     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6843     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6844     while ( (int64_t) rem0 < 0 ) {
6845         --zSig0;
6846         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6847     }
6848     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6849     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6850         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6851         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6852         while ( (int64_t) rem1 < 0 ) {
6853             --zSig1;
6854             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6855         }
6856         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6857     }
6858     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6859     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6860 
6861 }
6862 
6863 /*----------------------------------------------------------------------------
6864 | Returns the remainder of the quadruple-precision floating-point value `a'
6865 | with respect to the corresponding value `b'.  The operation is performed
6866 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6867 *----------------------------------------------------------------------------*/
6868 
6869 float128 float128_rem(float128 a, float128 b, float_status *status)
6870 {
6871     flag aSign, zSign;
6872     int32_t aExp, bExp, expDiff;
6873     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6874     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6875     int64_t sigMean0;
6876 
6877     aSig1 = extractFloat128Frac1( a );
6878     aSig0 = extractFloat128Frac0( a );
6879     aExp = extractFloat128Exp( a );
6880     aSign = extractFloat128Sign( a );
6881     bSig1 = extractFloat128Frac1( b );
6882     bSig0 = extractFloat128Frac0( b );
6883     bExp = extractFloat128Exp( b );
6884     if ( aExp == 0x7FFF ) {
6885         if (    ( aSig0 | aSig1 )
6886              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6887             return propagateFloat128NaN(a, b, status);
6888         }
6889         goto invalid;
6890     }
6891     if ( bExp == 0x7FFF ) {
6892         if (bSig0 | bSig1) {
6893             return propagateFloat128NaN(a, b, status);
6894         }
6895         return a;
6896     }
6897     if ( bExp == 0 ) {
6898         if ( ( bSig0 | bSig1 ) == 0 ) {
6899  invalid:
6900             float_raise(float_flag_invalid, status);
6901             return float128_default_nan(status);
6902         }
6903         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6904     }
6905     if ( aExp == 0 ) {
6906         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6907         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6908     }
6909     expDiff = aExp - bExp;
6910     if ( expDiff < -1 ) return a;
6911     shortShift128Left(
6912         aSig0 | LIT64( 0x0001000000000000 ),
6913         aSig1,
6914         15 - ( expDiff < 0 ),
6915         &aSig0,
6916         &aSig1
6917     );
6918     shortShift128Left(
6919         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6920     q = le128( bSig0, bSig1, aSig0, aSig1 );
6921     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6922     expDiff -= 64;
6923     while ( 0 < expDiff ) {
6924         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6925         q = ( 4 < q ) ? q - 4 : 0;
6926         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6927         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6928         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6929         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6930         expDiff -= 61;
6931     }
6932     if ( -64 < expDiff ) {
6933         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6934         q = ( 4 < q ) ? q - 4 : 0;
6935         q >>= - expDiff;
6936         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6937         expDiff += 52;
6938         if ( expDiff < 0 ) {
6939             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6940         }
6941         else {
6942             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6943         }
6944         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6945         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6946     }
6947     else {
6948         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6949         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6950     }
6951     do {
6952         alternateASig0 = aSig0;
6953         alternateASig1 = aSig1;
6954         ++q;
6955         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6956     } while ( 0 <= (int64_t) aSig0 );
6957     add128(
6958         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6959     if (    ( sigMean0 < 0 )
6960          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6961         aSig0 = alternateASig0;
6962         aSig1 = alternateASig1;
6963     }
6964     zSign = ( (int64_t) aSig0 < 0 );
6965     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6966     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6967                                          status);
6968 }
6969 
6970 /*----------------------------------------------------------------------------
6971 | Returns the square root of the quadruple-precision floating-point value `a'.
6972 | The operation is performed according to the IEC/IEEE Standard for Binary
6973 | Floating-Point Arithmetic.
6974 *----------------------------------------------------------------------------*/
6975 
6976 float128 float128_sqrt(float128 a, float_status *status)
6977 {
6978     flag aSign;
6979     int32_t aExp, zExp;
6980     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6981     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6982 
6983     aSig1 = extractFloat128Frac1( a );
6984     aSig0 = extractFloat128Frac0( a );
6985     aExp = extractFloat128Exp( a );
6986     aSign = extractFloat128Sign( a );
6987     if ( aExp == 0x7FFF ) {
6988         if (aSig0 | aSig1) {
6989             return propagateFloat128NaN(a, a, status);
6990         }
6991         if ( ! aSign ) return a;
6992         goto invalid;
6993     }
6994     if ( aSign ) {
6995         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6996  invalid:
6997         float_raise(float_flag_invalid, status);
6998         return float128_default_nan(status);
6999     }
7000     if ( aExp == 0 ) {
7001         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7002         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7003     }
7004     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7005     aSig0 |= LIT64( 0x0001000000000000 );
7006     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7007     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7008     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7009     doubleZSig0 = zSig0<<1;
7010     mul64To128( zSig0, zSig0, &term0, &term1 );
7011     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7012     while ( (int64_t) rem0 < 0 ) {
7013         --zSig0;
7014         doubleZSig0 -= 2;
7015         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7016     }
7017     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7018     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7019         if ( zSig1 == 0 ) zSig1 = 1;
7020         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7021         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7022         mul64To128( zSig1, zSig1, &term2, &term3 );
7023         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7024         while ( (int64_t) rem1 < 0 ) {
7025             --zSig1;
7026             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7027             term3 |= 1;
7028             term2 |= doubleZSig0;
7029             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7030         }
7031         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7032     }
7033     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7034     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7035 
7036 }
7037 
7038 /*----------------------------------------------------------------------------
7039 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7040 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7041 | raised if either operand is a NaN.  Otherwise, the comparison is performed
7042 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7043 *----------------------------------------------------------------------------*/
7044 
7045 int float128_eq(float128 a, float128 b, float_status *status)
7046 {
7047 
7048     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7049               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7050          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7051               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7052        ) {
7053         float_raise(float_flag_invalid, status);
7054         return 0;
7055     }
7056     return
7057            ( a.low == b.low )
7058         && (    ( a.high == b.high )
7059              || (    ( a.low == 0 )
7060                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7061            );
7062 
7063 }
7064 
7065 /*----------------------------------------------------------------------------
7066 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7067 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
7068 | exception is raised if either operand is a NaN.  The comparison is performed
7069 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7070 *----------------------------------------------------------------------------*/
7071 
7072 int float128_le(float128 a, float128 b, float_status *status)
7073 {
7074     flag aSign, bSign;
7075 
7076     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7077               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7078          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7079               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7080        ) {
7081         float_raise(float_flag_invalid, status);
7082         return 0;
7083     }
7084     aSign = extractFloat128Sign( a );
7085     bSign = extractFloat128Sign( b );
7086     if ( aSign != bSign ) {
7087         return
7088                aSign
7089             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7090                  == 0 );
7091     }
7092     return
7093           aSign ? le128( b.high, b.low, a.high, a.low )
7094         : le128( a.high, a.low, b.high, b.low );
7095 
7096 }
7097 
7098 /*----------------------------------------------------------------------------
7099 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7100 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7101 | raised if either operand is a NaN.  The comparison is performed according
7102 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7103 *----------------------------------------------------------------------------*/
7104 
7105 int float128_lt(float128 a, float128 b, float_status *status)
7106 {
7107     flag aSign, bSign;
7108 
7109     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7110               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7111          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7112               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7113        ) {
7114         float_raise(float_flag_invalid, status);
7115         return 0;
7116     }
7117     aSign = extractFloat128Sign( a );
7118     bSign = extractFloat128Sign( b );
7119     if ( aSign != bSign ) {
7120         return
7121                aSign
7122             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7123                  != 0 );
7124     }
7125     return
7126           aSign ? lt128( b.high, b.low, a.high, a.low )
7127         : lt128( a.high, a.low, b.high, b.low );
7128 
7129 }
7130 
7131 /*----------------------------------------------------------------------------
7132 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7133 | be compared, and 0 otherwise.  The invalid exception is raised if either
7134 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7135 | Standard for Binary Floating-Point Arithmetic.
7136 *----------------------------------------------------------------------------*/
7137 
7138 int float128_unordered(float128 a, float128 b, float_status *status)
7139 {
7140     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7141               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7142          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7143               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7144        ) {
7145         float_raise(float_flag_invalid, status);
7146         return 1;
7147     }
7148     return 0;
7149 }
7150 
7151 /*----------------------------------------------------------------------------
7152 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7153 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7154 | exception.  The comparison is performed according to the IEC/IEEE Standard
7155 | for Binary Floating-Point Arithmetic.
7156 *----------------------------------------------------------------------------*/
7157 
7158 int float128_eq_quiet(float128 a, float128 b, float_status *status)
7159 {
7160 
7161     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7162               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7163          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7164               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7165        ) {
7166         if (float128_is_signaling_nan(a, status)
7167          || float128_is_signaling_nan(b, status)) {
7168             float_raise(float_flag_invalid, status);
7169         }
7170         return 0;
7171     }
7172     return
7173            ( a.low == b.low )
7174         && (    ( a.high == b.high )
7175              || (    ( a.low == 0 )
7176                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7177            );
7178 
7179 }
7180 
7181 /*----------------------------------------------------------------------------
7182 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7183 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
7184 | cause an exception.  Otherwise, the comparison is performed according to the
7185 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7186 *----------------------------------------------------------------------------*/
7187 
7188 int float128_le_quiet(float128 a, float128 b, float_status *status)
7189 {
7190     flag aSign, bSign;
7191 
7192     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7193               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7194          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7195               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7196        ) {
7197         if (float128_is_signaling_nan(a, status)
7198          || float128_is_signaling_nan(b, status)) {
7199             float_raise(float_flag_invalid, status);
7200         }
7201         return 0;
7202     }
7203     aSign = extractFloat128Sign( a );
7204     bSign = extractFloat128Sign( b );
7205     if ( aSign != bSign ) {
7206         return
7207                aSign
7208             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7209                  == 0 );
7210     }
7211     return
7212           aSign ? le128( b.high, b.low, a.high, a.low )
7213         : le128( a.high, a.low, b.high, b.low );
7214 
7215 }
7216 
7217 /*----------------------------------------------------------------------------
7218 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7219 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7220 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7221 | Standard for Binary Floating-Point Arithmetic.
7222 *----------------------------------------------------------------------------*/
7223 
7224 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7225 {
7226     flag aSign, bSign;
7227 
7228     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7229               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7230          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7231               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7232        ) {
7233         if (float128_is_signaling_nan(a, status)
7234          || float128_is_signaling_nan(b, status)) {
7235             float_raise(float_flag_invalid, status);
7236         }
7237         return 0;
7238     }
7239     aSign = extractFloat128Sign( a );
7240     bSign = extractFloat128Sign( b );
7241     if ( aSign != bSign ) {
7242         return
7243                aSign
7244             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7245                  != 0 );
7246     }
7247     return
7248           aSign ? lt128( b.high, b.low, a.high, a.low )
7249         : lt128( a.high, a.low, b.high, b.low );
7250 
7251 }
7252 
7253 /*----------------------------------------------------------------------------
7254 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7255 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7256 | comparison is performed according to the IEC/IEEE Standard for Binary
7257 | Floating-Point Arithmetic.
7258 *----------------------------------------------------------------------------*/
7259 
7260 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7261 {
7262     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7263               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7264          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7265               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7266        ) {
7267         if (float128_is_signaling_nan(a, status)
7268          || float128_is_signaling_nan(b, status)) {
7269             float_raise(float_flag_invalid, status);
7270         }
7271         return 1;
7272     }
7273     return 0;
7274 }
7275 
7276 /* misc functions */
7277 float32 uint32_to_float32(uint32_t a, float_status *status)
7278 {
7279     return int64_to_float32(a, status);
7280 }
7281 
7282 float64 uint32_to_float64(uint32_t a, float_status *status)
7283 {
7284     return int64_to_float64(a, status);
7285 }
7286 
7287 uint32_t float32_to_uint32(float32 a, float_status *status)
7288 {
7289     int64_t v;
7290     uint32_t res;
7291     int old_exc_flags = get_float_exception_flags(status);
7292 
7293     v = float32_to_int64(a, status);
7294     if (v < 0) {
7295         res = 0;
7296     } else if (v > 0xffffffff) {
7297         res = 0xffffffff;
7298     } else {
7299         return v;
7300     }
7301     set_float_exception_flags(old_exc_flags, status);
7302     float_raise(float_flag_invalid, status);
7303     return res;
7304 }
7305 
7306 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status)
7307 {
7308     int64_t v;
7309     uint32_t res;
7310     int old_exc_flags = get_float_exception_flags(status);
7311 
7312     v = float32_to_int64_round_to_zero(a, status);
7313     if (v < 0) {
7314         res = 0;
7315     } else if (v > 0xffffffff) {
7316         res = 0xffffffff;
7317     } else {
7318         return v;
7319     }
7320     set_float_exception_flags(old_exc_flags, status);
7321     float_raise(float_flag_invalid, status);
7322     return res;
7323 }
7324 
7325 int16_t float32_to_int16(float32 a, float_status *status)
7326 {
7327     int32_t v;
7328     int16_t res;
7329     int old_exc_flags = get_float_exception_flags(status);
7330 
7331     v = float32_to_int32(a, status);
7332     if (v < -0x8000) {
7333         res = -0x8000;
7334     } else if (v > 0x7fff) {
7335         res = 0x7fff;
7336     } else {
7337         return v;
7338     }
7339 
7340     set_float_exception_flags(old_exc_flags, status);
7341     float_raise(float_flag_invalid, status);
7342     return res;
7343 }
7344 
7345 uint16_t float32_to_uint16(float32 a, float_status *status)
7346 {
7347     int32_t v;
7348     uint16_t res;
7349     int old_exc_flags = get_float_exception_flags(status);
7350 
7351     v = float32_to_int32(a, status);
7352     if (v < 0) {
7353         res = 0;
7354     } else if (v > 0xffff) {
7355         res = 0xffff;
7356     } else {
7357         return v;
7358     }
7359 
7360     set_float_exception_flags(old_exc_flags, status);
7361     float_raise(float_flag_invalid, status);
7362     return res;
7363 }
7364 
7365 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
7366 {
7367     int64_t v;
7368     uint16_t res;
7369     int old_exc_flags = get_float_exception_flags(status);
7370 
7371     v = float32_to_int64_round_to_zero(a, status);
7372     if (v < 0) {
7373         res = 0;
7374     } else if (v > 0xffff) {
7375         res = 0xffff;
7376     } else {
7377         return v;
7378     }
7379     set_float_exception_flags(old_exc_flags, status);
7380     float_raise(float_flag_invalid, status);
7381     return res;
7382 }
7383 
7384 uint32_t float64_to_uint32(float64 a, float_status *status)
7385 {
7386     uint64_t v;
7387     uint32_t res;
7388     int old_exc_flags = get_float_exception_flags(status);
7389 
7390     v = float64_to_uint64(a, status);
7391     if (v > 0xffffffff) {
7392         res = 0xffffffff;
7393     } else {
7394         return v;
7395     }
7396     set_float_exception_flags(old_exc_flags, status);
7397     float_raise(float_flag_invalid, status);
7398     return res;
7399 }
7400 
7401 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status)
7402 {
7403     uint64_t v;
7404     uint32_t res;
7405     int old_exc_flags = get_float_exception_flags(status);
7406 
7407     v = float64_to_uint64_round_to_zero(a, status);
7408     if (v > 0xffffffff) {
7409         res = 0xffffffff;
7410     } else {
7411         return v;
7412     }
7413     set_float_exception_flags(old_exc_flags, status);
7414     float_raise(float_flag_invalid, status);
7415     return res;
7416 }
7417 
7418 int16_t float64_to_int16(float64 a, float_status *status)
7419 {
7420     int64_t v;
7421     int16_t res;
7422     int old_exc_flags = get_float_exception_flags(status);
7423 
7424     v = float64_to_int32(a, status);
7425     if (v < -0x8000) {
7426         res = -0x8000;
7427     } else if (v > 0x7fff) {
7428         res = 0x7fff;
7429     } else {
7430         return v;
7431     }
7432 
7433     set_float_exception_flags(old_exc_flags, status);
7434     float_raise(float_flag_invalid, status);
7435     return res;
7436 }
7437 
7438 uint16_t float64_to_uint16(float64 a, float_status *status)
7439 {
7440     int64_t v;
7441     uint16_t res;
7442     int old_exc_flags = get_float_exception_flags(status);
7443 
7444     v = float64_to_int32(a, status);
7445     if (v < 0) {
7446         res = 0;
7447     } else if (v > 0xffff) {
7448         res = 0xffff;
7449     } else {
7450         return v;
7451     }
7452 
7453     set_float_exception_flags(old_exc_flags, status);
7454     float_raise(float_flag_invalid, status);
7455     return res;
7456 }
7457 
7458 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
7459 {
7460     int64_t v;
7461     uint16_t res;
7462     int old_exc_flags = get_float_exception_flags(status);
7463 
7464     v = float64_to_int64_round_to_zero(a, status);
7465     if (v < 0) {
7466         res = 0;
7467     } else if (v > 0xffff) {
7468         res = 0xffff;
7469     } else {
7470         return v;
7471     }
7472     set_float_exception_flags(old_exc_flags, status);
7473     float_raise(float_flag_invalid, status);
7474     return res;
7475 }
7476 
7477 /*----------------------------------------------------------------------------
7478 | Returns the result of converting the double-precision floating-point value
7479 | `a' to the 64-bit unsigned integer format.  The conversion is
7480 | performed according to the IEC/IEEE Standard for Binary Floating-Point
7481 | Arithmetic---which means in particular that the conversion is rounded
7482 | according to the current rounding mode.  If `a' is a NaN, the largest
7483 | positive integer is returned.  If the conversion overflows, the
7484 | largest unsigned integer is returned.  If 'a' is negative, the value is
7485 | rounded and zero is returned; negative values that do not round to zero
7486 | will raise the inexact exception.
7487 *----------------------------------------------------------------------------*/
7488 
7489 uint64_t float64_to_uint64(float64 a, float_status *status)
7490 {
7491     flag aSign;
7492     int aExp;
7493     int shiftCount;
7494     uint64_t aSig, aSigExtra;
7495     a = float64_squash_input_denormal(a, status);
7496 
7497     aSig = extractFloat64Frac(a);
7498     aExp = extractFloat64Exp(a);
7499     aSign = extractFloat64Sign(a);
7500     if (aSign && (aExp > 1022)) {
7501         float_raise(float_flag_invalid, status);
7502         if (float64_is_any_nan(a)) {
7503             return LIT64(0xFFFFFFFFFFFFFFFF);
7504         } else {
7505             return 0;
7506         }
7507     }
7508     if (aExp) {
7509         aSig |= LIT64(0x0010000000000000);
7510     }
7511     shiftCount = 0x433 - aExp;
7512     if (shiftCount <= 0) {
7513         if (0x43E < aExp) {
7514             float_raise(float_flag_invalid, status);
7515             return LIT64(0xFFFFFFFFFFFFFFFF);
7516         }
7517         aSigExtra = 0;
7518         aSig <<= -shiftCount;
7519     } else {
7520         shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7521     }
7522     return roundAndPackUint64(aSign, aSig, aSigExtra, status);
7523 }
7524 
7525 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
7526 {
7527     signed char current_rounding_mode = status->float_rounding_mode;
7528     set_float_rounding_mode(float_round_to_zero, status);
7529     uint64_t v = float64_to_uint64(a, status);
7530     set_float_rounding_mode(current_rounding_mode, status);
7531     return v;
7532 }
7533 
7534 #define COMPARE(s, nan_exp)                                                  \
7535 static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7536                                       int is_quiet, float_status *status)    \
7537 {                                                                            \
7538     flag aSign, bSign;                                                       \
7539     uint ## s ## _t av, bv;                                                  \
7540     a = float ## s ## _squash_input_denormal(a, status);                     \
7541     b = float ## s ## _squash_input_denormal(b, status);                     \
7542                                                                              \
7543     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
7544          extractFloat ## s ## Frac( a ) ) ||                                 \
7545         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
7546           extractFloat ## s ## Frac( b ) )) {                                \
7547         if (!is_quiet ||                                                     \
7548             float ## s ## _is_signaling_nan(a, status) ||                  \
7549             float ## s ## _is_signaling_nan(b, status)) {                 \
7550             float_raise(float_flag_invalid, status);                         \
7551         }                                                                    \
7552         return float_relation_unordered;                                     \
7553     }                                                                        \
7554     aSign = extractFloat ## s ## Sign( a );                                  \
7555     bSign = extractFloat ## s ## Sign( b );                                  \
7556     av = float ## s ## _val(a);                                              \
7557     bv = float ## s ## _val(b);                                              \
7558     if ( aSign != bSign ) {                                                  \
7559         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
7560             /* zero case */                                                  \
7561             return float_relation_equal;                                     \
7562         } else {                                                             \
7563             return 1 - (2 * aSign);                                          \
7564         }                                                                    \
7565     } else {                                                                 \
7566         if (av == bv) {                                                      \
7567             return float_relation_equal;                                     \
7568         } else {                                                             \
7569             return 1 - 2 * (aSign ^ ( av < bv ));                            \
7570         }                                                                    \
7571     }                                                                        \
7572 }                                                                            \
7573                                                                              \
7574 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
7575 {                                                                            \
7576     return float ## s ## _compare_internal(a, b, 0, status);                 \
7577 }                                                                            \
7578                                                                              \
7579 int float ## s ## _compare_quiet(float ## s a, float ## s b,                 \
7580                                  float_status *status)                       \
7581 {                                                                            \
7582     return float ## s ## _compare_internal(a, b, 1, status);                 \
7583 }
7584 
7585 COMPARE(32, 0xff)
7586 COMPARE(64, 0x7ff)
7587 
7588 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7589                                             int is_quiet, float_status *status)
7590 {
7591     flag aSign, bSign;
7592 
7593     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7594         float_raise(float_flag_invalid, status);
7595         return float_relation_unordered;
7596     }
7597     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7598           ( extractFloatx80Frac( a )<<1 ) ) ||
7599         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7600           ( extractFloatx80Frac( b )<<1 ) )) {
7601         if (!is_quiet ||
7602             floatx80_is_signaling_nan(a, status) ||
7603             floatx80_is_signaling_nan(b, status)) {
7604             float_raise(float_flag_invalid, status);
7605         }
7606         return float_relation_unordered;
7607     }
7608     aSign = extractFloatx80Sign( a );
7609     bSign = extractFloatx80Sign( b );
7610     if ( aSign != bSign ) {
7611 
7612         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7613              ( ( a.low | b.low ) == 0 ) ) {
7614             /* zero case */
7615             return float_relation_equal;
7616         } else {
7617             return 1 - (2 * aSign);
7618         }
7619     } else {
7620         if (a.low == b.low && a.high == b.high) {
7621             return float_relation_equal;
7622         } else {
7623             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7624         }
7625     }
7626 }
7627 
7628 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7629 {
7630     return floatx80_compare_internal(a, b, 0, status);
7631 }
7632 
7633 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7634 {
7635     return floatx80_compare_internal(a, b, 1, status);
7636 }
7637 
7638 static inline int float128_compare_internal(float128 a, float128 b,
7639                                             int is_quiet, float_status *status)
7640 {
7641     flag aSign, bSign;
7642 
7643     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7644           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7645         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7646           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7647         if (!is_quiet ||
7648             float128_is_signaling_nan(a, status) ||
7649             float128_is_signaling_nan(b, status)) {
7650             float_raise(float_flag_invalid, status);
7651         }
7652         return float_relation_unordered;
7653     }
7654     aSign = extractFloat128Sign( a );
7655     bSign = extractFloat128Sign( b );
7656     if ( aSign != bSign ) {
7657         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7658             /* zero case */
7659             return float_relation_equal;
7660         } else {
7661             return 1 - (2 * aSign);
7662         }
7663     } else {
7664         if (a.low == b.low && a.high == b.high) {
7665             return float_relation_equal;
7666         } else {
7667             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7668         }
7669     }
7670 }
7671 
7672 int float128_compare(float128 a, float128 b, float_status *status)
7673 {
7674     return float128_compare_internal(a, b, 0, status);
7675 }
7676 
7677 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7678 {
7679     return float128_compare_internal(a, b, 1, status);
7680 }
7681 
/* min() and max() functions. These can't be implemented as
 * 'compare and pick one input' because that would mishandle
 * NaNs and +0 vs -0.
 *
 * minnum() and maxnum() functions. These are similar to the min()
 * and max() functions but if one of the arguments is a QNaN and
 * the other is numerical then the numerical argument is returned.
 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
 * and maxNum() operations. min() and max() are the typical min/max
 * semantics provided by many CPUs which predate that specification.
 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from IEEE 754-2008.
 */
7696 #define MINMAX(s)                                                       \
7697 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
7698                                                int ismin, int isieee,   \
7699                                                int ismag,               \
7700                                                float_status *status)    \
7701 {                                                                       \
7702     flag aSign, bSign;                                                  \
7703     uint ## s ## _t av, bv, aav, abv;                                   \
7704     a = float ## s ## _squash_input_denormal(a, status);                \
7705     b = float ## s ## _squash_input_denormal(b, status);                \
7706     if (float ## s ## _is_any_nan(a) ||                                 \
7707         float ## s ## _is_any_nan(b)) {                                 \
7708         if (isieee) {                                                   \
7709             if (float ## s ## _is_quiet_nan(a, status) &&               \
7710                 !float ## s ##_is_any_nan(b)) {                         \
7711                 return b;                                               \
7712             } else if (float ## s ## _is_quiet_nan(b, status) &&        \
7713                        !float ## s ## _is_any_nan(a)) {                \
7714                 return a;                                               \
7715             }                                                           \
7716         }                                                               \
7717         return propagateFloat ## s ## NaN(a, b, status);                \
7718     }                                                                   \
7719     aSign = extractFloat ## s ## Sign(a);                               \
7720     bSign = extractFloat ## s ## Sign(b);                               \
7721     av = float ## s ## _val(a);                                         \
7722     bv = float ## s ## _val(b);                                         \
7723     if (ismag) {                                                        \
7724         aav = float ## s ## _abs(av);                                   \
7725         abv = float ## s ## _abs(bv);                                   \
7726         if (aav != abv) {                                               \
7727             if (ismin) {                                                \
7728                 return (aav < abv) ? a : b;                             \
7729             } else {                                                    \
7730                 return (aav < abv) ? b : a;                             \
7731             }                                                           \
7732         }                                                               \
7733     }                                                                   \
7734     if (aSign != bSign) {                                               \
7735         if (ismin) {                                                    \
7736             return aSign ? a : b;                                       \
7737         } else {                                                        \
7738             return aSign ? b : a;                                       \
7739         }                                                               \
7740     } else {                                                            \
7741         if (ismin) {                                                    \
7742             return (aSign ^ (av < bv)) ? a : b;                         \
7743         } else {                                                        \
7744             return (aSign ^ (av < bv)) ? b : a;                         \
7745         }                                                               \
7746     }                                                                   \
7747 }                                                                       \
7748                                                                         \
7749 float ## s float ## s ## _min(float ## s a, float ## s b,               \
7750                               float_status *status)                     \
7751 {                                                                       \
7752     return float ## s ## _minmax(a, b, 1, 0, 0, status);                \
7753 }                                                                       \
7754                                                                         \
7755 float ## s float ## s ## _max(float ## s a, float ## s b,               \
7756                               float_status *status)                     \
7757 {                                                                       \
7758     return float ## s ## _minmax(a, b, 0, 0, 0, status);                \
7759 }                                                                       \
7760                                                                         \
7761 float ## s float ## s ## _minnum(float ## s a, float ## s b,            \
7762                                  float_status *status)                  \
7763 {                                                                       \
7764     return float ## s ## _minmax(a, b, 1, 1, 0, status);                \
7765 }                                                                       \
7766                                                                         \
7767 float ## s float ## s ## _maxnum(float ## s a, float ## s b,            \
7768                                  float_status *status)                  \
7769 {                                                                       \
7770     return float ## s ## _minmax(a, b, 0, 1, 0, status);                \
7771 }                                                                       \
7772                                                                         \
7773 float ## s float ## s ## _minnummag(float ## s a, float ## s b,         \
7774                                     float_status *status)               \
7775 {                                                                       \
7776     return float ## s ## _minmax(a, b, 1, 1, 1, status);                \
7777 }                                                                       \
7778                                                                         \
7779 float ## s float ## s ## _maxnummag(float ## s a, float ## s b,         \
7780                                     float_status *status)               \
7781 {                                                                       \
7782     return float ## s ## _minmax(a, b, 0, 1, 1, status);                \
7783 }
7784 
7785 MINMAX(32)
7786 MINMAX(64)
7787 
7788 
7789 /* Multiply A by 2 raised to the power N.  */
7790 float32 float32_scalbn(float32 a, int n, float_status *status)
7791 {
7792     flag aSign;
7793     int16_t aExp;
7794     uint32_t aSig;
7795 
7796     a = float32_squash_input_denormal(a, status);
7797     aSig = extractFloat32Frac( a );
7798     aExp = extractFloat32Exp( a );
7799     aSign = extractFloat32Sign( a );
7800 
7801     if ( aExp == 0xFF ) {
7802         if ( aSig ) {
7803             return propagateFloat32NaN(a, a, status);
7804         }
7805         return a;
7806     }
7807     if (aExp != 0) {
7808         aSig |= 0x00800000;
7809     } else if (aSig == 0) {
7810         return a;
7811     } else {
7812         aExp++;
7813     }
7814 
7815     if (n > 0x200) {
7816         n = 0x200;
7817     } else if (n < -0x200) {
7818         n = -0x200;
7819     }
7820 
7821     aExp += n - 1;
7822     aSig <<= 7;
7823     return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
7824 }
7825 
7826 float64 float64_scalbn(float64 a, int n, float_status *status)
7827 {
7828     flag aSign;
7829     int16_t aExp;
7830     uint64_t aSig;
7831 
7832     a = float64_squash_input_denormal(a, status);
7833     aSig = extractFloat64Frac( a );
7834     aExp = extractFloat64Exp( a );
7835     aSign = extractFloat64Sign( a );
7836 
7837     if ( aExp == 0x7FF ) {
7838         if ( aSig ) {
7839             return propagateFloat64NaN(a, a, status);
7840         }
7841         return a;
7842     }
7843     if (aExp != 0) {
7844         aSig |= LIT64( 0x0010000000000000 );
7845     } else if (aSig == 0) {
7846         return a;
7847     } else {
7848         aExp++;
7849     }
7850 
7851     if (n > 0x1000) {
7852         n = 0x1000;
7853     } else if (n < -0x1000) {
7854         n = -0x1000;
7855     }
7856 
7857     aExp += n - 1;
7858     aSig <<= 10;
7859     return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
7860 }
7861 
7862 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7863 {
7864     flag aSign;
7865     int32_t aExp;
7866     uint64_t aSig;
7867 
7868     if (floatx80_invalid_encoding(a)) {
7869         float_raise(float_flag_invalid, status);
7870         return floatx80_default_nan(status);
7871     }
7872     aSig = extractFloatx80Frac( a );
7873     aExp = extractFloatx80Exp( a );
7874     aSign = extractFloatx80Sign( a );
7875 
7876     if ( aExp == 0x7FFF ) {
7877         if ( aSig<<1 ) {
7878             return propagateFloatx80NaN(a, a, status);
7879         }
7880         return a;
7881     }
7882 
7883     if (aExp == 0) {
7884         if (aSig == 0) {
7885             return a;
7886         }
7887         aExp++;
7888     }
7889 
7890     if (n > 0x10000) {
7891         n = 0x10000;
7892     } else if (n < -0x10000) {
7893         n = -0x10000;
7894     }
7895 
7896     aExp += n;
7897     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7898                                          aSign, aExp, aSig, 0, status);
7899 }
7900 
7901 float128 float128_scalbn(float128 a, int n, float_status *status)
7902 {
7903     flag aSign;
7904     int32_t aExp;
7905     uint64_t aSig0, aSig1;
7906 
7907     aSig1 = extractFloat128Frac1( a );
7908     aSig0 = extractFloat128Frac0( a );
7909     aExp = extractFloat128Exp( a );
7910     aSign = extractFloat128Sign( a );
7911     if ( aExp == 0x7FFF ) {
7912         if ( aSig0 | aSig1 ) {
7913             return propagateFloat128NaN(a, a, status);
7914         }
7915         return a;
7916     }
7917     if (aExp != 0) {
7918         aSig0 |= LIT64( 0x0001000000000000 );
7919     } else if (aSig0 == 0 && aSig1 == 0) {
7920         return a;
7921     } else {
7922         aExp++;
7923     }
7924 
7925     if (n > 0x10000) {
7926         n = 0x10000;
7927     } else if (n < -0x10000) {
7928         n = -0x10000;
7929     }
7930 
7931     aExp += n - 1;
7932     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7933                                          , status);
7934 
7935 }
7936