xref: /qemu/fpu/softfloat.c (revision 5dfbc9e4903c0121140f2945f05df48cea72dd82)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
88 
89 /* We only need stdlib for abort() */
90 
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations.  (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "fpu/softfloat-macros.h"
97 
98 /*----------------------------------------------------------------------------
99 | Returns the fraction bits of the half-precision floating-point value `a'.
100 *----------------------------------------------------------------------------*/
101 
102 static inline uint32_t extractFloat16Frac(float16 a)
103 {
104     return float16_val(a) & 0x3ff;
105 }
106 
107 /*----------------------------------------------------------------------------
108 | Returns the exponent bits of the half-precision floating-point value `a'.
109 *----------------------------------------------------------------------------*/
110 
111 static inline int extractFloat16Exp(float16 a)
112 {
113     return (float16_val(a) >> 10) & 0x1f;
114 }
115 
116 /*----------------------------------------------------------------------------
117 | Returns the fraction bits of the single-precision floating-point value `a'.
118 *----------------------------------------------------------------------------*/
119 
120 static inline uint32_t extractFloat32Frac(float32 a)
121 {
122     return float32_val(a) & 0x007FFFFF;
123 }
124 
125 /*----------------------------------------------------------------------------
126 | Returns the exponent bits of the single-precision floating-point value `a'.
127 *----------------------------------------------------------------------------*/
128 
129 static inline int extractFloat32Exp(float32 a)
130 {
131     return (float32_val(a) >> 23) & 0xFF;
132 }
133 
134 /*----------------------------------------------------------------------------
135 | Returns the sign bit of the single-precision floating-point value `a'.
136 *----------------------------------------------------------------------------*/
137 
138 static inline flag extractFloat32Sign(float32 a)
139 {
140     return float32_val(a) >> 31;
141 }
142 
143 /*----------------------------------------------------------------------------
144 | Returns the fraction bits of the double-precision floating-point value `a'.
145 *----------------------------------------------------------------------------*/
146 
147 static inline uint64_t extractFloat64Frac(float64 a)
148 {
149     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
150 }
151 
152 /*----------------------------------------------------------------------------
153 | Returns the exponent bits of the double-precision floating-point value `a'.
154 *----------------------------------------------------------------------------*/
155 
156 static inline int extractFloat64Exp(float64 a)
157 {
158     return (float64_val(a) >> 52) & 0x7FF;
159 }
160 
161 /*----------------------------------------------------------------------------
162 | Returns the sign bit of the double-precision floating-point value `a'.
163 *----------------------------------------------------------------------------*/
164 
165 static inline flag extractFloat64Sign(float64 a)
166 {
167     return float64_val(a) >> 63;
168 }
169 
170 /*
171  * Classify a floating point number. Everything above float_class_qnan
172  * is a NaN so cls >= float_class_qnan is any NaN.
173  */
174 
175 typedef enum __attribute__ ((__packed__)) {
176     float_class_unclassified,
177     float_class_zero,
178     float_class_normal,
179     float_class_inf,
180     float_class_qnan,  /* all NaNs from here */
181     float_class_snan,
182 } FloatClass;
183 
184 /* Simple helpers for checking if, or what kind of, NaN we have */
185 static inline __attribute__((unused)) bool is_nan(FloatClass c)
186 {
187     return unlikely(c >= float_class_qnan);
188 }
189 
190 static inline __attribute__((unused)) bool is_snan(FloatClass c)
191 {
192     return c == float_class_snan;
193 }
194 
195 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
196 {
197     return c == float_class_qnan;
198 }
199 
200 /*
201  * Structure holding all of the decomposed parts of a float. The
202  * exponent is unbiased and the fraction is normalized. All
203  * calculations are done with a 64 bit fraction and then rounded as
204  * appropriate for the final format.
205  *
206  * Thanks to the packed FloatClass a decent compiler should be able to
207  * fit the whole structure into registers and avoid using the stack
208  * for parameter passing.
209  */
210 
211 typedef struct {
212     uint64_t frac;
213     int32_t  exp;
214     FloatClass cls;
215     bool sign;
216 } FloatParts;
217 
218 #define DECOMPOSED_BINARY_POINT    (64 - 2)
219 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
220 #define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
221 
222 /* Structure holding all of the relevant parameters for a format.
223  *   exp_size: the size of the exponent field
224  *   exp_bias: the offset applied to the exponent field
225  *   exp_max: the maximum normalised exponent
226  *   frac_size: the size of the fraction field
227  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
228  * The following are computed based the size of fraction
229  *   frac_lsb: least significant bit of fraction
230  *   frac_lsbm1: the bit below the least significant bit (for rounding)
231  *   round_mask/roundeven_mask: masks used for rounding
232  * The following optional modifiers are available:
233  *   arm_althp: handle ARM Alternative Half Precision
234  */
235 typedef struct {
236     int exp_size;
237     int exp_bias;
238     int exp_max;
239     int frac_size;
240     int frac_shift;
241     uint64_t frac_lsb;
242     uint64_t frac_lsbm1;
243     uint64_t round_mask;
244     uint64_t roundeven_mask;
245     bool arm_althp;
246 } FloatFmt;
247 
248 /* Expand fields based on the size of exponent and fraction */
249 #define FLOAT_PARAMS(E, F)                                           \
250     .exp_size       = E,                                             \
251     .exp_bias       = ((1 << E) - 1) >> 1,                           \
252     .exp_max        = (1 << E) - 1,                                  \
253     .frac_size      = F,                                             \
254     .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
255     .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
256     .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
257     .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
258     .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
259 
260 static const FloatFmt float16_params = {
261     FLOAT_PARAMS(5, 10)
262 };
263 
264 static const FloatFmt float16_params_ahp = {
265     FLOAT_PARAMS(5, 10),
266     .arm_althp = true
267 };
268 
269 static const FloatFmt float32_params = {
270     FLOAT_PARAMS(8, 23)
271 };
272 
273 static const FloatFmt float64_params = {
274     FLOAT_PARAMS(11, 52)
275 };
276 
277 /* Unpack a float to parts, but do not canonicalize.  */
278 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
279 {
280     const int sign_pos = fmt.frac_size + fmt.exp_size;
281 
282     return (FloatParts) {
283         .cls = float_class_unclassified,
284         .sign = extract64(raw, sign_pos, 1),
285         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
286         .frac = extract64(raw, 0, fmt.frac_size),
287     };
288 }
289 
290 static inline FloatParts float16_unpack_raw(float16 f)
291 {
292     return unpack_raw(float16_params, f);
293 }
294 
295 static inline FloatParts float32_unpack_raw(float32 f)
296 {
297     return unpack_raw(float32_params, f);
298 }
299 
300 static inline FloatParts float64_unpack_raw(float64 f)
301 {
302     return unpack_raw(float64_params, f);
303 }
304 
305 /* Pack a float from parts, but do not canonicalize.  */
306 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
307 {
308     const int sign_pos = fmt.frac_size + fmt.exp_size;
309     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
310     return deposit64(ret, sign_pos, 1, p.sign);
311 }
312 
313 static inline float16 float16_pack_raw(FloatParts p)
314 {
315     return make_float16(pack_raw(float16_params, p));
316 }
317 
318 static inline float32 float32_pack_raw(FloatParts p)
319 {
320     return make_float32(pack_raw(float32_params, p));
321 }
322 
323 static inline float64 float64_pack_raw(FloatParts p)
324 {
325     return make_float64(pack_raw(float64_params, p));
326 }
327 
328 /*----------------------------------------------------------------------------
329 | Functions and definitions to determine:  (1) whether tininess for underflow
330 | is detected before or after rounding by default, (2) what (if anything)
331 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
332 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
333 | are propagated from function inputs to output.  These details are target-
334 | specific.
335 *----------------------------------------------------------------------------*/
336 #include "softfloat-specialize.h"
337 
338 /* Canonicalize EXP and FRAC, setting CLS.  */
339 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
340                                float_status *status)
341 {
342     if (part.exp == parm->exp_max && !parm->arm_althp) {
343         if (part.frac == 0) {
344             part.cls = float_class_inf;
345         } else {
346             part.frac <<= parm->frac_shift;
347             part.cls = (parts_is_snan_frac(part.frac, status)
348                         ? float_class_snan : float_class_qnan);
349         }
350     } else if (part.exp == 0) {
351         if (likely(part.frac == 0)) {
352             part.cls = float_class_zero;
353         } else if (status->flush_inputs_to_zero) {
354             float_raise(float_flag_input_denormal, status);
355             part.cls = float_class_zero;
356             part.frac = 0;
357         } else {
358             int shift = clz64(part.frac) - 1;
359             part.cls = float_class_normal;
360             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
361             part.frac <<= shift;
362         }
363     } else {
364         part.cls = float_class_normal;
365         part.exp -= parm->exp_bias;
366         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
367     }
368     return part;
369 }
370 
371 /* Round and uncanonicalize a floating-point number by parts. There
372  * are FRAC_SHIFT bits that may require rounding at the bottom of the
373  * fraction; these bits will be removed. The exponent will be biased
374  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
375  */
376 
377 static FloatParts round_canonical(FloatParts p, float_status *s,
378                                   const FloatFmt *parm)
379 {
380     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
381     const uint64_t round_mask = parm->round_mask;
382     const uint64_t roundeven_mask = parm->roundeven_mask;
383     const int exp_max = parm->exp_max;
384     const int frac_shift = parm->frac_shift;
385     uint64_t frac, inc;
386     int exp, flags = 0;
387     bool overflow_norm;
388 
389     frac = p.frac;
390     exp = p.exp;
391 
392     switch (p.cls) {
393     case float_class_normal:
394         switch (s->float_rounding_mode) {
395         case float_round_nearest_even:
396             overflow_norm = false;
397             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
398             break;
399         case float_round_ties_away:
400             overflow_norm = false;
401             inc = frac_lsbm1;
402             break;
403         case float_round_to_zero:
404             overflow_norm = true;
405             inc = 0;
406             break;
407         case float_round_up:
408             inc = p.sign ? 0 : round_mask;
409             overflow_norm = p.sign;
410             break;
411         case float_round_down:
412             inc = p.sign ? round_mask : 0;
413             overflow_norm = !p.sign;
414             break;
415         default:
416             g_assert_not_reached();
417         }
418 
419         exp += parm->exp_bias;
420         if (likely(exp > 0)) {
421             if (frac & round_mask) {
422                 flags |= float_flag_inexact;
423                 frac += inc;
424                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
425                     frac >>= 1;
426                     exp++;
427                 }
428             }
429             frac >>= frac_shift;
430 
431             if (parm->arm_althp) {
432                 /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
433                 if (unlikely(exp > exp_max)) {
434                     /* Overflow.  Return the maximum normal.  */
435                     flags = float_flag_invalid;
436                     exp = exp_max;
437                     frac = -1;
438                 }
439             } else if (unlikely(exp >= exp_max)) {
440                 flags |= float_flag_overflow | float_flag_inexact;
441                 if (overflow_norm) {
442                     exp = exp_max - 1;
443                     frac = -1;
444                 } else {
445                     p.cls = float_class_inf;
446                     goto do_inf;
447                 }
448             }
449         } else if (s->flush_to_zero) {
450             flags |= float_flag_output_denormal;
451             p.cls = float_class_zero;
452             goto do_zero;
453         } else {
454             bool is_tiny = (s->float_detect_tininess
455                             == float_tininess_before_rounding)
456                         || (exp < 0)
457                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
458 
459             shift64RightJamming(frac, 1 - exp, &frac);
460             if (frac & round_mask) {
461                 /* Need to recompute round-to-even.  */
462                 if (s->float_rounding_mode == float_round_nearest_even) {
463                     inc = ((frac & roundeven_mask) != frac_lsbm1
464                            ? frac_lsbm1 : 0);
465                 }
466                 flags |= float_flag_inexact;
467                 frac += inc;
468             }
469 
470             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
471             frac >>= frac_shift;
472 
473             if (is_tiny && (flags & float_flag_inexact)) {
474                 flags |= float_flag_underflow;
475             }
476             if (exp == 0 && frac == 0) {
477                 p.cls = float_class_zero;
478             }
479         }
480         break;
481 
482     case float_class_zero:
483     do_zero:
484         exp = 0;
485         frac = 0;
486         break;
487 
488     case float_class_inf:
489     do_inf:
490         assert(!parm->arm_althp);
491         exp = exp_max;
492         frac = 0;
493         break;
494 
495     case float_class_qnan:
496     case float_class_snan:
497         assert(!parm->arm_althp);
498         exp = exp_max;
499         frac >>= parm->frac_shift;
500         break;
501 
502     default:
503         g_assert_not_reached();
504     }
505 
506     float_raise(flags, s);
507     p.exp = exp;
508     p.frac = frac;
509     return p;
510 }
511 
512 /* Explicit FloatFmt version */
513 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
514                                             const FloatFmt *params)
515 {
516     return canonicalize(float16_unpack_raw(f), params, s);
517 }
518 
519 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
520 {
521     return float16a_unpack_canonical(f, s, &float16_params);
522 }
523 
524 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
525                                              const FloatFmt *params)
526 {
527     return float16_pack_raw(round_canonical(p, s, params));
528 }
529 
530 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
531 {
532     return float16a_round_pack_canonical(p, s, &float16_params);
533 }
534 
535 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
536 {
537     return canonicalize(float32_unpack_raw(f), &float32_params, s);
538 }
539 
540 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
541 {
542     return float32_pack_raw(round_canonical(p, s, &float32_params));
543 }
544 
545 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
546 {
547     return canonicalize(float64_unpack_raw(f), &float64_params, s);
548 }
549 
550 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
551 {
552     return float64_pack_raw(round_canonical(p, s, &float64_params));
553 }
554 
555 static FloatParts return_nan(FloatParts a, float_status *s)
556 {
557     switch (a.cls) {
558     case float_class_snan:
559         s->float_exception_flags |= float_flag_invalid;
560         a = parts_silence_nan(a, s);
561         /* fall through */
562     case float_class_qnan:
563         if (s->default_nan_mode) {
564             return parts_default_nan(s);
565         }
566         break;
567 
568     default:
569         g_assert_not_reached();
570     }
571     return a;
572 }
573 
574 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
575 {
576     if (is_snan(a.cls) || is_snan(b.cls)) {
577         s->float_exception_flags |= float_flag_invalid;
578     }
579 
580     if (s->default_nan_mode) {
581         return parts_default_nan(s);
582     } else {
583         if (pickNaN(a.cls, b.cls,
584                     a.frac > b.frac ||
585                     (a.frac == b.frac && a.sign < b.sign))) {
586             a = b;
587         }
588         if (is_snan(a.cls)) {
589             return parts_silence_nan(a, s);
590         }
591     }
592     return a;
593 }
594 
595 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
596                                   bool inf_zero, float_status *s)
597 {
598     int which;
599 
600     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
601         s->float_exception_flags |= float_flag_invalid;
602     }
603 
604     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
605 
606     if (s->default_nan_mode) {
607         /* Note that this check is after pickNaNMulAdd so that function
608          * has an opportunity to set the Invalid flag.
609          */
610         which = 3;
611     }
612 
613     switch (which) {
614     case 0:
615         break;
616     case 1:
617         a = b;
618         break;
619     case 2:
620         a = c;
621         break;
622     case 3:
623         return parts_default_nan(s);
624     default:
625         g_assert_not_reached();
626     }
627 
628     if (is_snan(a.cls)) {
629         return parts_silence_nan(a, s);
630     }
631     return a;
632 }
633 
634 /*
635  * Returns the result of adding or subtracting the values of the
636  * floating-point values `a' and `b'. The operation is performed
637  * according to the IEC/IEEE Standard for Binary Floating-Point
638  * Arithmetic.
639  */
640 
641 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
642                                 float_status *s)
643 {
644     bool a_sign = a.sign;
645     bool b_sign = b.sign ^ subtract;
646 
647     if (a_sign != b_sign) {
648         /* Subtraction */
649 
650         if (a.cls == float_class_normal && b.cls == float_class_normal) {
651             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
652                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
653                 a.frac = a.frac - b.frac;
654             } else {
655                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
656                 a.frac = b.frac - a.frac;
657                 a.exp = b.exp;
658                 a_sign ^= 1;
659             }
660 
661             if (a.frac == 0) {
662                 a.cls = float_class_zero;
663                 a.sign = s->float_rounding_mode == float_round_down;
664             } else {
665                 int shift = clz64(a.frac) - 1;
666                 a.frac = a.frac << shift;
667                 a.exp = a.exp - shift;
668                 a.sign = a_sign;
669             }
670             return a;
671         }
672         if (is_nan(a.cls) || is_nan(b.cls)) {
673             return pick_nan(a, b, s);
674         }
675         if (a.cls == float_class_inf) {
676             if (b.cls == float_class_inf) {
677                 float_raise(float_flag_invalid, s);
678                 return parts_default_nan(s);
679             }
680             return a;
681         }
682         if (a.cls == float_class_zero && b.cls == float_class_zero) {
683             a.sign = s->float_rounding_mode == float_round_down;
684             return a;
685         }
686         if (a.cls == float_class_zero || b.cls == float_class_inf) {
687             b.sign = a_sign ^ 1;
688             return b;
689         }
690         if (b.cls == float_class_zero) {
691             return a;
692         }
693     } else {
694         /* Addition */
695         if (a.cls == float_class_normal && b.cls == float_class_normal) {
696             if (a.exp > b.exp) {
697                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
698             } else if (a.exp < b.exp) {
699                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
700                 a.exp = b.exp;
701             }
702             a.frac += b.frac;
703             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
704                 shift64RightJamming(a.frac, 1, &a.frac);
705                 a.exp += 1;
706             }
707             return a;
708         }
709         if (is_nan(a.cls) || is_nan(b.cls)) {
710             return pick_nan(a, b, s);
711         }
712         if (a.cls == float_class_inf || b.cls == float_class_zero) {
713             return a;
714         }
715         if (b.cls == float_class_inf || a.cls == float_class_zero) {
716             b.sign = b_sign;
717             return b;
718         }
719     }
720     g_assert_not_reached();
721 }
722 
723 /*
724  * Returns the result of adding or subtracting the floating-point
725  * values `a' and `b'. The operation is performed according to the
726  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
727  */
728 
729 float16  __attribute__((flatten)) float16_add(float16 a, float16 b,
730                                               float_status *status)
731 {
732     FloatParts pa = float16_unpack_canonical(a, status);
733     FloatParts pb = float16_unpack_canonical(b, status);
734     FloatParts pr = addsub_floats(pa, pb, false, status);
735 
736     return float16_round_pack_canonical(pr, status);
737 }
738 
739 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
740                                              float_status *status)
741 {
742     FloatParts pa = float32_unpack_canonical(a, status);
743     FloatParts pb = float32_unpack_canonical(b, status);
744     FloatParts pr = addsub_floats(pa, pb, false, status);
745 
746     return float32_round_pack_canonical(pr, status);
747 }
748 
749 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
750                                              float_status *status)
751 {
752     FloatParts pa = float64_unpack_canonical(a, status);
753     FloatParts pb = float64_unpack_canonical(b, status);
754     FloatParts pr = addsub_floats(pa, pb, false, status);
755 
756     return float64_round_pack_canonical(pr, status);
757 }
758 
759 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
760                                              float_status *status)
761 {
762     FloatParts pa = float16_unpack_canonical(a, status);
763     FloatParts pb = float16_unpack_canonical(b, status);
764     FloatParts pr = addsub_floats(pa, pb, true, status);
765 
766     return float16_round_pack_canonical(pr, status);
767 }
768 
769 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
770                                              float_status *status)
771 {
772     FloatParts pa = float32_unpack_canonical(a, status);
773     FloatParts pb = float32_unpack_canonical(b, status);
774     FloatParts pr = addsub_floats(pa, pb, true, status);
775 
776     return float32_round_pack_canonical(pr, status);
777 }
778 
779 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
780                                              float_status *status)
781 {
782     FloatParts pa = float64_unpack_canonical(a, status);
783     FloatParts pb = float64_unpack_canonical(b, status);
784     FloatParts pr = addsub_floats(pa, pb, true, status);
785 
786     return float64_round_pack_canonical(pr, status);
787 }
788 
789 /*
790  * Returns the result of multiplying the floating-point values `a' and
791  * `b'. The operation is performed according to the IEC/IEEE Standard
792  * for Binary Floating-Point Arithmetic.
793  */
794 
795 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
796 {
797     bool sign = a.sign ^ b.sign;
798 
799     if (a.cls == float_class_normal && b.cls == float_class_normal) {
800         uint64_t hi, lo;
801         int exp = a.exp + b.exp;
802 
803         mul64To128(a.frac, b.frac, &hi, &lo);
804         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
805         if (lo & DECOMPOSED_OVERFLOW_BIT) {
806             shift64RightJamming(lo, 1, &lo);
807             exp += 1;
808         }
809 
810         /* Re-use a */
811         a.exp = exp;
812         a.sign = sign;
813         a.frac = lo;
814         return a;
815     }
816     /* handle all the NaN cases */
817     if (is_nan(a.cls) || is_nan(b.cls)) {
818         return pick_nan(a, b, s);
819     }
820     /* Inf * Zero == NaN */
821     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
822         (a.cls == float_class_zero && b.cls == float_class_inf)) {
823         s->float_exception_flags |= float_flag_invalid;
824         return parts_default_nan(s);
825     }
826     /* Multiply by 0 or Inf */
827     if (a.cls == float_class_inf || a.cls == float_class_zero) {
828         a.sign = sign;
829         return a;
830     }
831     if (b.cls == float_class_inf || b.cls == float_class_zero) {
832         b.sign = sign;
833         return b;
834     }
835     g_assert_not_reached();
836 }
837 
838 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
839                                              float_status *status)
840 {
841     FloatParts pa = float16_unpack_canonical(a, status);
842     FloatParts pb = float16_unpack_canonical(b, status);
843     FloatParts pr = mul_floats(pa, pb, status);
844 
845     return float16_round_pack_canonical(pr, status);
846 }
847 
848 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
849                                              float_status *status)
850 {
851     FloatParts pa = float32_unpack_canonical(a, status);
852     FloatParts pb = float32_unpack_canonical(b, status);
853     FloatParts pr = mul_floats(pa, pb, status);
854 
855     return float32_round_pack_canonical(pr, status);
856 }
857 
858 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
859                                              float_status *status)
860 {
861     FloatParts pa = float64_unpack_canonical(a, status);
862     FloatParts pb = float64_unpack_canonical(b, status);
863     FloatParts pr = mul_floats(pa, pb, status);
864 
865     return float64_round_pack_canonical(pr, status);
866 }
867 
868 /*
869  * Returns the result of multiplying the floating-point values `a' and
870  * `b' then adding 'c', with no intermediate rounding step after the
871  * multiplication. The operation is performed according to the
872  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
873  * The flags argument allows the caller to select negation of the
874  * addend, the intermediate product, or the final result. (The
875  * difference between this and having the caller do a separate
876  * negation is that negating externally will flip the sign bit on
877  * NaNs.)
878  */
879 
880 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
881                                 int flags, float_status *s)
882 {
883     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
884                     ((1 << float_class_inf) | (1 << float_class_zero));
885     bool p_sign;
886     bool sign_flip = flags & float_muladd_negate_result;
887     FloatClass p_class;
888     uint64_t hi, lo;
889     int p_exp;
890 
891     /* It is implementation-defined whether the cases of (0,inf,qnan)
892      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
893      * they return if they do), so we have to hand this information
894      * off to the target-specific pick-a-NaN routine.
895      */
896     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
897         return pick_nan_muladd(a, b, c, inf_zero, s);
898     }
899 
900     if (inf_zero) {
901         s->float_exception_flags |= float_flag_invalid;
902         return parts_default_nan(s);
903     }
904 
905     if (flags & float_muladd_negate_c) {
906         c.sign ^= 1;
907     }
908 
909     p_sign = a.sign ^ b.sign;
910 
911     if (flags & float_muladd_negate_product) {
912         p_sign ^= 1;
913     }
914 
915     if (a.cls == float_class_inf || b.cls == float_class_inf) {
916         p_class = float_class_inf;
917     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
918         p_class = float_class_zero;
919     } else {
920         p_class = float_class_normal;
921     }
922 
923     if (c.cls == float_class_inf) {
924         if (p_class == float_class_inf && p_sign != c.sign) {
925             s->float_exception_flags |= float_flag_invalid;
926             return parts_default_nan(s);
927         } else {
928             a.cls = float_class_inf;
929             a.sign = c.sign ^ sign_flip;
930             return a;
931         }
932     }
933 
934     if (p_class == float_class_inf) {
935         a.cls = float_class_inf;
936         a.sign = p_sign ^ sign_flip;
937         return a;
938     }
939 
940     if (p_class == float_class_zero) {
941         if (c.cls == float_class_zero) {
942             if (p_sign != c.sign) {
943                 p_sign = s->float_rounding_mode == float_round_down;
944             }
945             c.sign = p_sign;
946         } else if (flags & float_muladd_halve_result) {
947             c.exp -= 1;
948         }
949         c.sign ^= sign_flip;
950         return c;
951     }
952 
953     /* a & b should be normals now... */
954     assert(a.cls == float_class_normal &&
955            b.cls == float_class_normal);
956 
957     p_exp = a.exp + b.exp;
958 
959     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
960      * result.
961      */
962     mul64To128(a.frac, b.frac, &hi, &lo);
963     /* binary point now at bit 124 */
964 
965     /* check for overflow */
966     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
967         shift128RightJamming(hi, lo, 1, &hi, &lo);
968         p_exp += 1;
969     }
970 
971     /* + add/sub */
972     if (c.cls == float_class_zero) {
973         /* move binary point back to 62 */
974         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
975     } else {
976         int exp_diff = p_exp - c.exp;
977         if (p_sign == c.sign) {
978             /* Addition */
979             if (exp_diff <= 0) {
980                 shift128RightJamming(hi, lo,
981                                      DECOMPOSED_BINARY_POINT - exp_diff,
982                                      &hi, &lo);
983                 lo += c.frac;
984                 p_exp = c.exp;
985             } else {
986                 uint64_t c_hi, c_lo;
987                 /* shift c to the same binary point as the product (124) */
988                 c_hi = c.frac >> 2;
989                 c_lo = 0;
990                 shift128RightJamming(c_hi, c_lo,
991                                      exp_diff,
992                                      &c_hi, &c_lo);
993                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
994                 /* move binary point back to 62 */
995                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
996             }
997 
998             if (lo & DECOMPOSED_OVERFLOW_BIT) {
999                 shift64RightJamming(lo, 1, &lo);
1000                 p_exp += 1;
1001             }
1002 
1003         } else {
1004             /* Subtraction */
1005             uint64_t c_hi, c_lo;
1006             /* make C binary point match product at bit 124 */
1007             c_hi = c.frac >> 2;
1008             c_lo = 0;
1009 
1010             if (exp_diff <= 0) {
1011                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1012                 if (exp_diff == 0
1013                     &&
1014                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1015                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1016                 } else {
1017                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1018                     p_sign ^= 1;
1019                     p_exp = c.exp;
1020                 }
1021             } else {
1022                 shift128RightJamming(c_hi, c_lo,
1023                                      exp_diff,
1024                                      &c_hi, &c_lo);
1025                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1026             }
1027 
1028             if (hi == 0 && lo == 0) {
1029                 a.cls = float_class_zero;
1030                 a.sign = s->float_rounding_mode == float_round_down;
1031                 a.sign ^= sign_flip;
1032                 return a;
1033             } else {
1034                 int shift;
1035                 if (hi != 0) {
1036                     shift = clz64(hi);
1037                 } else {
1038                     shift = clz64(lo) + 64;
1039                 }
1040                 /* Normalizing to a binary point of 124 is the
1041                    correct adjust for the exponent.  However since we're
1042                    shifting, we might as well put the binary point back
1043                    at 62 where we really want it.  Therefore shift as
1044                    if we're leaving 1 bit at the top of the word, but
1045                    adjust the exponent as if we're leaving 3 bits.  */
1046                 shift -= 1;
1047                 if (shift >= 64) {
1048                     lo = lo << (shift - 64);
1049                 } else {
1050                     hi = (hi << shift) | (lo >> (64 - shift));
1051                     lo = hi | ((lo << shift) != 0);
1052                 }
1053                 p_exp -= shift - 2;
1054             }
1055         }
1056     }
1057 
1058     if (flags & float_muladd_halve_result) {
1059         p_exp -= 1;
1060     }
1061 
1062     /* finally prepare our result */
1063     a.cls = float_class_normal;
1064     a.sign = p_sign ^ sign_flip;
1065     a.exp = p_exp;
1066     a.frac = lo;
1067 
1068     return a;
1069 }
1070 
1071 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1072                                                 int flags, float_status *status)
1073 {
1074     FloatParts pa = float16_unpack_canonical(a, status);
1075     FloatParts pb = float16_unpack_canonical(b, status);
1076     FloatParts pc = float16_unpack_canonical(c, status);
1077     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1078 
1079     return float16_round_pack_canonical(pr, status);
1080 }
1081 
1082 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1083                                                 int flags, float_status *status)
1084 {
1085     FloatParts pa = float32_unpack_canonical(a, status);
1086     FloatParts pb = float32_unpack_canonical(b, status);
1087     FloatParts pc = float32_unpack_canonical(c, status);
1088     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1089 
1090     return float32_round_pack_canonical(pr, status);
1091 }
1092 
1093 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1094                                                 int flags, float_status *status)
1095 {
1096     FloatParts pa = float64_unpack_canonical(a, status);
1097     FloatParts pb = float64_unpack_canonical(b, status);
1098     FloatParts pc = float64_unpack_canonical(c, status);
1099     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1100 
1101     return float64_round_pack_canonical(pr, status);
1102 }
1103 
1104 /*
1105  * Returns the result of dividing the floating-point value `a' by the
1106  * corresponding value `b'. The operation is performed according to
1107  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1108  */
1109 
1110 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1111 {
1112     bool sign = a.sign ^ b.sign;
1113 
1114     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1115         uint64_t n0, n1, q, r;
1116         int exp = a.exp - b.exp;
1117 
1118         /*
1119          * We want a 2*N / N-bit division to produce exactly an N-bit
1120          * result, so that we do not lose any precision and so that we
1121          * do not have to renormalize afterward.  If A.frac < B.frac,
1122          * then division would produce an (N-1)-bit result; shift A left
1123          * by one to produce the an N-bit result, and decrement the
1124          * exponent to match.
1125          *
1126          * The udiv_qrnnd algorithm that we're using requires normalization,
1127          * i.e. the msb of the denominator must be set.  Since we know that
1128          * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1129          * by one (more), and the remainder must be shifted right by one.
1130          */
1131         if (a.frac < b.frac) {
1132             exp -= 1;
1133             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
1134         } else {
1135             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1136         }
1137         q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1138 
1139         /*
1140          * Set lsb if there is a remainder, to set inexact.
1141          * As mentioned above, to find the actual value of the remainder we
1142          * would need to shift right, but (1) we are only concerned about
1143          * non-zero-ness, and (2) the remainder will always be even because
1144          * both inputs to the division primitive are even.
1145          */
1146         a.frac = q | (r != 0);
1147         a.sign = sign;
1148         a.exp = exp;
1149         return a;
1150     }
1151     /* handle all the NaN cases */
1152     if (is_nan(a.cls) || is_nan(b.cls)) {
1153         return pick_nan(a, b, s);
1154     }
1155     /* 0/0 or Inf/Inf */
1156     if (a.cls == b.cls
1157         &&
1158         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1159         s->float_exception_flags |= float_flag_invalid;
1160         return parts_default_nan(s);
1161     }
1162     /* Inf / x or 0 / x */
1163     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1164         a.sign = sign;
1165         return a;
1166     }
1167     /* Div 0 => Inf */
1168     if (b.cls == float_class_zero) {
1169         s->float_exception_flags |= float_flag_divbyzero;
1170         a.cls = float_class_inf;
1171         a.sign = sign;
1172         return a;
1173     }
1174     /* Div by Inf */
1175     if (b.cls == float_class_inf) {
1176         a.cls = float_class_zero;
1177         a.sign = sign;
1178         return a;
1179     }
1180     g_assert_not_reached();
1181 }
1182 
1183 float16 float16_div(float16 a, float16 b, float_status *status)
1184 {
1185     FloatParts pa = float16_unpack_canonical(a, status);
1186     FloatParts pb = float16_unpack_canonical(b, status);
1187     FloatParts pr = div_floats(pa, pb, status);
1188 
1189     return float16_round_pack_canonical(pr, status);
1190 }
1191 
1192 float32 float32_div(float32 a, float32 b, float_status *status)
1193 {
1194     FloatParts pa = float32_unpack_canonical(a, status);
1195     FloatParts pb = float32_unpack_canonical(b, status);
1196     FloatParts pr = div_floats(pa, pb, status);
1197 
1198     return float32_round_pack_canonical(pr, status);
1199 }
1200 
1201 float64 float64_div(float64 a, float64 b, float_status *status)
1202 {
1203     FloatParts pa = float64_unpack_canonical(a, status);
1204     FloatParts pb = float64_unpack_canonical(b, status);
1205     FloatParts pr = div_floats(pa, pb, status);
1206 
1207     return float64_round_pack_canonical(pr, status);
1208 }
1209 
1210 /*
1211  * Float to Float conversions
1212  *
1213  * Returns the result of converting one float format to another. The
1214  * conversion is performed according to the IEC/IEEE Standard for
1215  * Binary Floating-Point Arithmetic.
1216  *
1217  * The float_to_float helper only needs to take care of raising
1218  * invalid exceptions and handling the conversion on NaNs.
1219  */
1220 
1221 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1222                                  float_status *s)
1223 {
1224     if (dstf->arm_althp) {
1225         switch (a.cls) {
1226         case float_class_qnan:
1227         case float_class_snan:
1228             /* There is no NaN in the destination format.  Raise Invalid
1229              * and return a zero with the sign of the input NaN.
1230              */
1231             s->float_exception_flags |= float_flag_invalid;
1232             a.cls = float_class_zero;
1233             a.frac = 0;
1234             a.exp = 0;
1235             break;
1236 
1237         case float_class_inf:
1238             /* There is no Inf in the destination format.  Raise Invalid
1239              * and return the maximum normal with the correct sign.
1240              */
1241             s->float_exception_flags |= float_flag_invalid;
1242             a.cls = float_class_normal;
1243             a.exp = dstf->exp_max;
1244             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1245             break;
1246 
1247         default:
1248             break;
1249         }
1250     } else if (is_nan(a.cls)) {
1251         if (is_snan(a.cls)) {
1252             s->float_exception_flags |= float_flag_invalid;
1253             a = parts_silence_nan(a, s);
1254         }
1255         if (s->default_nan_mode) {
1256             return parts_default_nan(s);
1257         }
1258     }
1259     return a;
1260 }
1261 
1262 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1263 {
1264     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1265     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1266     FloatParts pr = float_to_float(p, &float32_params, s);
1267     return float32_round_pack_canonical(pr, s);
1268 }
1269 
1270 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1271 {
1272     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1273     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1274     FloatParts pr = float_to_float(p, &float64_params, s);
1275     return float64_round_pack_canonical(pr, s);
1276 }
1277 
1278 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1279 {
1280     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1281     FloatParts p = float32_unpack_canonical(a, s);
1282     FloatParts pr = float_to_float(p, fmt16, s);
1283     return float16a_round_pack_canonical(pr, s, fmt16);
1284 }
1285 
1286 float64 float32_to_float64(float32 a, float_status *s)
1287 {
1288     FloatParts p = float32_unpack_canonical(a, s);
1289     FloatParts pr = float_to_float(p, &float64_params, s);
1290     return float64_round_pack_canonical(pr, s);
1291 }
1292 
1293 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1294 {
1295     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1296     FloatParts p = float64_unpack_canonical(a, s);
1297     FloatParts pr = float_to_float(p, fmt16, s);
1298     return float16a_round_pack_canonical(pr, s, fmt16);
1299 }
1300 
1301 float32 float64_to_float32(float64 a, float_status *s)
1302 {
1303     FloatParts p = float64_unpack_canonical(a, s);
1304     FloatParts pr = float_to_float(p, &float32_params, s);
1305     return float32_round_pack_canonical(pr, s);
1306 }
1307 
1308 /*
1309  * Rounds the floating-point value `a' to an integer, and returns the
1310  * result as a floating-point value. The operation is performed
1311  * according to the IEC/IEEE Standard for Binary Floating-Point
1312  * Arithmetic.
1313  */
1314 
1315 static FloatParts round_to_int(FloatParts a, int rmode,
1316                                int scale, float_status *s)
1317 {
1318     switch (a.cls) {
1319     case float_class_qnan:
1320     case float_class_snan:
1321         return return_nan(a, s);
1322 
1323     case float_class_zero:
1324     case float_class_inf:
1325         /* already "integral" */
1326         break;
1327 
1328     case float_class_normal:
1329         scale = MIN(MAX(scale, -0x10000), 0x10000);
1330         a.exp += scale;
1331 
1332         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1333             /* already integral */
1334             break;
1335         }
1336         if (a.exp < 0) {
1337             bool one;
1338             /* all fractional */
1339             s->float_exception_flags |= float_flag_inexact;
1340             switch (rmode) {
1341             case float_round_nearest_even:
1342                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1343                 break;
1344             case float_round_ties_away:
1345                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1346                 break;
1347             case float_round_to_zero:
1348                 one = false;
1349                 break;
1350             case float_round_up:
1351                 one = !a.sign;
1352                 break;
1353             case float_round_down:
1354                 one = a.sign;
1355                 break;
1356             default:
1357                 g_assert_not_reached();
1358             }
1359 
1360             if (one) {
1361                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1362                 a.exp = 0;
1363             } else {
1364                 a.cls = float_class_zero;
1365             }
1366         } else {
1367             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1368             uint64_t frac_lsbm1 = frac_lsb >> 1;
1369             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1370             uint64_t rnd_mask = rnd_even_mask >> 1;
1371             uint64_t inc;
1372 
1373             switch (rmode) {
1374             case float_round_nearest_even:
1375                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1376                 break;
1377             case float_round_ties_away:
1378                 inc = frac_lsbm1;
1379                 break;
1380             case float_round_to_zero:
1381                 inc = 0;
1382                 break;
1383             case float_round_up:
1384                 inc = a.sign ? 0 : rnd_mask;
1385                 break;
1386             case float_round_down:
1387                 inc = a.sign ? rnd_mask : 0;
1388                 break;
1389             default:
1390                 g_assert_not_reached();
1391             }
1392 
1393             if (a.frac & rnd_mask) {
1394                 s->float_exception_flags |= float_flag_inexact;
1395                 a.frac += inc;
1396                 a.frac &= ~rnd_mask;
1397                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1398                     a.frac >>= 1;
1399                     a.exp++;
1400                 }
1401             }
1402         }
1403         break;
1404     default:
1405         g_assert_not_reached();
1406     }
1407     return a;
1408 }
1409 
1410 float16 float16_round_to_int(float16 a, float_status *s)
1411 {
1412     FloatParts pa = float16_unpack_canonical(a, s);
1413     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1414     return float16_round_pack_canonical(pr, s);
1415 }
1416 
1417 float32 float32_round_to_int(float32 a, float_status *s)
1418 {
1419     FloatParts pa = float32_unpack_canonical(a, s);
1420     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1421     return float32_round_pack_canonical(pr, s);
1422 }
1423 
1424 float64 float64_round_to_int(float64 a, float_status *s)
1425 {
1426     FloatParts pa = float64_unpack_canonical(a, s);
1427     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1428     return float64_round_pack_canonical(pr, s);
1429 }
1430 
1431 /*
1432  * Returns the result of converting the floating-point value `a' to
1433  * the two's complement integer format. The conversion is performed
1434  * according to the IEC/IEEE Standard for Binary Floating-Point
1435  * Arithmetic---which means in particular that the conversion is
1436  * rounded according to the current rounding mode. If `a' is a NaN,
1437  * the largest positive integer is returned. Otherwise, if the
1438  * conversion overflows, the largest integer with the same sign as `a'
1439  * is returned.
1440 */
1441 
1442 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
1443                                      int64_t min, int64_t max,
1444                                      float_status *s)
1445 {
1446     uint64_t r;
1447     int orig_flags = get_float_exception_flags(s);
1448     FloatParts p = round_to_int(in, rmode, scale, s);
1449 
1450     switch (p.cls) {
1451     case float_class_snan:
1452     case float_class_qnan:
1453         s->float_exception_flags = orig_flags | float_flag_invalid;
1454         return max;
1455     case float_class_inf:
1456         s->float_exception_flags = orig_flags | float_flag_invalid;
1457         return p.sign ? min : max;
1458     case float_class_zero:
1459         return 0;
1460     case float_class_normal:
1461         if (p.exp < DECOMPOSED_BINARY_POINT) {
1462             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1463         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1464             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1465         } else {
1466             r = UINT64_MAX;
1467         }
1468         if (p.sign) {
1469             if (r <= -(uint64_t) min) {
1470                 return -r;
1471             } else {
1472                 s->float_exception_flags = orig_flags | float_flag_invalid;
1473                 return min;
1474             }
1475         } else {
1476             if (r <= max) {
1477                 return r;
1478             } else {
1479                 s->float_exception_flags = orig_flags | float_flag_invalid;
1480                 return max;
1481             }
1482         }
1483     default:
1484         g_assert_not_reached();
1485     }
1486 }
1487 
1488 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
1489                                 float_status *s)
1490 {
1491     return round_to_int_and_pack(float16_unpack_canonical(a, s),
1492                                  rmode, scale, INT16_MIN, INT16_MAX, s);
1493 }
1494 
1495 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
1496                                 float_status *s)
1497 {
1498     return round_to_int_and_pack(float16_unpack_canonical(a, s),
1499                                  rmode, scale, INT32_MIN, INT32_MAX, s);
1500 }
1501 
1502 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
1503                                 float_status *s)
1504 {
1505     return round_to_int_and_pack(float16_unpack_canonical(a, s),
1506                                  rmode, scale, INT64_MIN, INT64_MAX, s);
1507 }
1508 
1509 int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale,
1510                                 float_status *s)
1511 {
1512     return round_to_int_and_pack(float32_unpack_canonical(a, s),
1513                                  rmode, scale, INT16_MIN, INT16_MAX, s);
1514 }
1515 
1516 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale,
1517                                 float_status *s)
1518 {
1519     return round_to_int_and_pack(float32_unpack_canonical(a, s),
1520                                  rmode, scale, INT32_MIN, INT32_MAX, s);
1521 }
1522 
1523 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale,
1524                                 float_status *s)
1525 {
1526     return round_to_int_and_pack(float32_unpack_canonical(a, s),
1527                                  rmode, scale, INT64_MIN, INT64_MAX, s);
1528 }
1529 
1530 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale,
1531                                 float_status *s)
1532 {
1533     return round_to_int_and_pack(float64_unpack_canonical(a, s),
1534                                  rmode, scale, INT16_MIN, INT16_MAX, s);
1535 }
1536 
1537 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale,
1538                                 float_status *s)
1539 {
1540     return round_to_int_and_pack(float64_unpack_canonical(a, s),
1541                                  rmode, scale, INT32_MIN, INT32_MAX, s);
1542 }
1543 
1544 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale,
1545                                 float_status *s)
1546 {
1547     return round_to_int_and_pack(float64_unpack_canonical(a, s),
1548                                  rmode, scale, INT64_MIN, INT64_MAX, s);
1549 }
1550 
1551 int16_t float16_to_int16(float16 a, float_status *s)
1552 {
1553     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
1554 }
1555 
1556 int32_t float16_to_int32(float16 a, float_status *s)
1557 {
1558     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
1559 }
1560 
1561 int64_t float16_to_int64(float16 a, float_status *s)
1562 {
1563     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
1564 }
1565 
1566 int16_t float32_to_int16(float32 a, float_status *s)
1567 {
1568     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
1569 }
1570 
1571 int32_t float32_to_int32(float32 a, float_status *s)
1572 {
1573     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
1574 }
1575 
1576 int64_t float32_to_int64(float32 a, float_status *s)
1577 {
1578     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
1579 }
1580 
1581 int16_t float64_to_int16(float64 a, float_status *s)
1582 {
1583     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
1584 }
1585 
1586 int32_t float64_to_int32(float64 a, float_status *s)
1587 {
1588     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
1589 }
1590 
1591 int64_t float64_to_int64(float64 a, float_status *s)
1592 {
1593     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
1594 }
1595 
1596 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
1597 {
1598     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
1599 }
1600 
1601 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
1602 {
1603     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
1604 }
1605 
1606 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
1607 {
1608     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
1609 }
1610 
1611 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
1612 {
1613     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
1614 }
1615 
1616 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
1617 {
1618     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
1619 }
1620 
1621 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
1622 {
1623     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
1624 }
1625 
1626 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
1627 {
1628     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
1629 }
1630 
1631 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
1632 {
1633     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
1634 }
1635 
1636 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
1637 {
1638     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
1639 }
1640 
1641 /*
1642  *  Returns the result of converting the floating-point value `a' to
1643  *  the unsigned integer format. The conversion is performed according
1644  *  to the IEC/IEEE Standard for Binary Floating-Point
1645  *  Arithmetic---which means in particular that the conversion is
1646  *  rounded according to the current rounding mode. If `a' is a NaN,
1647  *  the largest unsigned integer is returned. Otherwise, if the
1648  *  conversion overflows, the largest unsigned integer is returned. If
1649  *  the 'a' is negative, the result is rounded and zero is returned;
1650  *  values that do not round to zero will raise the inexact exception
1651  *  flag.
1652  */
1653 
1654 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale,
1655                                        uint64_t max, float_status *s)
1656 {
1657     int orig_flags = get_float_exception_flags(s);
1658     FloatParts p = round_to_int(in, rmode, scale, s);
1659     uint64_t r;
1660 
1661     switch (p.cls) {
1662     case float_class_snan:
1663     case float_class_qnan:
1664         s->float_exception_flags = orig_flags | float_flag_invalid;
1665         return max;
1666     case float_class_inf:
1667         s->float_exception_flags = orig_flags | float_flag_invalid;
1668         return p.sign ? 0 : max;
1669     case float_class_zero:
1670         return 0;
1671     case float_class_normal:
1672         if (p.sign) {
1673             s->float_exception_flags = orig_flags | float_flag_invalid;
1674             return 0;
1675         }
1676 
1677         if (p.exp < DECOMPOSED_BINARY_POINT) {
1678             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1679         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1680             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1681         } else {
1682             s->float_exception_flags = orig_flags | float_flag_invalid;
1683             return max;
1684         }
1685 
1686         /* For uint64 this will never trip, but if p.exp is too large
1687          * to shift a decomposed fraction we shall have exited via the
1688          * 3rd leg above.
1689          */
1690         if (r > max) {
1691             s->float_exception_flags = orig_flags | float_flag_invalid;
1692             return max;
1693         }
1694         return r;
1695     default:
1696         g_assert_not_reached();
1697     }
1698 }
1699 
1700 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale,
1701                                   float_status *s)
1702 {
1703     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
1704                                   rmode, scale, UINT16_MAX, s);
1705 }
1706 
1707 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale,
1708                                   float_status *s)
1709 {
1710     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
1711                                   rmode, scale, UINT32_MAX, s);
1712 }
1713 
1714 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale,
1715                                   float_status *s)
1716 {
1717     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
1718                                   rmode, scale, UINT64_MAX, s);
1719 }
1720 
1721 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale,
1722                                   float_status *s)
1723 {
1724     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
1725                                   rmode, scale, UINT16_MAX, s);
1726 }
1727 
1728 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale,
1729                                   float_status *s)
1730 {
1731     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
1732                                   rmode, scale, UINT32_MAX, s);
1733 }
1734 
1735 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale,
1736                                   float_status *s)
1737 {
1738     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
1739                                   rmode, scale, UINT64_MAX, s);
1740 }
1741 
1742 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale,
1743                                   float_status *s)
1744 {
1745     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
1746                                   rmode, scale, UINT16_MAX, s);
1747 }
1748 
1749 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale,
1750                                   float_status *s)
1751 {
1752     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
1753                                   rmode, scale, UINT32_MAX, s);
1754 }
1755 
1756 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale,
1757                                   float_status *s)
1758 {
1759     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
1760                                   rmode, scale, UINT64_MAX, s);
1761 }
1762 
1763 uint16_t float16_to_uint16(float16 a, float_status *s)
1764 {
1765     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
1766 }
1767 
1768 uint32_t float16_to_uint32(float16 a, float_status *s)
1769 {
1770     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
1771 }
1772 
1773 uint64_t float16_to_uint64(float16 a, float_status *s)
1774 {
1775     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
1776 }
1777 
1778 uint16_t float32_to_uint16(float32 a, float_status *s)
1779 {
1780     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
1781 }
1782 
1783 uint32_t float32_to_uint32(float32 a, float_status *s)
1784 {
1785     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
1786 }
1787 
1788 uint64_t float32_to_uint64(float32 a, float_status *s)
1789 {
1790     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
1791 }
1792 
1793 uint16_t float64_to_uint16(float64 a, float_status *s)
1794 {
1795     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
1796 }
1797 
1798 uint32_t float64_to_uint32(float64 a, float_status *s)
1799 {
1800     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
1801 }
1802 
1803 uint64_t float64_to_uint64(float64 a, float_status *s)
1804 {
1805     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
1806 }
1807 
1808 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
1809 {
1810     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
1811 }
1812 
1813 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
1814 {
1815     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
1816 }
1817 
1818 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
1819 {
1820     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
1821 }
1822 
1823 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
1824 {
1825     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
1826 }
1827 
1828 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
1829 {
1830     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
1831 }
1832 
1833 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
1834 {
1835     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
1836 }
1837 
1838 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
1839 {
1840     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
1841 }
1842 
1843 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
1844 {
1845     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
1846 }
1847 
1848 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
1849 {
1850     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
1851 }
1852 
1853 /*
1854  * Integer to float conversions
1855  *
1856  * Returns the result of converting the two's complement integer `a'
1857  * to the floating-point format. The conversion is performed according
1858  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1859  */
1860 
1861 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
1862 {
1863     FloatParts r = { .sign = false };
1864 
1865     if (a == 0) {
1866         r.cls = float_class_zero;
1867     } else {
1868         uint64_t f = a;
1869         int shift;
1870 
1871         r.cls = float_class_normal;
1872         if (a < 0) {
1873             f = -f;
1874             r.sign = true;
1875         }
1876         shift = clz64(f) - 1;
1877         scale = MIN(MAX(scale, -0x10000), 0x10000);
1878 
1879         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
1880         r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
1881     }
1882 
1883     return r;
1884 }
1885 
1886 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
1887 {
1888     FloatParts pa = int_to_float(a, scale, status);
1889     return float16_round_pack_canonical(pa, status);
1890 }
1891 
1892 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
1893 {
1894     return int64_to_float16_scalbn(a, scale, status);
1895 }
1896 
1897 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
1898 {
1899     return int64_to_float16_scalbn(a, scale, status);
1900 }
1901 
1902 float16 int64_to_float16(int64_t a, float_status *status)
1903 {
1904     return int64_to_float16_scalbn(a, 0, status);
1905 }
1906 
1907 float16 int32_to_float16(int32_t a, float_status *status)
1908 {
1909     return int64_to_float16_scalbn(a, 0, status);
1910 }
1911 
1912 float16 int16_to_float16(int16_t a, float_status *status)
1913 {
1914     return int64_to_float16_scalbn(a, 0, status);
1915 }
1916 
1917 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
1918 {
1919     FloatParts pa = int_to_float(a, scale, status);
1920     return float32_round_pack_canonical(pa, status);
1921 }
1922 
1923 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
1924 {
1925     return int64_to_float32_scalbn(a, scale, status);
1926 }
1927 
1928 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
1929 {
1930     return int64_to_float32_scalbn(a, scale, status);
1931 }
1932 
1933 float32 int64_to_float32(int64_t a, float_status *status)
1934 {
1935     return int64_to_float32_scalbn(a, 0, status);
1936 }
1937 
1938 float32 int32_to_float32(int32_t a, float_status *status)
1939 {
1940     return int64_to_float32_scalbn(a, 0, status);
1941 }
1942 
1943 float32 int16_to_float32(int16_t a, float_status *status)
1944 {
1945     return int64_to_float32_scalbn(a, 0, status);
1946 }
1947 
1948 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
1949 {
1950     FloatParts pa = int_to_float(a, scale, status);
1951     return float64_round_pack_canonical(pa, status);
1952 }
1953 
1954 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
1955 {
1956     return int64_to_float64_scalbn(a, scale, status);
1957 }
1958 
1959 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
1960 {
1961     return int64_to_float64_scalbn(a, scale, status);
1962 }
1963 
1964 float64 int64_to_float64(int64_t a, float_status *status)
1965 {
1966     return int64_to_float64_scalbn(a, 0, status);
1967 }
1968 
1969 float64 int32_to_float64(int32_t a, float_status *status)
1970 {
1971     return int64_to_float64_scalbn(a, 0, status);
1972 }
1973 
1974 float64 int16_to_float64(int16_t a, float_status *status)
1975 {
1976     return int64_to_float64_scalbn(a, 0, status);
1977 }
1978 
1979 
1980 /*
1981  * Unsigned Integer to float conversions
1982  *
1983  * Returns the result of converting the unsigned integer `a' to the
1984  * floating-point format. The conversion is performed according to the
1985  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1986  */
1987 
1988 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
1989 {
1990     FloatParts r = { .sign = false };
1991 
1992     if (a == 0) {
1993         r.cls = float_class_zero;
1994     } else {
1995         scale = MIN(MAX(scale, -0x10000), 0x10000);
1996         r.cls = float_class_normal;
1997         if ((int64_t)a < 0) {
1998             r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
1999             shift64RightJamming(a, 1, &a);
2000             r.frac = a;
2001         } else {
2002             int shift = clz64(a) - 1;
2003             r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2004             r.frac = a << shift;
2005         }
2006     }
2007 
2008     return r;
2009 }
2010 
2011 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2012 {
2013     FloatParts pa = uint_to_float(a, scale, status);
2014     return float16_round_pack_canonical(pa, status);
2015 }
2016 
2017 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2018 {
2019     return uint64_to_float16_scalbn(a, scale, status);
2020 }
2021 
2022 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2023 {
2024     return uint64_to_float16_scalbn(a, scale, status);
2025 }
2026 
2027 float16 uint64_to_float16(uint64_t a, float_status *status)
2028 {
2029     return uint64_to_float16_scalbn(a, 0, status);
2030 }
2031 
2032 float16 uint32_to_float16(uint32_t a, float_status *status)
2033 {
2034     return uint64_to_float16_scalbn(a, 0, status);
2035 }
2036 
2037 float16 uint16_to_float16(uint16_t a, float_status *status)
2038 {
2039     return uint64_to_float16_scalbn(a, 0, status);
2040 }
2041 
2042 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2043 {
2044     FloatParts pa = uint_to_float(a, scale, status);
2045     return float32_round_pack_canonical(pa, status);
2046 }
2047 
2048 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2049 {
2050     return uint64_to_float32_scalbn(a, scale, status);
2051 }
2052 
2053 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2054 {
2055     return uint64_to_float32_scalbn(a, scale, status);
2056 }
2057 
2058 float32 uint64_to_float32(uint64_t a, float_status *status)
2059 {
2060     return uint64_to_float32_scalbn(a, 0, status);
2061 }
2062 
2063 float32 uint32_to_float32(uint32_t a, float_status *status)
2064 {
2065     return uint64_to_float32_scalbn(a, 0, status);
2066 }
2067 
2068 float32 uint16_to_float32(uint16_t a, float_status *status)
2069 {
2070     return uint64_to_float32_scalbn(a, 0, status);
2071 }
2072 
2073 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2074 {
2075     FloatParts pa = uint_to_float(a, scale, status);
2076     return float64_round_pack_canonical(pa, status);
2077 }
2078 
2079 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2080 {
2081     return uint64_to_float64_scalbn(a, scale, status);
2082 }
2083 
2084 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2085 {
2086     return uint64_to_float64_scalbn(a, scale, status);
2087 }
2088 
2089 float64 uint64_to_float64(uint64_t a, float_status *status)
2090 {
2091     return uint64_to_float64_scalbn(a, 0, status);
2092 }
2093 
2094 float64 uint32_to_float64(uint32_t a, float_status *status)
2095 {
2096     return uint64_to_float64_scalbn(a, 0, status);
2097 }
2098 
2099 float64 uint16_to_float64(uint16_t a, float_status *status)
2100 {
2101     return uint64_to_float64_scalbn(a, 0, status);
2102 }
2103 
2104 /* Float Min/Max */
2105 /* min() and max() functions. These can't be implemented as
2106  * 'compare and pick one input' because that would mishandle
2107  * NaNs and +0 vs -0.
2108  *
2109  * minnum() and maxnum() functions. These are similar to the min()
2110  * and max() functions but if one of the arguments is a QNaN and
2111  * the other is numerical then the numerical argument is returned.
2112  * SNaNs will get quietened before being returned.
2113  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2114  * and maxNum() operations. min() and max() are the typical min/max
2115  * semantics provided by many CPUs which predate that specification.
2116  *
2117  * minnummag() and maxnummag() functions correspond to minNumMag()
2118  * and minNumMag() from the IEEE-754 2008.
2119  */
2120 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2121                                 bool ieee, bool ismag, float_status *s)
2122 {
2123     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2124         if (ieee) {
2125             /* Takes two floating-point values `a' and `b', one of
2126              * which is a NaN, and returns the appropriate NaN
2127              * result. If either `a' or `b' is a signaling NaN,
2128              * the invalid exception is raised.
2129              */
2130             if (is_snan(a.cls) || is_snan(b.cls)) {
2131                 return pick_nan(a, b, s);
2132             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2133                 return b;
2134             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2135                 return a;
2136             }
2137         }
2138         return pick_nan(a, b, s);
2139     } else {
2140         int a_exp, b_exp;
2141 
2142         switch (a.cls) {
2143         case float_class_normal:
2144             a_exp = a.exp;
2145             break;
2146         case float_class_inf:
2147             a_exp = INT_MAX;
2148             break;
2149         case float_class_zero:
2150             a_exp = INT_MIN;
2151             break;
2152         default:
2153             g_assert_not_reached();
2154             break;
2155         }
2156         switch (b.cls) {
2157         case float_class_normal:
2158             b_exp = b.exp;
2159             break;
2160         case float_class_inf:
2161             b_exp = INT_MAX;
2162             break;
2163         case float_class_zero:
2164             b_exp = INT_MIN;
2165             break;
2166         default:
2167             g_assert_not_reached();
2168             break;
2169         }
2170 
2171         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2172             bool a_less = a_exp < b_exp;
2173             if (a_exp == b_exp) {
2174                 a_less = a.frac < b.frac;
2175             }
2176             return a_less ^ ismin ? b : a;
2177         }
2178 
2179         if (a.sign == b.sign) {
2180             bool a_less = a_exp < b_exp;
2181             if (a_exp == b_exp) {
2182                 a_less = a.frac < b.frac;
2183             }
2184             return a.sign ^ a_less ^ ismin ? b : a;
2185         } else {
2186             return a.sign ^ ismin ? b : a;
2187         }
2188     }
2189 }
2190 
2191 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
2192 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
2193                                      float_status *s)                   \
2194 {                                                                       \
2195     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2196     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2197     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
2198                                                                         \
2199     return float ## sz ## _round_pack_canonical(pr, s);                 \
2200 }
2201 
2202 MINMAX(16, min, true, false, false)
2203 MINMAX(16, minnum, true, true, false)
2204 MINMAX(16, minnummag, true, true, true)
2205 MINMAX(16, max, false, false, false)
2206 MINMAX(16, maxnum, false, true, false)
2207 MINMAX(16, maxnummag, false, true, true)
2208 
2209 MINMAX(32, min, true, false, false)
2210 MINMAX(32, minnum, true, true, false)
2211 MINMAX(32, minnummag, true, true, true)
2212 MINMAX(32, max, false, false, false)
2213 MINMAX(32, maxnum, false, true, false)
2214 MINMAX(32, maxnummag, false, true, true)
2215 
2216 MINMAX(64, min, true, false, false)
2217 MINMAX(64, minnum, true, true, false)
2218 MINMAX(64, minnummag, true, true, true)
2219 MINMAX(64, max, false, false, false)
2220 MINMAX(64, maxnum, false, true, false)
2221 MINMAX(64, maxnummag, false, true, true)
2222 
2223 #undef MINMAX
2224 
2225 /* Floating point compare */
2226 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2227                           float_status *s)
2228 {
2229     if (is_nan(a.cls) || is_nan(b.cls)) {
2230         if (!is_quiet ||
2231             a.cls == float_class_snan ||
2232             b.cls == float_class_snan) {
2233             s->float_exception_flags |= float_flag_invalid;
2234         }
2235         return float_relation_unordered;
2236     }
2237 
2238     if (a.cls == float_class_zero) {
2239         if (b.cls == float_class_zero) {
2240             return float_relation_equal;
2241         }
2242         return b.sign ? float_relation_greater : float_relation_less;
2243     } else if (b.cls == float_class_zero) {
2244         return a.sign ? float_relation_less : float_relation_greater;
2245     }
2246 
2247     /* The only really important thing about infinity is its sign. If
2248      * both are infinities the sign marks the smallest of the two.
2249      */
2250     if (a.cls == float_class_inf) {
2251         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2252             return float_relation_equal;
2253         }
2254         return a.sign ? float_relation_less : float_relation_greater;
2255     } else if (b.cls == float_class_inf) {
2256         return b.sign ? float_relation_greater : float_relation_less;
2257     }
2258 
2259     if (a.sign != b.sign) {
2260         return a.sign ? float_relation_less : float_relation_greater;
2261     }
2262 
2263     if (a.exp == b.exp) {
2264         if (a.frac == b.frac) {
2265             return float_relation_equal;
2266         }
2267         if (a.sign) {
2268             return a.frac > b.frac ?
2269                 float_relation_less : float_relation_greater;
2270         } else {
2271             return a.frac > b.frac ?
2272                 float_relation_greater : float_relation_less;
2273         }
2274     } else {
2275         if (a.sign) {
2276             return a.exp > b.exp ? float_relation_less : float_relation_greater;
2277         } else {
2278             return a.exp > b.exp ? float_relation_greater : float_relation_less;
2279         }
2280     }
2281 }
2282 
2283 #define COMPARE(sz)                                                     \
2284 int float ## sz ## _compare(float ## sz a, float ## sz b,               \
2285                             float_status *s)                            \
2286 {                                                                       \
2287     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2288     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2289     return compare_floats(pa, pb, false, s);                            \
2290 }                                                                       \
2291 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b,         \
2292                                   float_status *s)                      \
2293 {                                                                       \
2294     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2295     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2296     return compare_floats(pa, pb, true, s);                             \
2297 }
2298 
2299 COMPARE(16)
2300 COMPARE(32)
2301 COMPARE(64)
2302 
2303 #undef COMPARE
2304 
2305 /* Multiply A by 2 raised to the power N.  */
2306 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
2307 {
2308     if (unlikely(is_nan(a.cls))) {
2309         return return_nan(a, s);
2310     }
2311     if (a.cls == float_class_normal) {
2312         /* The largest float type (even though not supported by FloatParts)
2313          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
2314          * still allows rounding to infinity, without allowing overflow
2315          * within the int32_t that backs FloatParts.exp.
2316          */
2317         n = MIN(MAX(n, -0x10000), 0x10000);
2318         a.exp += n;
2319     }
2320     return a;
2321 }
2322 
2323 float16 float16_scalbn(float16 a, int n, float_status *status)
2324 {
2325     FloatParts pa = float16_unpack_canonical(a, status);
2326     FloatParts pr = scalbn_decomposed(pa, n, status);
2327     return float16_round_pack_canonical(pr, status);
2328 }
2329 
2330 float32 float32_scalbn(float32 a, int n, float_status *status)
2331 {
2332     FloatParts pa = float32_unpack_canonical(a, status);
2333     FloatParts pr = scalbn_decomposed(pa, n, status);
2334     return float32_round_pack_canonical(pr, status);
2335 }
2336 
2337 float64 float64_scalbn(float64 a, int n, float_status *status)
2338 {
2339     FloatParts pa = float64_unpack_canonical(a, status);
2340     FloatParts pr = scalbn_decomposed(pa, n, status);
2341     return float64_round_pack_canonical(pr, status);
2342 }
2343 
2344 /*
2345  * Square Root
2346  *
2347  * The old softfloat code did an approximation step before zeroing in
2348  * on the final result. However for simpleness we just compute the
2349  * square root by iterating down from the implicit bit to enough extra
2350  * bits to ensure we get a correctly rounded result.
2351  *
2352  * This does mean however the calculation is slower than before,
2353  * especially for 64 bit floats.
2354  */
2355 
2356 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
2357 {
2358     uint64_t a_frac, r_frac, s_frac;
2359     int bit, last_bit;
2360 
2361     if (is_nan(a.cls)) {
2362         return return_nan(a, s);
2363     }
2364     if (a.cls == float_class_zero) {
2365         return a;  /* sqrt(+-0) = +-0 */
2366     }
2367     if (a.sign) {
2368         s->float_exception_flags |= float_flag_invalid;
2369         return parts_default_nan(s);
2370     }
2371     if (a.cls == float_class_inf) {
2372         return a;  /* sqrt(+inf) = +inf */
2373     }
2374 
2375     assert(a.cls == float_class_normal);
2376 
2377     /* We need two overflow bits at the top. Adding room for that is a
2378      * right shift. If the exponent is odd, we can discard the low bit
2379      * by multiplying the fraction by 2; that's a left shift. Combine
2380      * those and we shift right if the exponent is even.
2381      */
2382     a_frac = a.frac;
2383     if (!(a.exp & 1)) {
2384         a_frac >>= 1;
2385     }
2386     a.exp >>= 1;
2387 
2388     /* Bit-by-bit computation of sqrt.  */
2389     r_frac = 0;
2390     s_frac = 0;
2391 
2392     /* Iterate from implicit bit down to the 3 extra bits to compute a
2393      * properly rounded result. Remember we've inserted one more bit
2394      * at the top, so these positions are one less.
2395      */
2396     bit = DECOMPOSED_BINARY_POINT - 1;
2397     last_bit = MAX(p->frac_shift - 4, 0);
2398     do {
2399         uint64_t q = 1ULL << bit;
2400         uint64_t t_frac = s_frac + q;
2401         if (t_frac <= a_frac) {
2402             s_frac = t_frac + q;
2403             a_frac -= t_frac;
2404             r_frac += q;
2405         }
2406         a_frac <<= 1;
2407     } while (--bit >= last_bit);
2408 
2409     /* Undo the right shift done above. If there is any remaining
2410      * fraction, the result is inexact. Set the sticky bit.
2411      */
2412     a.frac = (r_frac << 1) + (a_frac != 0);
2413 
2414     return a;
2415 }
2416 
2417 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
2418 {
2419     FloatParts pa = float16_unpack_canonical(a, status);
2420     FloatParts pr = sqrt_float(pa, status, &float16_params);
2421     return float16_round_pack_canonical(pr, status);
2422 }
2423 
2424 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
2425 {
2426     FloatParts pa = float32_unpack_canonical(a, status);
2427     FloatParts pr = sqrt_float(pa, status, &float32_params);
2428     return float32_round_pack_canonical(pr, status);
2429 }
2430 
2431 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
2432 {
2433     FloatParts pa = float64_unpack_canonical(a, status);
2434     FloatParts pr = sqrt_float(pa, status, &float64_params);
2435     return float64_round_pack_canonical(pr, status);
2436 }
2437 
2438 /*----------------------------------------------------------------------------
2439 | The pattern for a default generated NaN.
2440 *----------------------------------------------------------------------------*/
2441 
2442 float16 float16_default_nan(float_status *status)
2443 {
2444     FloatParts p = parts_default_nan(status);
2445     p.frac >>= float16_params.frac_shift;
2446     return float16_pack_raw(p);
2447 }
2448 
2449 float32 float32_default_nan(float_status *status)
2450 {
2451     FloatParts p = parts_default_nan(status);
2452     p.frac >>= float32_params.frac_shift;
2453     return float32_pack_raw(p);
2454 }
2455 
2456 float64 float64_default_nan(float_status *status)
2457 {
2458     FloatParts p = parts_default_nan(status);
2459     p.frac >>= float64_params.frac_shift;
2460     return float64_pack_raw(p);
2461 }
2462 
2463 float128 float128_default_nan(float_status *status)
2464 {
2465     FloatParts p = parts_default_nan(status);
2466     float128 r;
2467 
2468     /* Extrapolate from the choices made by parts_default_nan to fill
2469      * in the quad-floating format.  If the low bit is set, assume we
2470      * want to set all non-snan bits.
2471      */
2472     r.low = -(p.frac & 1);
2473     r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
2474     r.high |= LIT64(0x7FFF000000000000);
2475     r.high |= (uint64_t)p.sign << 63;
2476 
2477     return r;
2478 }
2479 
2480 /*----------------------------------------------------------------------------
2481 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
2482 *----------------------------------------------------------------------------*/
2483 
2484 float16 float16_silence_nan(float16 a, float_status *status)
2485 {
2486     FloatParts p = float16_unpack_raw(a);
2487     p.frac <<= float16_params.frac_shift;
2488     p = parts_silence_nan(p, status);
2489     p.frac >>= float16_params.frac_shift;
2490     return float16_pack_raw(p);
2491 }
2492 
2493 float32 float32_silence_nan(float32 a, float_status *status)
2494 {
2495     FloatParts p = float32_unpack_raw(a);
2496     p.frac <<= float32_params.frac_shift;
2497     p = parts_silence_nan(p, status);
2498     p.frac >>= float32_params.frac_shift;
2499     return float32_pack_raw(p);
2500 }
2501 
2502 float64 float64_silence_nan(float64 a, float_status *status)
2503 {
2504     FloatParts p = float64_unpack_raw(a);
2505     p.frac <<= float64_params.frac_shift;
2506     p = parts_silence_nan(p, status);
2507     p.frac >>= float64_params.frac_shift;
2508     return float64_pack_raw(p);
2509 }
2510 
2511 /*----------------------------------------------------------------------------
2512 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2513 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2514 | input.  If `zSign' is 1, the input is negated before being converted to an
2515 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
2516 | is simply rounded to an integer, with the inexact exception raised if the
2517 | input cannot be represented exactly as an integer.  However, if the fixed-
2518 | point input is too large, the invalid exception is raised and the largest
2519 | positive or negative integer is returned.
2520 *----------------------------------------------------------------------------*/
2521 
2522 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2523 {
2524     int8_t roundingMode;
2525     flag roundNearestEven;
2526     int8_t roundIncrement, roundBits;
2527     int32_t z;
2528 
2529     roundingMode = status->float_rounding_mode;
2530     roundNearestEven = ( roundingMode == float_round_nearest_even );
2531     switch (roundingMode) {
2532     case float_round_nearest_even:
2533     case float_round_ties_away:
2534         roundIncrement = 0x40;
2535         break;
2536     case float_round_to_zero:
2537         roundIncrement = 0;
2538         break;
2539     case float_round_up:
2540         roundIncrement = zSign ? 0 : 0x7f;
2541         break;
2542     case float_round_down:
2543         roundIncrement = zSign ? 0x7f : 0;
2544         break;
2545     default:
2546         abort();
2547     }
2548     roundBits = absZ & 0x7F;
2549     absZ = ( absZ + roundIncrement )>>7;
2550     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2551     z = absZ;
2552     if ( zSign ) z = - z;
2553     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2554         float_raise(float_flag_invalid, status);
2555         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2556     }
2557     if (roundBits) {
2558         status->float_exception_flags |= float_flag_inexact;
2559     }
2560     return z;
2561 
2562 }
2563 
2564 /*----------------------------------------------------------------------------
2565 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2566 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2567 | and returns the properly rounded 64-bit integer corresponding to the input.
2568 | If `zSign' is 1, the input is negated before being converted to an integer.
2569 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2570 | the inexact exception raised if the input cannot be represented exactly as
2571 | an integer.  However, if the fixed-point input is too large, the invalid
2572 | exception is raised and the largest positive or negative integer is
2573 | returned.
2574 *----------------------------------------------------------------------------*/
2575 
2576 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2577                                float_status *status)
2578 {
2579     int8_t roundingMode;
2580     flag roundNearestEven, increment;
2581     int64_t z;
2582 
2583     roundingMode = status->float_rounding_mode;
2584     roundNearestEven = ( roundingMode == float_round_nearest_even );
2585     switch (roundingMode) {
2586     case float_round_nearest_even:
2587     case float_round_ties_away:
2588         increment = ((int64_t) absZ1 < 0);
2589         break;
2590     case float_round_to_zero:
2591         increment = 0;
2592         break;
2593     case float_round_up:
2594         increment = !zSign && absZ1;
2595         break;
2596     case float_round_down:
2597         increment = zSign && absZ1;
2598         break;
2599     default:
2600         abort();
2601     }
2602     if ( increment ) {
2603         ++absZ0;
2604         if ( absZ0 == 0 ) goto overflow;
2605         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2606     }
2607     z = absZ0;
2608     if ( zSign ) z = - z;
2609     if ( z && ( ( z < 0 ) ^ zSign ) ) {
2610  overflow:
2611         float_raise(float_flag_invalid, status);
2612         return
2613               zSign ? (int64_t) LIT64( 0x8000000000000000 )
2614             : LIT64( 0x7FFFFFFFFFFFFFFF );
2615     }
2616     if (absZ1) {
2617         status->float_exception_flags |= float_flag_inexact;
2618     }
2619     return z;
2620 
2621 }
2622 
2623 /*----------------------------------------------------------------------------
2624 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2625 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2626 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2627 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
2628 | with the inexact exception raised if the input cannot be represented exactly
2629 | as an integer.  However, if the fixed-point input is too large, the invalid
2630 | exception is raised and the largest unsigned integer is returned.
2631 *----------------------------------------------------------------------------*/
2632 
2633 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2634                                 uint64_t absZ1, float_status *status)
2635 {
2636     int8_t roundingMode;
2637     flag roundNearestEven, increment;
2638 
2639     roundingMode = status->float_rounding_mode;
2640     roundNearestEven = (roundingMode == float_round_nearest_even);
2641     switch (roundingMode) {
2642     case float_round_nearest_even:
2643     case float_round_ties_away:
2644         increment = ((int64_t)absZ1 < 0);
2645         break;
2646     case float_round_to_zero:
2647         increment = 0;
2648         break;
2649     case float_round_up:
2650         increment = !zSign && absZ1;
2651         break;
2652     case float_round_down:
2653         increment = zSign && absZ1;
2654         break;
2655     default:
2656         abort();
2657     }
2658     if (increment) {
2659         ++absZ0;
2660         if (absZ0 == 0) {
2661             float_raise(float_flag_invalid, status);
2662             return LIT64(0xFFFFFFFFFFFFFFFF);
2663         }
2664         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2665     }
2666 
2667     if (zSign && absZ0) {
2668         float_raise(float_flag_invalid, status);
2669         return 0;
2670     }
2671 
2672     if (absZ1) {
2673         status->float_exception_flags |= float_flag_inexact;
2674     }
2675     return absZ0;
2676 }
2677 
2678 /*----------------------------------------------------------------------------
2679 | If `a' is denormal and we are in flush-to-zero mode then set the
2680 | input-denormal exception and return zero. Otherwise just return the value.
2681 *----------------------------------------------------------------------------*/
2682 float32 float32_squash_input_denormal(float32 a, float_status *status)
2683 {
2684     if (status->flush_inputs_to_zero) {
2685         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2686             float_raise(float_flag_input_denormal, status);
2687             return make_float32(float32_val(a) & 0x80000000);
2688         }
2689     }
2690     return a;
2691 }
2692 
2693 /*----------------------------------------------------------------------------
2694 | Normalizes the subnormal single-precision floating-point value represented
2695 | by the denormalized significand `aSig'.  The normalized exponent and
2696 | significand are stored at the locations pointed to by `zExpPtr' and
2697 | `zSigPtr', respectively.
2698 *----------------------------------------------------------------------------*/
2699 
2700 static void
2701  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2702 {
2703     int8_t shiftCount;
2704 
2705     shiftCount = clz32(aSig) - 8;
2706     *zSigPtr = aSig<<shiftCount;
2707     *zExpPtr = 1 - shiftCount;
2708 
2709 }
2710 
2711 /*----------------------------------------------------------------------------
2712 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2713 | and significand `zSig', and returns the proper single-precision floating-
2714 | point value corresponding to the abstract input.  Ordinarily, the abstract
2715 | value is simply rounded and packed into the single-precision format, with
2716 | the inexact exception raised if the abstract input cannot be represented
2717 | exactly.  However, if the abstract value is too large, the overflow and
2718 | inexact exceptions are raised and an infinity or maximal finite value is
2719 | returned.  If the abstract value is too small, the input value is rounded to
2720 | a subnormal number, and the underflow and inexact exceptions are raised if
2721 | the abstract input cannot be represented exactly as a subnormal single-
2722 | precision floating-point number.
2723 |     The input significand `zSig' has its binary point between bits 30
2724 | and 29, which is 7 bits to the left of the usual location.  This shifted
2725 | significand must be normalized or smaller.  If `zSig' is not normalized,
2726 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2727 | and it must not require rounding.  In the usual case that `zSig' is
2728 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2729 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2730 | Binary Floating-Point Arithmetic.
2731 *----------------------------------------------------------------------------*/
2732 
2733 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2734                                    float_status *status)
2735 {
2736     int8_t roundingMode;
2737     flag roundNearestEven;
2738     int8_t roundIncrement, roundBits;
2739     flag isTiny;
2740 
2741     roundingMode = status->float_rounding_mode;
2742     roundNearestEven = ( roundingMode == float_round_nearest_even );
2743     switch (roundingMode) {
2744     case float_round_nearest_even:
2745     case float_round_ties_away:
2746         roundIncrement = 0x40;
2747         break;
2748     case float_round_to_zero:
2749         roundIncrement = 0;
2750         break;
2751     case float_round_up:
2752         roundIncrement = zSign ? 0 : 0x7f;
2753         break;
2754     case float_round_down:
2755         roundIncrement = zSign ? 0x7f : 0;
2756         break;
2757     default:
2758         abort();
2759         break;
2760     }
2761     roundBits = zSig & 0x7F;
2762     if ( 0xFD <= (uint16_t) zExp ) {
2763         if (    ( 0xFD < zExp )
2764              || (    ( zExp == 0xFD )
2765                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2766            ) {
2767             float_raise(float_flag_overflow | float_flag_inexact, status);
2768             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2769         }
2770         if ( zExp < 0 ) {
2771             if (status->flush_to_zero) {
2772                 float_raise(float_flag_output_denormal, status);
2773                 return packFloat32(zSign, 0, 0);
2774             }
2775             isTiny =
2776                 (status->float_detect_tininess
2777                  == float_tininess_before_rounding)
2778                 || ( zExp < -1 )
2779                 || ( zSig + roundIncrement < 0x80000000 );
2780             shift32RightJamming( zSig, - zExp, &zSig );
2781             zExp = 0;
2782             roundBits = zSig & 0x7F;
2783             if (isTiny && roundBits) {
2784                 float_raise(float_flag_underflow, status);
2785             }
2786         }
2787     }
2788     if (roundBits) {
2789         status->float_exception_flags |= float_flag_inexact;
2790     }
2791     zSig = ( zSig + roundIncrement )>>7;
2792     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2793     if ( zSig == 0 ) zExp = 0;
2794     return packFloat32( zSign, zExp, zSig );
2795 
2796 }
2797 
2798 /*----------------------------------------------------------------------------
2799 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2800 | and significand `zSig', and returns the proper single-precision floating-
2801 | point value corresponding to the abstract input.  This routine is just like
2802 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2803 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2804 | floating-point exponent.
2805 *----------------------------------------------------------------------------*/
2806 
2807 static float32
2808  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2809                               float_status *status)
2810 {
2811     int8_t shiftCount;
2812 
2813     shiftCount = clz32(zSig) - 1;
2814     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2815                                status);
2816 
2817 }
2818 
2819 /*----------------------------------------------------------------------------
2820 | If `a' is denormal and we are in flush-to-zero mode then set the
2821 | input-denormal exception and return zero. Otherwise just return the value.
2822 *----------------------------------------------------------------------------*/
2823 float64 float64_squash_input_denormal(float64 a, float_status *status)
2824 {
2825     if (status->flush_inputs_to_zero) {
2826         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2827             float_raise(float_flag_input_denormal, status);
2828             return make_float64(float64_val(a) & (1ULL << 63));
2829         }
2830     }
2831     return a;
2832 }
2833 
2834 /*----------------------------------------------------------------------------
2835 | Normalizes the subnormal double-precision floating-point value represented
2836 | by the denormalized significand `aSig'.  The normalized exponent and
2837 | significand are stored at the locations pointed to by `zExpPtr' and
2838 | `zSigPtr', respectively.
2839 *----------------------------------------------------------------------------*/
2840 
2841 static void
2842  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2843 {
2844     int8_t shiftCount;
2845 
2846     shiftCount = clz64(aSig) - 11;
2847     *zSigPtr = aSig<<shiftCount;
2848     *zExpPtr = 1 - shiftCount;
2849 
2850 }
2851 
2852 /*----------------------------------------------------------------------------
2853 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2854 | double-precision floating-point value, returning the result.  After being
2855 | shifted into the proper positions, the three fields are simply added
2856 | together to form the result.  This means that any integer portion of `zSig'
2857 | will be added into the exponent.  Since a properly normalized significand
2858 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2859 | than the desired result exponent whenever `zSig' is a complete, normalized
2860 | significand.
2861 *----------------------------------------------------------------------------*/
2862 
2863 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2864 {
2865 
2866     return make_float64(
2867         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2868 
2869 }
2870 
2871 /*----------------------------------------------------------------------------
2872 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2873 | and significand `zSig', and returns the proper double-precision floating-
2874 | point value corresponding to the abstract input.  Ordinarily, the abstract
2875 | value is simply rounded and packed into the double-precision format, with
2876 | the inexact exception raised if the abstract input cannot be represented
2877 | exactly.  However, if the abstract value is too large, the overflow and
2878 | inexact exceptions are raised and an infinity or maximal finite value is
2879 | returned.  If the abstract value is too small, the input value is rounded to
2880 | a subnormal number, and the underflow and inexact exceptions are raised if
2881 | the abstract input cannot be represented exactly as a subnormal double-
2882 | precision floating-point number.
2883 |     The input significand `zSig' has its binary point between bits 62
2884 | and 61, which is 10 bits to the left of the usual location.  This shifted
2885 | significand must be normalized or smaller.  If `zSig' is not normalized,
2886 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2887 | and it must not require rounding.  In the usual case that `zSig' is
2888 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2889 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2890 | Binary Floating-Point Arithmetic.
2891 *----------------------------------------------------------------------------*/
2892 
2893 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2894                                    float_status *status)
2895 {
2896     int8_t roundingMode;
2897     flag roundNearestEven;
2898     int roundIncrement, roundBits;
2899     flag isTiny;
2900 
2901     roundingMode = status->float_rounding_mode;
2902     roundNearestEven = ( roundingMode == float_round_nearest_even );
2903     switch (roundingMode) {
2904     case float_round_nearest_even:
2905     case float_round_ties_away:
2906         roundIncrement = 0x200;
2907         break;
2908     case float_round_to_zero:
2909         roundIncrement = 0;
2910         break;
2911     case float_round_up:
2912         roundIncrement = zSign ? 0 : 0x3ff;
2913         break;
2914     case float_round_down:
2915         roundIncrement = zSign ? 0x3ff : 0;
2916         break;
2917     case float_round_to_odd:
2918         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2919         break;
2920     default:
2921         abort();
2922     }
2923     roundBits = zSig & 0x3FF;
2924     if ( 0x7FD <= (uint16_t) zExp ) {
2925         if (    ( 0x7FD < zExp )
2926              || (    ( zExp == 0x7FD )
2927                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2928            ) {
2929             bool overflow_to_inf = roundingMode != float_round_to_odd &&
2930                                    roundIncrement != 0;
2931             float_raise(float_flag_overflow | float_flag_inexact, status);
2932             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2933         }
2934         if ( zExp < 0 ) {
2935             if (status->flush_to_zero) {
2936                 float_raise(float_flag_output_denormal, status);
2937                 return packFloat64(zSign, 0, 0);
2938             }
2939             isTiny =
2940                    (status->float_detect_tininess
2941                     == float_tininess_before_rounding)
2942                 || ( zExp < -1 )
2943                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2944             shift64RightJamming( zSig, - zExp, &zSig );
2945             zExp = 0;
2946             roundBits = zSig & 0x3FF;
2947             if (isTiny && roundBits) {
2948                 float_raise(float_flag_underflow, status);
2949             }
2950             if (roundingMode == float_round_to_odd) {
2951                 /*
2952                  * For round-to-odd case, the roundIncrement depends on
2953                  * zSig which just changed.
2954                  */
2955                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2956             }
2957         }
2958     }
2959     if (roundBits) {
2960         status->float_exception_flags |= float_flag_inexact;
2961     }
2962     zSig = ( zSig + roundIncrement )>>10;
2963     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2964     if ( zSig == 0 ) zExp = 0;
2965     return packFloat64( zSign, zExp, zSig );
2966 
2967 }
2968 
2969 /*----------------------------------------------------------------------------
2970 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2971 | and significand `zSig', and returns the proper double-precision floating-
2972 | point value corresponding to the abstract input.  This routine is just like
2973 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2974 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2975 | floating-point exponent.
2976 *----------------------------------------------------------------------------*/
2977 
2978 static float64
2979  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2980                               float_status *status)
2981 {
2982     int8_t shiftCount;
2983 
2984     shiftCount = clz64(zSig) - 1;
2985     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2986                                status);
2987 
2988 }
2989 
2990 /*----------------------------------------------------------------------------
2991 | Normalizes the subnormal extended double-precision floating-point value
2992 | represented by the denormalized significand `aSig'.  The normalized exponent
2993 | and significand are stored at the locations pointed to by `zExpPtr' and
2994 | `zSigPtr', respectively.
2995 *----------------------------------------------------------------------------*/
2996 
2997 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
2998                                 uint64_t *zSigPtr)
2999 {
3000     int8_t shiftCount;
3001 
3002     shiftCount = clz64(aSig);
3003     *zSigPtr = aSig<<shiftCount;
3004     *zExpPtr = 1 - shiftCount;
3005 }
3006 
3007 /*----------------------------------------------------------------------------
3008 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3009 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3010 | and returns the proper extended double-precision floating-point value
3011 | corresponding to the abstract input.  Ordinarily, the abstract value is
3012 | rounded and packed into the extended double-precision format, with the
3013 | inexact exception raised if the abstract input cannot be represented
3014 | exactly.  However, if the abstract value is too large, the overflow and
3015 | inexact exceptions are raised and an infinity or maximal finite value is
3016 | returned.  If the abstract value is too small, the input value is rounded to
3017 | a subnormal number, and the underflow and inexact exceptions are raised if
3018 | the abstract input cannot be represented exactly as a subnormal extended
3019 | double-precision floating-point number.
3020 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
3021 | number of bits as single or double precision, respectively.  Otherwise, the
3022 | result is rounded to the full precision of the extended double-precision
3023 | format.
3024 |     The input significand must be normalized or smaller.  If the input
3025 | significand is not normalized, `zExp' must be 0; in that case, the result
3026 | returned is a subnormal number, and it must not require rounding.  The
3027 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3028 | Floating-Point Arithmetic.
3029 *----------------------------------------------------------------------------*/
3030 
3031 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
3032                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
3033                               float_status *status)
3034 {
3035     int8_t roundingMode;
3036     flag roundNearestEven, increment, isTiny;
3037     int64_t roundIncrement, roundMask, roundBits;
3038 
3039     roundingMode = status->float_rounding_mode;
3040     roundNearestEven = ( roundingMode == float_round_nearest_even );
3041     if ( roundingPrecision == 80 ) goto precision80;
3042     if ( roundingPrecision == 64 ) {
3043         roundIncrement = LIT64( 0x0000000000000400 );
3044         roundMask = LIT64( 0x00000000000007FF );
3045     }
3046     else if ( roundingPrecision == 32 ) {
3047         roundIncrement = LIT64( 0x0000008000000000 );
3048         roundMask = LIT64( 0x000000FFFFFFFFFF );
3049     }
3050     else {
3051         goto precision80;
3052     }
3053     zSig0 |= ( zSig1 != 0 );
3054     switch (roundingMode) {
3055     case float_round_nearest_even:
3056     case float_round_ties_away:
3057         break;
3058     case float_round_to_zero:
3059         roundIncrement = 0;
3060         break;
3061     case float_round_up:
3062         roundIncrement = zSign ? 0 : roundMask;
3063         break;
3064     case float_round_down:
3065         roundIncrement = zSign ? roundMask : 0;
3066         break;
3067     default:
3068         abort();
3069     }
3070     roundBits = zSig0 & roundMask;
3071     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3072         if (    ( 0x7FFE < zExp )
3073              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
3074            ) {
3075             goto overflow;
3076         }
3077         if ( zExp <= 0 ) {
3078             if (status->flush_to_zero) {
3079                 float_raise(float_flag_output_denormal, status);
3080                 return packFloatx80(zSign, 0, 0);
3081             }
3082             isTiny =
3083                    (status->float_detect_tininess
3084                     == float_tininess_before_rounding)
3085                 || ( zExp < 0 )
3086                 || ( zSig0 <= zSig0 + roundIncrement );
3087             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
3088             zExp = 0;
3089             roundBits = zSig0 & roundMask;
3090             if (isTiny && roundBits) {
3091                 float_raise(float_flag_underflow, status);
3092             }
3093             if (roundBits) {
3094                 status->float_exception_flags |= float_flag_inexact;
3095             }
3096             zSig0 += roundIncrement;
3097             if ( (int64_t) zSig0 < 0 ) zExp = 1;
3098             roundIncrement = roundMask + 1;
3099             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3100                 roundMask |= roundIncrement;
3101             }
3102             zSig0 &= ~ roundMask;
3103             return packFloatx80( zSign, zExp, zSig0 );
3104         }
3105     }
3106     if (roundBits) {
3107         status->float_exception_flags |= float_flag_inexact;
3108     }
3109     zSig0 += roundIncrement;
3110     if ( zSig0 < roundIncrement ) {
3111         ++zExp;
3112         zSig0 = LIT64( 0x8000000000000000 );
3113     }
3114     roundIncrement = roundMask + 1;
3115     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3116         roundMask |= roundIncrement;
3117     }
3118     zSig0 &= ~ roundMask;
3119     if ( zSig0 == 0 ) zExp = 0;
3120     return packFloatx80( zSign, zExp, zSig0 );
3121  precision80:
3122     switch (roundingMode) {
3123     case float_round_nearest_even:
3124     case float_round_ties_away:
3125         increment = ((int64_t)zSig1 < 0);
3126         break;
3127     case float_round_to_zero:
3128         increment = 0;
3129         break;
3130     case float_round_up:
3131         increment = !zSign && zSig1;
3132         break;
3133     case float_round_down:
3134         increment = zSign && zSig1;
3135         break;
3136     default:
3137         abort();
3138     }
3139     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3140         if (    ( 0x7FFE < zExp )
3141              || (    ( zExp == 0x7FFE )
3142                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
3143                   && increment
3144                 )
3145            ) {
3146             roundMask = 0;
3147  overflow:
3148             float_raise(float_flag_overflow | float_flag_inexact, status);
3149             if (    ( roundingMode == float_round_to_zero )
3150                  || ( zSign && ( roundingMode == float_round_up ) )
3151                  || ( ! zSign && ( roundingMode == float_round_down ) )
3152                ) {
3153                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
3154             }
3155             return packFloatx80(zSign,
3156                                 floatx80_infinity_high,
3157                                 floatx80_infinity_low);
3158         }
3159         if ( zExp <= 0 ) {
3160             isTiny =
3161                    (status->float_detect_tininess
3162                     == float_tininess_before_rounding)
3163                 || ( zExp < 0 )
3164                 || ! increment
3165                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
3166             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
3167             zExp = 0;
3168             if (isTiny && zSig1) {
3169                 float_raise(float_flag_underflow, status);
3170             }
3171             if (zSig1) {
3172                 status->float_exception_flags |= float_flag_inexact;
3173             }
3174             switch (roundingMode) {
3175             case float_round_nearest_even:
3176             case float_round_ties_away:
3177                 increment = ((int64_t)zSig1 < 0);
3178                 break;
3179             case float_round_to_zero:
3180                 increment = 0;
3181                 break;
3182             case float_round_up:
3183                 increment = !zSign && zSig1;
3184                 break;
3185             case float_round_down:
3186                 increment = zSign && zSig1;
3187                 break;
3188             default:
3189                 abort();
3190             }
3191             if ( increment ) {
3192                 ++zSig0;
3193                 zSig0 &=
3194                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3195                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
3196             }
3197             return packFloatx80( zSign, zExp, zSig0 );
3198         }
3199     }
3200     if (zSig1) {
3201         status->float_exception_flags |= float_flag_inexact;
3202     }
3203     if ( increment ) {
3204         ++zSig0;
3205         if ( zSig0 == 0 ) {
3206             ++zExp;
3207             zSig0 = LIT64( 0x8000000000000000 );
3208         }
3209         else {
3210             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3211         }
3212     }
3213     else {
3214         if ( zSig0 == 0 ) zExp = 0;
3215     }
3216     return packFloatx80( zSign, zExp, zSig0 );
3217 
3218 }
3219 
3220 /*----------------------------------------------------------------------------
3221 | Takes an abstract floating-point value having sign `zSign', exponent
3222 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
3223 | and returns the proper extended double-precision floating-point value
3224 | corresponding to the abstract input.  This routine is just like
3225 | `roundAndPackFloatx80' except that the input significand does not have to be
3226 | normalized.
3227 *----------------------------------------------------------------------------*/
3228 
3229 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
3230                                        flag zSign, int32_t zExp,
3231                                        uint64_t zSig0, uint64_t zSig1,
3232                                        float_status *status)
3233 {
3234     int8_t shiftCount;
3235 
3236     if ( zSig0 == 0 ) {
3237         zSig0 = zSig1;
3238         zSig1 = 0;
3239         zExp -= 64;
3240     }
3241     shiftCount = clz64(zSig0);
3242     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3243     zExp -= shiftCount;
3244     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
3245                                 zSig0, zSig1, status);
3246 
3247 }
3248 
3249 /*----------------------------------------------------------------------------
3250 | Returns the least-significant 64 fraction bits of the quadruple-precision
3251 | floating-point value `a'.
3252 *----------------------------------------------------------------------------*/
3253 
3254 static inline uint64_t extractFloat128Frac1( float128 a )
3255 {
3256 
3257     return a.low;
3258 
3259 }
3260 
3261 /*----------------------------------------------------------------------------
3262 | Returns the most-significant 48 fraction bits of the quadruple-precision
3263 | floating-point value `a'.
3264 *----------------------------------------------------------------------------*/
3265 
3266 static inline uint64_t extractFloat128Frac0( float128 a )
3267 {
3268 
3269     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
3270 
3271 }
3272 
3273 /*----------------------------------------------------------------------------
3274 | Returns the exponent bits of the quadruple-precision floating-point value
3275 | `a'.
3276 *----------------------------------------------------------------------------*/
3277 
3278 static inline int32_t extractFloat128Exp( float128 a )
3279 {
3280 
3281     return ( a.high>>48 ) & 0x7FFF;
3282 
3283 }
3284 
3285 /*----------------------------------------------------------------------------
3286 | Returns the sign bit of the quadruple-precision floating-point value `a'.
3287 *----------------------------------------------------------------------------*/
3288 
3289 static inline flag extractFloat128Sign( float128 a )
3290 {
3291 
3292     return a.high>>63;
3293 
3294 }
3295 
3296 /*----------------------------------------------------------------------------
3297 | Normalizes the subnormal quadruple-precision floating-point value
3298 | represented by the denormalized significand formed by the concatenation of
3299 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
3300 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
3301 | significand are stored at the location pointed to by `zSig0Ptr', and the
3302 | least significant 64 bits of the normalized significand are stored at the
3303 | location pointed to by `zSig1Ptr'.
3304 *----------------------------------------------------------------------------*/
3305 
3306 static void
3307  normalizeFloat128Subnormal(
3308      uint64_t aSig0,
3309      uint64_t aSig1,
3310      int32_t *zExpPtr,
3311      uint64_t *zSig0Ptr,
3312      uint64_t *zSig1Ptr
3313  )
3314 {
3315     int8_t shiftCount;
3316 
3317     if ( aSig0 == 0 ) {
3318         shiftCount = clz64(aSig1) - 15;
3319         if ( shiftCount < 0 ) {
3320             *zSig0Ptr = aSig1>>( - shiftCount );
3321             *zSig1Ptr = aSig1<<( shiftCount & 63 );
3322         }
3323         else {
3324             *zSig0Ptr = aSig1<<shiftCount;
3325             *zSig1Ptr = 0;
3326         }
3327         *zExpPtr = - shiftCount - 63;
3328     }
3329     else {
3330         shiftCount = clz64(aSig0) - 15;
3331         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
3332         *zExpPtr = 1 - shiftCount;
3333     }
3334 
3335 }
3336 
3337 /*----------------------------------------------------------------------------
3338 | Packs the sign `zSign', the exponent `zExp', and the significand formed
3339 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
3340 | floating-point value, returning the result.  After being shifted into the
3341 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
3342 | added together to form the most significant 32 bits of the result.  This
3343 | means that any integer portion of `zSig0' will be added into the exponent.
3344 | Since a properly normalized significand will have an integer portion equal
3345 | to 1, the `zExp' input should be 1 less than the desired result exponent
3346 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
3347 | significand.
3348 *----------------------------------------------------------------------------*/
3349 
3350 static inline float128
3351  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
3352 {
3353     float128 z;
3354 
3355     z.low = zSig1;
3356     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
3357     return z;
3358 
3359 }
3360 
3361 /*----------------------------------------------------------------------------
3362 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3363 | and extended significand formed by the concatenation of `zSig0', `zSig1',
3364 | and `zSig2', and returns the proper quadruple-precision floating-point value
3365 | corresponding to the abstract input.  Ordinarily, the abstract value is
3366 | simply rounded and packed into the quadruple-precision format, with the
3367 | inexact exception raised if the abstract input cannot be represented
3368 | exactly.  However, if the abstract value is too large, the overflow and
3369 | inexact exceptions are raised and an infinity or maximal finite value is
3370 | returned.  If the abstract value is too small, the input value is rounded to
3371 | a subnormal number, and the underflow and inexact exceptions are raised if
3372 | the abstract input cannot be represented exactly as a subnormal quadruple-
3373 | precision floating-point number.
3374 |     The input significand must be normalized or smaller.  If the input
3375 | significand is not normalized, `zExp' must be 0; in that case, the result
3376 | returned is a subnormal number, and it must not require rounding.  In the
3377 | usual case that the input significand is normalized, `zExp' must be 1 less
3378 | than the ``true'' floating-point exponent.  The handling of underflow and
3379 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3380 *----------------------------------------------------------------------------*/
3381 
3382 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
3383                                      uint64_t zSig0, uint64_t zSig1,
3384                                      uint64_t zSig2, float_status *status)
3385 {
3386     int8_t roundingMode;
3387     flag roundNearestEven, increment, isTiny;
3388 
3389     roundingMode = status->float_rounding_mode;
3390     roundNearestEven = ( roundingMode == float_round_nearest_even );
3391     switch (roundingMode) {
3392     case float_round_nearest_even:
3393     case float_round_ties_away:
3394         increment = ((int64_t)zSig2 < 0);
3395         break;
3396     case float_round_to_zero:
3397         increment = 0;
3398         break;
3399     case float_round_up:
3400         increment = !zSign && zSig2;
3401         break;
3402     case float_round_down:
3403         increment = zSign && zSig2;
3404         break;
3405     case float_round_to_odd:
3406         increment = !(zSig1 & 0x1) && zSig2;
3407         break;
3408     default:
3409         abort();
3410     }
3411     if ( 0x7FFD <= (uint32_t) zExp ) {
3412         if (    ( 0x7FFD < zExp )
3413              || (    ( zExp == 0x7FFD )
3414                   && eq128(
3415                          LIT64( 0x0001FFFFFFFFFFFF ),
3416                          LIT64( 0xFFFFFFFFFFFFFFFF ),
3417                          zSig0,
3418                          zSig1
3419                      )
3420                   && increment
3421                 )
3422            ) {
3423             float_raise(float_flag_overflow | float_flag_inexact, status);
3424             if (    ( roundingMode == float_round_to_zero )
3425                  || ( zSign && ( roundingMode == float_round_up ) )
3426                  || ( ! zSign && ( roundingMode == float_round_down ) )
3427                  || (roundingMode == float_round_to_odd)
3428                ) {
3429                 return
3430                     packFloat128(
3431                         zSign,
3432                         0x7FFE,
3433                         LIT64( 0x0000FFFFFFFFFFFF ),
3434                         LIT64( 0xFFFFFFFFFFFFFFFF )
3435                     );
3436             }
3437             return packFloat128( zSign, 0x7FFF, 0, 0 );
3438         }
3439         if ( zExp < 0 ) {
3440             if (status->flush_to_zero) {
3441                 float_raise(float_flag_output_denormal, status);
3442                 return packFloat128(zSign, 0, 0, 0);
3443             }
3444             isTiny =
3445                    (status->float_detect_tininess
3446                     == float_tininess_before_rounding)
3447                 || ( zExp < -1 )
3448                 || ! increment
3449                 || lt128(
3450                        zSig0,
3451                        zSig1,
3452                        LIT64( 0x0001FFFFFFFFFFFF ),
3453                        LIT64( 0xFFFFFFFFFFFFFFFF )
3454                    );
3455             shift128ExtraRightJamming(
3456                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
3457             zExp = 0;
3458             if (isTiny && zSig2) {
3459                 float_raise(float_flag_underflow, status);
3460             }
3461             switch (roundingMode) {
3462             case float_round_nearest_even:
3463             case float_round_ties_away:
3464                 increment = ((int64_t)zSig2 < 0);
3465                 break;
3466             case float_round_to_zero:
3467                 increment = 0;
3468                 break;
3469             case float_round_up:
3470                 increment = !zSign && zSig2;
3471                 break;
3472             case float_round_down:
3473                 increment = zSign && zSig2;
3474                 break;
3475             case float_round_to_odd:
3476                 increment = !(zSig1 & 0x1) && zSig2;
3477                 break;
3478             default:
3479                 abort();
3480             }
3481         }
3482     }
3483     if (zSig2) {
3484         status->float_exception_flags |= float_flag_inexact;
3485     }
3486     if ( increment ) {
3487         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
3488         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
3489     }
3490     else {
3491         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
3492     }
3493     return packFloat128( zSign, zExp, zSig0, zSig1 );
3494 
3495 }
3496 
3497 /*----------------------------------------------------------------------------
3498 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3499 | and significand formed by the concatenation of `zSig0' and `zSig1', and
3500 | returns the proper quadruple-precision floating-point value corresponding
3501 | to the abstract input.  This routine is just like `roundAndPackFloat128'
3502 | except that the input significand has fewer bits and does not have to be
3503 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
3504 | point exponent.
3505 *----------------------------------------------------------------------------*/
3506 
3507 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3508                                               uint64_t zSig0, uint64_t zSig1,
3509                                               float_status *status)
3510 {
3511     int8_t shiftCount;
3512     uint64_t zSig2;
3513 
3514     if ( zSig0 == 0 ) {
3515         zSig0 = zSig1;
3516         zSig1 = 0;
3517         zExp -= 64;
3518     }
3519     shiftCount = clz64(zSig0) - 15;
3520     if ( 0 <= shiftCount ) {
3521         zSig2 = 0;
3522         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3523     }
3524     else {
3525         shift128ExtraRightJamming(
3526             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3527     }
3528     zExp -= shiftCount;
3529     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3530 
3531 }
3532 
3533 
3534 /*----------------------------------------------------------------------------
3535 | Returns the result of converting the 32-bit two's complement integer `a'
3536 | to the extended double-precision floating-point format.  The conversion
3537 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3538 | Arithmetic.
3539 *----------------------------------------------------------------------------*/
3540 
3541 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3542 {
3543     flag zSign;
3544     uint32_t absA;
3545     int8_t shiftCount;
3546     uint64_t zSig;
3547 
3548     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3549     zSign = ( a < 0 );
3550     absA = zSign ? - a : a;
3551     shiftCount = clz32(absA) + 32;
3552     zSig = absA;
3553     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3554 
3555 }
3556 
3557 /*----------------------------------------------------------------------------
3558 | Returns the result of converting the 32-bit two's complement integer `a' to
3559 | the quadruple-precision floating-point format.  The conversion is performed
3560 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3561 *----------------------------------------------------------------------------*/
3562 
3563 float128 int32_to_float128(int32_t a, float_status *status)
3564 {
3565     flag zSign;
3566     uint32_t absA;
3567     int8_t shiftCount;
3568     uint64_t zSig0;
3569 
3570     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3571     zSign = ( a < 0 );
3572     absA = zSign ? - a : a;
3573     shiftCount = clz32(absA) + 17;
3574     zSig0 = absA;
3575     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3576 
3577 }
3578 
3579 /*----------------------------------------------------------------------------
3580 | Returns the result of converting the 64-bit two's complement integer `a'
3581 | to the extended double-precision floating-point format.  The conversion
3582 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3583 | Arithmetic.
3584 *----------------------------------------------------------------------------*/
3585 
3586 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3587 {
3588     flag zSign;
3589     uint64_t absA;
3590     int8_t shiftCount;
3591 
3592     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3593     zSign = ( a < 0 );
3594     absA = zSign ? - a : a;
3595     shiftCount = clz64(absA);
3596     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3597 
3598 }
3599 
3600 /*----------------------------------------------------------------------------
3601 | Returns the result of converting the 64-bit two's complement integer `a' to
3602 | the quadruple-precision floating-point format.  The conversion is performed
3603 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3604 *----------------------------------------------------------------------------*/
3605 
3606 float128 int64_to_float128(int64_t a, float_status *status)
3607 {
3608     flag zSign;
3609     uint64_t absA;
3610     int8_t shiftCount;
3611     int32_t zExp;
3612     uint64_t zSig0, zSig1;
3613 
3614     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3615     zSign = ( a < 0 );
3616     absA = zSign ? - a : a;
3617     shiftCount = clz64(absA) + 49;
3618     zExp = 0x406E - shiftCount;
3619     if ( 64 <= shiftCount ) {
3620         zSig1 = 0;
3621         zSig0 = absA;
3622         shiftCount -= 64;
3623     }
3624     else {
3625         zSig1 = absA;
3626         zSig0 = 0;
3627     }
3628     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3629     return packFloat128( zSign, zExp, zSig0, zSig1 );
3630 
3631 }
3632 
3633 /*----------------------------------------------------------------------------
3634 | Returns the result of converting the 64-bit unsigned integer `a'
3635 | to the quadruple-precision floating-point format.  The conversion is performed
3636 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3637 *----------------------------------------------------------------------------*/
3638 
3639 float128 uint64_to_float128(uint64_t a, float_status *status)
3640 {
3641     if (a == 0) {
3642         return float128_zero;
3643     }
3644     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
3645 }
3646 
3647 /*----------------------------------------------------------------------------
3648 | Returns the result of converting the single-precision floating-point value
3649 | `a' to the extended double-precision floating-point format.  The conversion
3650 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3651 | Arithmetic.
3652 *----------------------------------------------------------------------------*/
3653 
3654 floatx80 float32_to_floatx80(float32 a, float_status *status)
3655 {
3656     flag aSign;
3657     int aExp;
3658     uint32_t aSig;
3659 
3660     a = float32_squash_input_denormal(a, status);
3661     aSig = extractFloat32Frac( a );
3662     aExp = extractFloat32Exp( a );
3663     aSign = extractFloat32Sign( a );
3664     if ( aExp == 0xFF ) {
3665         if (aSig) {
3666             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3667         }
3668         return packFloatx80(aSign,
3669                             floatx80_infinity_high,
3670                             floatx80_infinity_low);
3671     }
3672     if ( aExp == 0 ) {
3673         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3674         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3675     }
3676     aSig |= 0x00800000;
3677     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3678 
3679 }
3680 
3681 /*----------------------------------------------------------------------------
3682 | Returns the result of converting the single-precision floating-point value
3683 | `a' to the double-precision floating-point format.  The conversion is
3684 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3685 | Arithmetic.
3686 *----------------------------------------------------------------------------*/
3687 
3688 float128 float32_to_float128(float32 a, float_status *status)
3689 {
3690     flag aSign;
3691     int aExp;
3692     uint32_t aSig;
3693 
3694     a = float32_squash_input_denormal(a, status);
3695     aSig = extractFloat32Frac( a );
3696     aExp = extractFloat32Exp( a );
3697     aSign = extractFloat32Sign( a );
3698     if ( aExp == 0xFF ) {
3699         if (aSig) {
3700             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3701         }
3702         return packFloat128( aSign, 0x7FFF, 0, 0 );
3703     }
3704     if ( aExp == 0 ) {
3705         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3706         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3707         --aExp;
3708     }
3709     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3710 
3711 }
3712 
3713 /*----------------------------------------------------------------------------
3714 | Returns the remainder of the single-precision floating-point value `a'
3715 | with respect to the corresponding value `b'.  The operation is performed
3716 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3717 *----------------------------------------------------------------------------*/
3718 
3719 float32 float32_rem(float32 a, float32 b, float_status *status)
3720 {
3721     flag aSign, zSign;
3722     int aExp, bExp, expDiff;
3723     uint32_t aSig, bSig;
3724     uint32_t q;
3725     uint64_t aSig64, bSig64, q64;
3726     uint32_t alternateASig;
3727     int32_t sigMean;
3728     a = float32_squash_input_denormal(a, status);
3729     b = float32_squash_input_denormal(b, status);
3730 
3731     aSig = extractFloat32Frac( a );
3732     aExp = extractFloat32Exp( a );
3733     aSign = extractFloat32Sign( a );
3734     bSig = extractFloat32Frac( b );
3735     bExp = extractFloat32Exp( b );
3736     if ( aExp == 0xFF ) {
3737         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3738             return propagateFloat32NaN(a, b, status);
3739         }
3740         float_raise(float_flag_invalid, status);
3741         return float32_default_nan(status);
3742     }
3743     if ( bExp == 0xFF ) {
3744         if (bSig) {
3745             return propagateFloat32NaN(a, b, status);
3746         }
3747         return a;
3748     }
3749     if ( bExp == 0 ) {
3750         if ( bSig == 0 ) {
3751             float_raise(float_flag_invalid, status);
3752             return float32_default_nan(status);
3753         }
3754         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3755     }
3756     if ( aExp == 0 ) {
3757         if ( aSig == 0 ) return a;
3758         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3759     }
3760     expDiff = aExp - bExp;
3761     aSig |= 0x00800000;
3762     bSig |= 0x00800000;
3763     if ( expDiff < 32 ) {
3764         aSig <<= 8;
3765         bSig <<= 8;
3766         if ( expDiff < 0 ) {
3767             if ( expDiff < -1 ) return a;
3768             aSig >>= 1;
3769         }
3770         q = ( bSig <= aSig );
3771         if ( q ) aSig -= bSig;
3772         if ( 0 < expDiff ) {
3773             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3774             q >>= 32 - expDiff;
3775             bSig >>= 2;
3776             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3777         }
3778         else {
3779             aSig >>= 2;
3780             bSig >>= 2;
3781         }
3782     }
3783     else {
3784         if ( bSig <= aSig ) aSig -= bSig;
3785         aSig64 = ( (uint64_t) aSig )<<40;
3786         bSig64 = ( (uint64_t) bSig )<<40;
3787         expDiff -= 64;
3788         while ( 0 < expDiff ) {
3789             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3790             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3791             aSig64 = - ( ( bSig * q64 )<<38 );
3792             expDiff -= 62;
3793         }
3794         expDiff += 64;
3795         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3796         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3797         q = q64>>( 64 - expDiff );
3798         bSig <<= 6;
3799         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3800     }
3801     do {
3802         alternateASig = aSig;
3803         ++q;
3804         aSig -= bSig;
3805     } while ( 0 <= (int32_t) aSig );
3806     sigMean = aSig + alternateASig;
3807     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3808         aSig = alternateASig;
3809     }
3810     zSign = ( (int32_t) aSig < 0 );
3811     if ( zSign ) aSig = - aSig;
3812     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3813 }
3814 
3815 
3816 
3817 /*----------------------------------------------------------------------------
3818 | Returns the binary exponential of the single-precision floating-point value
3819 | `a'. The operation is performed according to the IEC/IEEE Standard for
3820 | Binary Floating-Point Arithmetic.
3821 |
3822 | Uses the following identities:
3823 |
3824 | 1. -------------------------------------------------------------------------
3825 |      x    x*ln(2)
3826 |     2  = e
3827 |
3828 | 2. -------------------------------------------------------------------------
3829 |                      2     3     4     5           n
3830 |      x        x     x     x     x     x           x
3831 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3832 |               1!    2!    3!    4!    5!          n!
3833 *----------------------------------------------------------------------------*/
3834 
3835 static const float64 float32_exp2_coefficients[15] =
3836 {
3837     const_float64( 0x3ff0000000000000ll ), /*  1 */
3838     const_float64( 0x3fe0000000000000ll ), /*  2 */
3839     const_float64( 0x3fc5555555555555ll ), /*  3 */
3840     const_float64( 0x3fa5555555555555ll ), /*  4 */
3841     const_float64( 0x3f81111111111111ll ), /*  5 */
3842     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
3843     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
3844     const_float64( 0x3efa01a01a01a01all ), /*  8 */
3845     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
3846     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3847     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3848     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3849     const_float64( 0x3de6124613a86d09ll ), /* 13 */
3850     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3851     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3852 };
3853 
3854 float32 float32_exp2(float32 a, float_status *status)
3855 {
3856     flag aSign;
3857     int aExp;
3858     uint32_t aSig;
3859     float64 r, x, xn;
3860     int i;
3861     a = float32_squash_input_denormal(a, status);
3862 
3863     aSig = extractFloat32Frac( a );
3864     aExp = extractFloat32Exp( a );
3865     aSign = extractFloat32Sign( a );
3866 
3867     if ( aExp == 0xFF) {
3868         if (aSig) {
3869             return propagateFloat32NaN(a, float32_zero, status);
3870         }
3871         return (aSign) ? float32_zero : a;
3872     }
3873     if (aExp == 0) {
3874         if (aSig == 0) return float32_one;
3875     }
3876 
3877     float_raise(float_flag_inexact, status);
3878 
3879     /* ******************************* */
3880     /* using float64 for approximation */
3881     /* ******************************* */
3882     x = float32_to_float64(a, status);
3883     x = float64_mul(x, float64_ln2, status);
3884 
3885     xn = x;
3886     r = float64_one;
3887     for (i = 0 ; i < 15 ; i++) {
3888         float64 f;
3889 
3890         f = float64_mul(xn, float32_exp2_coefficients[i], status);
3891         r = float64_add(r, f, status);
3892 
3893         xn = float64_mul(xn, x, status);
3894     }
3895 
3896     return float64_to_float32(r, status);
3897 }
3898 
3899 /*----------------------------------------------------------------------------
3900 | Returns the binary log of the single-precision floating-point value `a'.
3901 | The operation is performed according to the IEC/IEEE Standard for Binary
3902 | Floating-Point Arithmetic.
3903 *----------------------------------------------------------------------------*/
3904 float32 float32_log2(float32 a, float_status *status)
3905 {
3906     flag aSign, zSign;
3907     int aExp;
3908     uint32_t aSig, zSig, i;
3909 
3910     a = float32_squash_input_denormal(a, status);
3911     aSig = extractFloat32Frac( a );
3912     aExp = extractFloat32Exp( a );
3913     aSign = extractFloat32Sign( a );
3914 
3915     if ( aExp == 0 ) {
3916         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3917         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3918     }
3919     if ( aSign ) {
3920         float_raise(float_flag_invalid, status);
3921         return float32_default_nan(status);
3922     }
3923     if ( aExp == 0xFF ) {
3924         if (aSig) {
3925             return propagateFloat32NaN(a, float32_zero, status);
3926         }
3927         return a;
3928     }
3929 
3930     aExp -= 0x7F;
3931     aSig |= 0x00800000;
3932     zSign = aExp < 0;
3933     zSig = aExp << 23;
3934 
3935     for (i = 1 << 22; i > 0; i >>= 1) {
3936         aSig = ( (uint64_t)aSig * aSig ) >> 23;
3937         if ( aSig & 0x01000000 ) {
3938             aSig >>= 1;
3939             zSig |= i;
3940         }
3941     }
3942 
3943     if ( zSign )
3944         zSig = -zSig;
3945 
3946     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3947 }
3948 
3949 /*----------------------------------------------------------------------------
3950 | Returns 1 if the single-precision floating-point value `a' is equal to
3951 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3952 | raised if either operand is a NaN.  Otherwise, the comparison is performed
3953 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3954 *----------------------------------------------------------------------------*/
3955 
3956 int float32_eq(float32 a, float32 b, float_status *status)
3957 {
3958     uint32_t av, bv;
3959     a = float32_squash_input_denormal(a, status);
3960     b = float32_squash_input_denormal(b, status);
3961 
3962     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3963          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3964        ) {
3965         float_raise(float_flag_invalid, status);
3966         return 0;
3967     }
3968     av = float32_val(a);
3969     bv = float32_val(b);
3970     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3971 }
3972 
3973 /*----------------------------------------------------------------------------
3974 | Returns 1 if the single-precision floating-point value `a' is less than
3975 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
3976 | exception is raised if either operand is a NaN.  The comparison is performed
3977 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3978 *----------------------------------------------------------------------------*/
3979 
3980 int float32_le(float32 a, float32 b, float_status *status)
3981 {
3982     flag aSign, bSign;
3983     uint32_t av, bv;
3984     a = float32_squash_input_denormal(a, status);
3985     b = float32_squash_input_denormal(b, status);
3986 
3987     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3988          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3989        ) {
3990         float_raise(float_flag_invalid, status);
3991         return 0;
3992     }
3993     aSign = extractFloat32Sign( a );
3994     bSign = extractFloat32Sign( b );
3995     av = float32_val(a);
3996     bv = float32_val(b);
3997     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3998     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3999 
4000 }
4001 
4002 /*----------------------------------------------------------------------------
4003 | Returns 1 if the single-precision floating-point value `a' is less than
4004 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4005 | raised if either operand is a NaN.  The comparison is performed according
4006 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4007 *----------------------------------------------------------------------------*/
4008 
4009 int float32_lt(float32 a, float32 b, float_status *status)
4010 {
4011     flag aSign, bSign;
4012     uint32_t av, bv;
4013     a = float32_squash_input_denormal(a, status);
4014     b = float32_squash_input_denormal(b, status);
4015 
4016     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4017          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4018        ) {
4019         float_raise(float_flag_invalid, status);
4020         return 0;
4021     }
4022     aSign = extractFloat32Sign( a );
4023     bSign = extractFloat32Sign( b );
4024     av = float32_val(a);
4025     bv = float32_val(b);
4026     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4027     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4028 
4029 }
4030 
4031 /*----------------------------------------------------------------------------
4032 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4033 | be compared, and 0 otherwise.  The invalid exception is raised if either
4034 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4035 | Standard for Binary Floating-Point Arithmetic.
4036 *----------------------------------------------------------------------------*/
4037 
4038 int float32_unordered(float32 a, float32 b, float_status *status)
4039 {
4040     a = float32_squash_input_denormal(a, status);
4041     b = float32_squash_input_denormal(b, status);
4042 
4043     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4044          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4045        ) {
4046         float_raise(float_flag_invalid, status);
4047         return 1;
4048     }
4049     return 0;
4050 }
4051 
4052 /*----------------------------------------------------------------------------
4053 | Returns 1 if the single-precision floating-point value `a' is equal to
4054 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4055 | exception.  The comparison is performed according to the IEC/IEEE Standard
4056 | for Binary Floating-Point Arithmetic.
4057 *----------------------------------------------------------------------------*/
4058 
4059 int float32_eq_quiet(float32 a, float32 b, float_status *status)
4060 {
4061     a = float32_squash_input_denormal(a, status);
4062     b = float32_squash_input_denormal(b, status);
4063 
4064     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4065          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4066        ) {
4067         if (float32_is_signaling_nan(a, status)
4068          || float32_is_signaling_nan(b, status)) {
4069             float_raise(float_flag_invalid, status);
4070         }
4071         return 0;
4072     }
4073     return ( float32_val(a) == float32_val(b) ) ||
4074             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
4075 }
4076 
4077 /*----------------------------------------------------------------------------
4078 | Returns 1 if the single-precision floating-point value `a' is less than or
4079 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4080 | cause an exception.  Otherwise, the comparison is performed according to the
4081 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4082 *----------------------------------------------------------------------------*/
4083 
4084 int float32_le_quiet(float32 a, float32 b, float_status *status)
4085 {
4086     flag aSign, bSign;
4087     uint32_t av, bv;
4088     a = float32_squash_input_denormal(a, status);
4089     b = float32_squash_input_denormal(b, status);
4090 
4091     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4092          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4093        ) {
4094         if (float32_is_signaling_nan(a, status)
4095          || float32_is_signaling_nan(b, status)) {
4096             float_raise(float_flag_invalid, status);
4097         }
4098         return 0;
4099     }
4100     aSign = extractFloat32Sign( a );
4101     bSign = extractFloat32Sign( b );
4102     av = float32_val(a);
4103     bv = float32_val(b);
4104     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4105     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4106 
4107 }
4108 
4109 /*----------------------------------------------------------------------------
4110 | Returns 1 if the single-precision floating-point value `a' is less than
4111 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4112 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4113 | Standard for Binary Floating-Point Arithmetic.
4114 *----------------------------------------------------------------------------*/
4115 
4116 int float32_lt_quiet(float32 a, float32 b, float_status *status)
4117 {
4118     flag aSign, bSign;
4119     uint32_t av, bv;
4120     a = float32_squash_input_denormal(a, status);
4121     b = float32_squash_input_denormal(b, status);
4122 
4123     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4124          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4125        ) {
4126         if (float32_is_signaling_nan(a, status)
4127          || float32_is_signaling_nan(b, status)) {
4128             float_raise(float_flag_invalid, status);
4129         }
4130         return 0;
4131     }
4132     aSign = extractFloat32Sign( a );
4133     bSign = extractFloat32Sign( b );
4134     av = float32_val(a);
4135     bv = float32_val(b);
4136     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4137     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4138 
4139 }
4140 
4141 /*----------------------------------------------------------------------------
4142 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4143 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4144 | comparison is performed according to the IEC/IEEE Standard for Binary
4145 | Floating-Point Arithmetic.
4146 *----------------------------------------------------------------------------*/
4147 
4148 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
4149 {
4150     a = float32_squash_input_denormal(a, status);
4151     b = float32_squash_input_denormal(b, status);
4152 
4153     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4154          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4155        ) {
4156         if (float32_is_signaling_nan(a, status)
4157          || float32_is_signaling_nan(b, status)) {
4158             float_raise(float_flag_invalid, status);
4159         }
4160         return 1;
4161     }
4162     return 0;
4163 }
4164 
4165 /*----------------------------------------------------------------------------
4166 | If `a' is denormal and we are in flush-to-zero mode then set the
4167 | input-denormal exception and return zero. Otherwise just return the value.
4168 *----------------------------------------------------------------------------*/
4169 float16 float16_squash_input_denormal(float16 a, float_status *status)
4170 {
4171     if (status->flush_inputs_to_zero) {
4172         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
4173             float_raise(float_flag_input_denormal, status);
4174             return make_float16(float16_val(a) & 0x8000);
4175         }
4176     }
4177     return a;
4178 }
4179 
4180 /*----------------------------------------------------------------------------
4181 | Returns the result of converting the double-precision floating-point value
4182 | `a' to the extended double-precision floating-point format.  The conversion
4183 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4184 | Arithmetic.
4185 *----------------------------------------------------------------------------*/
4186 
4187 floatx80 float64_to_floatx80(float64 a, float_status *status)
4188 {
4189     flag aSign;
4190     int aExp;
4191     uint64_t aSig;
4192 
4193     a = float64_squash_input_denormal(a, status);
4194     aSig = extractFloat64Frac( a );
4195     aExp = extractFloat64Exp( a );
4196     aSign = extractFloat64Sign( a );
4197     if ( aExp == 0x7FF ) {
4198         if (aSig) {
4199             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4200         }
4201         return packFloatx80(aSign,
4202                             floatx80_infinity_high,
4203                             floatx80_infinity_low);
4204     }
4205     if ( aExp == 0 ) {
4206         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4207         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4208     }
4209     return
4210         packFloatx80(
4211             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4212 
4213 }
4214 
4215 /*----------------------------------------------------------------------------
4216 | Returns the result of converting the double-precision floating-point value
4217 | `a' to the quadruple-precision floating-point format.  The conversion is
4218 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4219 | Arithmetic.
4220 *----------------------------------------------------------------------------*/
4221 
4222 float128 float64_to_float128(float64 a, float_status *status)
4223 {
4224     flag aSign;
4225     int aExp;
4226     uint64_t aSig, zSig0, zSig1;
4227 
4228     a = float64_squash_input_denormal(a, status);
4229     aSig = extractFloat64Frac( a );
4230     aExp = extractFloat64Exp( a );
4231     aSign = extractFloat64Sign( a );
4232     if ( aExp == 0x7FF ) {
4233         if (aSig) {
4234             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4235         }
4236         return packFloat128( aSign, 0x7FFF, 0, 0 );
4237     }
4238     if ( aExp == 0 ) {
4239         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4240         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4241         --aExp;
4242     }
4243     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4244     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4245 
4246 }
4247 
4248 
4249 /*----------------------------------------------------------------------------
4250 | Returns the remainder of the double-precision floating-point value `a'
4251 | with respect to the corresponding value `b'.  The operation is performed
4252 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4253 *----------------------------------------------------------------------------*/
4254 
4255 float64 float64_rem(float64 a, float64 b, float_status *status)
4256 {
4257     flag aSign, zSign;
4258     int aExp, bExp, expDiff;
4259     uint64_t aSig, bSig;
4260     uint64_t q, alternateASig;
4261     int64_t sigMean;
4262 
4263     a = float64_squash_input_denormal(a, status);
4264     b = float64_squash_input_denormal(b, status);
4265     aSig = extractFloat64Frac( a );
4266     aExp = extractFloat64Exp( a );
4267     aSign = extractFloat64Sign( a );
4268     bSig = extractFloat64Frac( b );
4269     bExp = extractFloat64Exp( b );
4270     if ( aExp == 0x7FF ) {
4271         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4272             return propagateFloat64NaN(a, b, status);
4273         }
4274         float_raise(float_flag_invalid, status);
4275         return float64_default_nan(status);
4276     }
4277     if ( bExp == 0x7FF ) {
4278         if (bSig) {
4279             return propagateFloat64NaN(a, b, status);
4280         }
4281         return a;
4282     }
4283     if ( bExp == 0 ) {
4284         if ( bSig == 0 ) {
4285             float_raise(float_flag_invalid, status);
4286             return float64_default_nan(status);
4287         }
4288         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4289     }
4290     if ( aExp == 0 ) {
4291         if ( aSig == 0 ) return a;
4292         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4293     }
4294     expDiff = aExp - bExp;
4295     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4296     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4297     if ( expDiff < 0 ) {
4298         if ( expDiff < -1 ) return a;
4299         aSig >>= 1;
4300     }
4301     q = ( bSig <= aSig );
4302     if ( q ) aSig -= bSig;
4303     expDiff -= 64;
4304     while ( 0 < expDiff ) {
4305         q = estimateDiv128To64( aSig, 0, bSig );
4306         q = ( 2 < q ) ? q - 2 : 0;
4307         aSig = - ( ( bSig>>2 ) * q );
4308         expDiff -= 62;
4309     }
4310     expDiff += 64;
4311     if ( 0 < expDiff ) {
4312         q = estimateDiv128To64( aSig, 0, bSig );
4313         q = ( 2 < q ) ? q - 2 : 0;
4314         q >>= 64 - expDiff;
4315         bSig >>= 2;
4316         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4317     }
4318     else {
4319         aSig >>= 2;
4320         bSig >>= 2;
4321     }
4322     do {
4323         alternateASig = aSig;
4324         ++q;
4325         aSig -= bSig;
4326     } while ( 0 <= (int64_t) aSig );
4327     sigMean = aSig + alternateASig;
4328     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4329         aSig = alternateASig;
4330     }
4331     zSign = ( (int64_t) aSig < 0 );
4332     if ( zSign ) aSig = - aSig;
4333     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4334 
4335 }
4336 
4337 /*----------------------------------------------------------------------------
4338 | Returns the binary log of the double-precision floating-point value `a'.
4339 | The operation is performed according to the IEC/IEEE Standard for Binary
4340 | Floating-Point Arithmetic.
4341 *----------------------------------------------------------------------------*/
4342 float64 float64_log2(float64 a, float_status *status)
4343 {
4344     flag aSign, zSign;
4345     int aExp;
4346     uint64_t aSig, aSig0, aSig1, zSig, i;
4347     a = float64_squash_input_denormal(a, status);
4348 
4349     aSig = extractFloat64Frac( a );
4350     aExp = extractFloat64Exp( a );
4351     aSign = extractFloat64Sign( a );
4352 
4353     if ( aExp == 0 ) {
4354         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4355         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4356     }
4357     if ( aSign ) {
4358         float_raise(float_flag_invalid, status);
4359         return float64_default_nan(status);
4360     }
4361     if ( aExp == 0x7FF ) {
4362         if (aSig) {
4363             return propagateFloat64NaN(a, float64_zero, status);
4364         }
4365         return a;
4366     }
4367 
4368     aExp -= 0x3FF;
4369     aSig |= LIT64( 0x0010000000000000 );
4370     zSign = aExp < 0;
4371     zSig = (uint64_t)aExp << 52;
4372     for (i = 1LL << 51; i > 0; i >>= 1) {
4373         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4374         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4375         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4376             aSig >>= 1;
4377             zSig |= i;
4378         }
4379     }
4380 
4381     if ( zSign )
4382         zSig = -zSig;
4383     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4384 }
4385 
4386 /*----------------------------------------------------------------------------
4387 | Returns 1 if the double-precision floating-point value `a' is equal to the
4388 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4389 | if either operand is a NaN.  Otherwise, the comparison is performed
4390 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4391 *----------------------------------------------------------------------------*/
4392 
4393 int float64_eq(float64 a, float64 b, float_status *status)
4394 {
4395     uint64_t av, bv;
4396     a = float64_squash_input_denormal(a, status);
4397     b = float64_squash_input_denormal(b, status);
4398 
4399     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4400          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4401        ) {
4402         float_raise(float_flag_invalid, status);
4403         return 0;
4404     }
4405     av = float64_val(a);
4406     bv = float64_val(b);
4407     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4408 
4409 }
4410 
4411 /*----------------------------------------------------------------------------
4412 | Returns 1 if the double-precision floating-point value `a' is less than or
4413 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4414 | exception is raised if either operand is a NaN.  The comparison is performed
4415 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4416 *----------------------------------------------------------------------------*/
4417 
4418 int float64_le(float64 a, float64 b, float_status *status)
4419 {
4420     flag aSign, bSign;
4421     uint64_t av, bv;
4422     a = float64_squash_input_denormal(a, status);
4423     b = float64_squash_input_denormal(b, status);
4424 
4425     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4426          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4427        ) {
4428         float_raise(float_flag_invalid, status);
4429         return 0;
4430     }
4431     aSign = extractFloat64Sign( a );
4432     bSign = extractFloat64Sign( b );
4433     av = float64_val(a);
4434     bv = float64_val(b);
4435     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4436     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4437 
4438 }
4439 
4440 /*----------------------------------------------------------------------------
4441 | Returns 1 if the double-precision floating-point value `a' is less than
4442 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4443 | raised if either operand is a NaN.  The comparison is performed according
4444 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4445 *----------------------------------------------------------------------------*/
4446 
4447 int float64_lt(float64 a, float64 b, float_status *status)
4448 {
4449     flag aSign, bSign;
4450     uint64_t av, bv;
4451 
4452     a = float64_squash_input_denormal(a, status);
4453     b = float64_squash_input_denormal(b, status);
4454     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4455          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4456        ) {
4457         float_raise(float_flag_invalid, status);
4458         return 0;
4459     }
4460     aSign = extractFloat64Sign( a );
4461     bSign = extractFloat64Sign( b );
4462     av = float64_val(a);
4463     bv = float64_val(b);
4464     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4465     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4466 
4467 }
4468 
4469 /*----------------------------------------------------------------------------
4470 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4471 | be compared, and 0 otherwise.  The invalid exception is raised if either
4472 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4473 | Standard for Binary Floating-Point Arithmetic.
4474 *----------------------------------------------------------------------------*/
4475 
4476 int float64_unordered(float64 a, float64 b, float_status *status)
4477 {
4478     a = float64_squash_input_denormal(a, status);
4479     b = float64_squash_input_denormal(b, status);
4480 
4481     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4482          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4483        ) {
4484         float_raise(float_flag_invalid, status);
4485         return 1;
4486     }
4487     return 0;
4488 }
4489 
4490 /*----------------------------------------------------------------------------
4491 | Returns 1 if the double-precision floating-point value `a' is equal to the
4492 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4493 | exception.The comparison is performed according to the IEC/IEEE Standard
4494 | for Binary Floating-Point Arithmetic.
4495 *----------------------------------------------------------------------------*/
4496 
4497 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4498 {
4499     uint64_t av, bv;
4500     a = float64_squash_input_denormal(a, status);
4501     b = float64_squash_input_denormal(b, status);
4502 
4503     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4504          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4505        ) {
4506         if (float64_is_signaling_nan(a, status)
4507          || float64_is_signaling_nan(b, status)) {
4508             float_raise(float_flag_invalid, status);
4509         }
4510         return 0;
4511     }
4512     av = float64_val(a);
4513     bv = float64_val(b);
4514     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4515 
4516 }
4517 
4518 /*----------------------------------------------------------------------------
4519 | Returns 1 if the double-precision floating-point value `a' is less than or
4520 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4521 | cause an exception.  Otherwise, the comparison is performed according to the
4522 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4523 *----------------------------------------------------------------------------*/
4524 
4525 int float64_le_quiet(float64 a, float64 b, float_status *status)
4526 {
4527     flag aSign, bSign;
4528     uint64_t av, bv;
4529     a = float64_squash_input_denormal(a, status);
4530     b = float64_squash_input_denormal(b, status);
4531 
4532     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4533          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4534        ) {
4535         if (float64_is_signaling_nan(a, status)
4536          || float64_is_signaling_nan(b, status)) {
4537             float_raise(float_flag_invalid, status);
4538         }
4539         return 0;
4540     }
4541     aSign = extractFloat64Sign( a );
4542     bSign = extractFloat64Sign( b );
4543     av = float64_val(a);
4544     bv = float64_val(b);
4545     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4546     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4547 
4548 }
4549 
4550 /*----------------------------------------------------------------------------
4551 | Returns 1 if the double-precision floating-point value `a' is less than
4552 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4553 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4554 | Standard for Binary Floating-Point Arithmetic.
4555 *----------------------------------------------------------------------------*/
4556 
4557 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4558 {
4559     flag aSign, bSign;
4560     uint64_t av, bv;
4561     a = float64_squash_input_denormal(a, status);
4562     b = float64_squash_input_denormal(b, status);
4563 
4564     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4565          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4566        ) {
4567         if (float64_is_signaling_nan(a, status)
4568          || float64_is_signaling_nan(b, status)) {
4569             float_raise(float_flag_invalid, status);
4570         }
4571         return 0;
4572     }
4573     aSign = extractFloat64Sign( a );
4574     bSign = extractFloat64Sign( b );
4575     av = float64_val(a);
4576     bv = float64_val(b);
4577     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4578     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4579 
4580 }
4581 
4582 /*----------------------------------------------------------------------------
4583 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4584 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4585 | comparison is performed according to the IEC/IEEE Standard for Binary
4586 | Floating-Point Arithmetic.
4587 *----------------------------------------------------------------------------*/
4588 
4589 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4590 {
4591     a = float64_squash_input_denormal(a, status);
4592     b = float64_squash_input_denormal(b, status);
4593 
4594     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4595          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4596        ) {
4597         if (float64_is_signaling_nan(a, status)
4598          || float64_is_signaling_nan(b, status)) {
4599             float_raise(float_flag_invalid, status);
4600         }
4601         return 1;
4602     }
4603     return 0;
4604 }
4605 
4606 /*----------------------------------------------------------------------------
4607 | Returns the result of converting the extended double-precision floating-
4608 | point value `a' to the 32-bit two's complement integer format.  The
4609 | conversion is performed according to the IEC/IEEE Standard for Binary
4610 | Floating-Point Arithmetic---which means in particular that the conversion
4611 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4612 | largest positive integer is returned.  Otherwise, if the conversion
4613 | overflows, the largest integer with the same sign as `a' is returned.
4614 *----------------------------------------------------------------------------*/
4615 
4616 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4617 {
4618     flag aSign;
4619     int32_t aExp, shiftCount;
4620     uint64_t aSig;
4621 
4622     if (floatx80_invalid_encoding(a)) {
4623         float_raise(float_flag_invalid, status);
4624         return 1 << 31;
4625     }
4626     aSig = extractFloatx80Frac( a );
4627     aExp = extractFloatx80Exp( a );
4628     aSign = extractFloatx80Sign( a );
4629     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4630     shiftCount = 0x4037 - aExp;
4631     if ( shiftCount <= 0 ) shiftCount = 1;
4632     shift64RightJamming( aSig, shiftCount, &aSig );
4633     return roundAndPackInt32(aSign, aSig, status);
4634 
4635 }
4636 
4637 /*----------------------------------------------------------------------------
4638 | Returns the result of converting the extended double-precision floating-
4639 | point value `a' to the 32-bit two's complement integer format.  The
4640 | conversion is performed according to the IEC/IEEE Standard for Binary
4641 | Floating-Point Arithmetic, except that the conversion is always rounded
4642 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4643 | Otherwise, if the conversion overflows, the largest integer with the same
4644 | sign as `a' is returned.
4645 *----------------------------------------------------------------------------*/
4646 
4647 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4648 {
4649     flag aSign;
4650     int32_t aExp, shiftCount;
4651     uint64_t aSig, savedASig;
4652     int32_t z;
4653 
4654     if (floatx80_invalid_encoding(a)) {
4655         float_raise(float_flag_invalid, status);
4656         return 1 << 31;
4657     }
4658     aSig = extractFloatx80Frac( a );
4659     aExp = extractFloatx80Exp( a );
4660     aSign = extractFloatx80Sign( a );
4661     if ( 0x401E < aExp ) {
4662         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4663         goto invalid;
4664     }
4665     else if ( aExp < 0x3FFF ) {
4666         if (aExp || aSig) {
4667             status->float_exception_flags |= float_flag_inexact;
4668         }
4669         return 0;
4670     }
4671     shiftCount = 0x403E - aExp;
4672     savedASig = aSig;
4673     aSig >>= shiftCount;
4674     z = aSig;
4675     if ( aSign ) z = - z;
4676     if ( ( z < 0 ) ^ aSign ) {
4677  invalid:
4678         float_raise(float_flag_invalid, status);
4679         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4680     }
4681     if ( ( aSig<<shiftCount ) != savedASig ) {
4682         status->float_exception_flags |= float_flag_inexact;
4683     }
4684     return z;
4685 
4686 }
4687 
4688 /*----------------------------------------------------------------------------
4689 | Returns the result of converting the extended double-precision floating-
4690 | point value `a' to the 64-bit two's complement integer format.  The
4691 | conversion is performed according to the IEC/IEEE Standard for Binary
4692 | Floating-Point Arithmetic---which means in particular that the conversion
4693 | is rounded according to the current rounding mode.  If `a' is a NaN,
4694 | the largest positive integer is returned.  Otherwise, if the conversion
4695 | overflows, the largest integer with the same sign as `a' is returned.
4696 *----------------------------------------------------------------------------*/
4697 
4698 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4699 {
4700     flag aSign;
4701     int32_t aExp, shiftCount;
4702     uint64_t aSig, aSigExtra;
4703 
4704     if (floatx80_invalid_encoding(a)) {
4705         float_raise(float_flag_invalid, status);
4706         return 1ULL << 63;
4707     }
4708     aSig = extractFloatx80Frac( a );
4709     aExp = extractFloatx80Exp( a );
4710     aSign = extractFloatx80Sign( a );
4711     shiftCount = 0x403E - aExp;
4712     if ( shiftCount <= 0 ) {
4713         if ( shiftCount ) {
4714             float_raise(float_flag_invalid, status);
4715             if (!aSign || floatx80_is_any_nan(a)) {
4716                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4717             }
4718             return (int64_t) LIT64( 0x8000000000000000 );
4719         }
4720         aSigExtra = 0;
4721     }
4722     else {
4723         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4724     }
4725     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4726 
4727 }
4728 
4729 /*----------------------------------------------------------------------------
4730 | Returns the result of converting the extended double-precision floating-
4731 | point value `a' to the 64-bit two's complement integer format.  The
4732 | conversion is performed according to the IEC/IEEE Standard for Binary
4733 | Floating-Point Arithmetic, except that the conversion is always rounded
4734 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4735 | Otherwise, if the conversion overflows, the largest integer with the same
4736 | sign as `a' is returned.
4737 *----------------------------------------------------------------------------*/
4738 
4739 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4740 {
4741     flag aSign;
4742     int32_t aExp, shiftCount;
4743     uint64_t aSig;
4744     int64_t z;
4745 
4746     if (floatx80_invalid_encoding(a)) {
4747         float_raise(float_flag_invalid, status);
4748         return 1ULL << 63;
4749     }
4750     aSig = extractFloatx80Frac( a );
4751     aExp = extractFloatx80Exp( a );
4752     aSign = extractFloatx80Sign( a );
4753     shiftCount = aExp - 0x403E;
4754     if ( 0 <= shiftCount ) {
4755         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4756         if ( ( a.high != 0xC03E ) || aSig ) {
4757             float_raise(float_flag_invalid, status);
4758             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4759                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4760             }
4761         }
4762         return (int64_t) LIT64( 0x8000000000000000 );
4763     }
4764     else if ( aExp < 0x3FFF ) {
4765         if (aExp | aSig) {
4766             status->float_exception_flags |= float_flag_inexact;
4767         }
4768         return 0;
4769     }
4770     z = aSig>>( - shiftCount );
4771     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4772         status->float_exception_flags |= float_flag_inexact;
4773     }
4774     if ( aSign ) z = - z;
4775     return z;
4776 
4777 }
4778 
4779 /*----------------------------------------------------------------------------
4780 | Returns the result of converting the extended double-precision floating-
4781 | point value `a' to the single-precision floating-point format.  The
4782 | conversion is performed according to the IEC/IEEE Standard for Binary
4783 | Floating-Point Arithmetic.
4784 *----------------------------------------------------------------------------*/
4785 
4786 float32 floatx80_to_float32(floatx80 a, float_status *status)
4787 {
4788     flag aSign;
4789     int32_t aExp;
4790     uint64_t aSig;
4791 
4792     if (floatx80_invalid_encoding(a)) {
4793         float_raise(float_flag_invalid, status);
4794         return float32_default_nan(status);
4795     }
4796     aSig = extractFloatx80Frac( a );
4797     aExp = extractFloatx80Exp( a );
4798     aSign = extractFloatx80Sign( a );
4799     if ( aExp == 0x7FFF ) {
4800         if ( (uint64_t) ( aSig<<1 ) ) {
4801             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4802         }
4803         return packFloat32( aSign, 0xFF, 0 );
4804     }
4805     shift64RightJamming( aSig, 33, &aSig );
4806     if ( aExp || aSig ) aExp -= 0x3F81;
4807     return roundAndPackFloat32(aSign, aExp, aSig, status);
4808 
4809 }
4810 
4811 /*----------------------------------------------------------------------------
4812 | Returns the result of converting the extended double-precision floating-
4813 | point value `a' to the double-precision floating-point format.  The
4814 | conversion is performed according to the IEC/IEEE Standard for Binary
4815 | Floating-Point Arithmetic.
4816 *----------------------------------------------------------------------------*/
4817 
4818 float64 floatx80_to_float64(floatx80 a, float_status *status)
4819 {
4820     flag aSign;
4821     int32_t aExp;
4822     uint64_t aSig, zSig;
4823 
4824     if (floatx80_invalid_encoding(a)) {
4825         float_raise(float_flag_invalid, status);
4826         return float64_default_nan(status);
4827     }
4828     aSig = extractFloatx80Frac( a );
4829     aExp = extractFloatx80Exp( a );
4830     aSign = extractFloatx80Sign( a );
4831     if ( aExp == 0x7FFF ) {
4832         if ( (uint64_t) ( aSig<<1 ) ) {
4833             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4834         }
4835         return packFloat64( aSign, 0x7FF, 0 );
4836     }
4837     shift64RightJamming( aSig, 1, &zSig );
4838     if ( aExp || aSig ) aExp -= 0x3C01;
4839     return roundAndPackFloat64(aSign, aExp, zSig, status);
4840 
4841 }
4842 
4843 /*----------------------------------------------------------------------------
4844 | Returns the result of converting the extended double-precision floating-
4845 | point value `a' to the quadruple-precision floating-point format.  The
4846 | conversion is performed according to the IEC/IEEE Standard for Binary
4847 | Floating-Point Arithmetic.
4848 *----------------------------------------------------------------------------*/
4849 
4850 float128 floatx80_to_float128(floatx80 a, float_status *status)
4851 {
4852     flag aSign;
4853     int aExp;
4854     uint64_t aSig, zSig0, zSig1;
4855 
4856     if (floatx80_invalid_encoding(a)) {
4857         float_raise(float_flag_invalid, status);
4858         return float128_default_nan(status);
4859     }
4860     aSig = extractFloatx80Frac( a );
4861     aExp = extractFloatx80Exp( a );
4862     aSign = extractFloatx80Sign( a );
4863     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4864         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4865     }
4866     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4867     return packFloat128( aSign, aExp, zSig0, zSig1 );
4868 
4869 }
4870 
4871 /*----------------------------------------------------------------------------
4872 | Rounds the extended double-precision floating-point value `a'
4873 | to the precision provided by floatx80_rounding_precision and returns the
4874 | result as an extended double-precision floating-point value.
4875 | The operation is performed according to the IEC/IEEE Standard for Binary
4876 | Floating-Point Arithmetic.
4877 *----------------------------------------------------------------------------*/
4878 
4879 floatx80 floatx80_round(floatx80 a, float_status *status)
4880 {
4881     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4882                                 extractFloatx80Sign(a),
4883                                 extractFloatx80Exp(a),
4884                                 extractFloatx80Frac(a), 0, status);
4885 }
4886 
4887 /*----------------------------------------------------------------------------
4888 | Rounds the extended double-precision floating-point value `a' to an integer,
4889 | and returns the result as an extended quadruple-precision floating-point
4890 | value.  The operation is performed according to the IEC/IEEE Standard for
4891 | Binary Floating-Point Arithmetic.
4892 *----------------------------------------------------------------------------*/
4893 
4894 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4895 {
4896     flag aSign;
4897     int32_t aExp;
4898     uint64_t lastBitMask, roundBitsMask;
4899     floatx80 z;
4900 
4901     if (floatx80_invalid_encoding(a)) {
4902         float_raise(float_flag_invalid, status);
4903         return floatx80_default_nan(status);
4904     }
4905     aExp = extractFloatx80Exp( a );
4906     if ( 0x403E <= aExp ) {
4907         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4908             return propagateFloatx80NaN(a, a, status);
4909         }
4910         return a;
4911     }
4912     if ( aExp < 0x3FFF ) {
4913         if (    ( aExp == 0 )
4914              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4915             return a;
4916         }
4917         status->float_exception_flags |= float_flag_inexact;
4918         aSign = extractFloatx80Sign( a );
4919         switch (status->float_rounding_mode) {
4920          case float_round_nearest_even:
4921             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4922                ) {
4923                 return
4924                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4925             }
4926             break;
4927         case float_round_ties_away:
4928             if (aExp == 0x3FFE) {
4929                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4930             }
4931             break;
4932          case float_round_down:
4933             return
4934                   aSign ?
4935                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4936                 : packFloatx80( 0, 0, 0 );
4937          case float_round_up:
4938             return
4939                   aSign ? packFloatx80( 1, 0, 0 )
4940                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4941         }
4942         return packFloatx80( aSign, 0, 0 );
4943     }
4944     lastBitMask = 1;
4945     lastBitMask <<= 0x403E - aExp;
4946     roundBitsMask = lastBitMask - 1;
4947     z = a;
4948     switch (status->float_rounding_mode) {
4949     case float_round_nearest_even:
4950         z.low += lastBitMask>>1;
4951         if ((z.low & roundBitsMask) == 0) {
4952             z.low &= ~lastBitMask;
4953         }
4954         break;
4955     case float_round_ties_away:
4956         z.low += lastBitMask >> 1;
4957         break;
4958     case float_round_to_zero:
4959         break;
4960     case float_round_up:
4961         if (!extractFloatx80Sign(z)) {
4962             z.low += roundBitsMask;
4963         }
4964         break;
4965     case float_round_down:
4966         if (extractFloatx80Sign(z)) {
4967             z.low += roundBitsMask;
4968         }
4969         break;
4970     default:
4971         abort();
4972     }
4973     z.low &= ~ roundBitsMask;
4974     if ( z.low == 0 ) {
4975         ++z.high;
4976         z.low = LIT64( 0x8000000000000000 );
4977     }
4978     if (z.low != a.low) {
4979         status->float_exception_flags |= float_flag_inexact;
4980     }
4981     return z;
4982 
4983 }
4984 
4985 /*----------------------------------------------------------------------------
4986 | Returns the result of adding the absolute values of the extended double-
4987 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4988 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4989 | The addition is performed according to the IEC/IEEE Standard for Binary
4990 | Floating-Point Arithmetic.
4991 *----------------------------------------------------------------------------*/
4992 
4993 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4994                                 float_status *status)
4995 {
4996     int32_t aExp, bExp, zExp;
4997     uint64_t aSig, bSig, zSig0, zSig1;
4998     int32_t expDiff;
4999 
5000     aSig = extractFloatx80Frac( a );
5001     aExp = extractFloatx80Exp( a );
5002     bSig = extractFloatx80Frac( b );
5003     bExp = extractFloatx80Exp( b );
5004     expDiff = aExp - bExp;
5005     if ( 0 < expDiff ) {
5006         if ( aExp == 0x7FFF ) {
5007             if ((uint64_t)(aSig << 1)) {
5008                 return propagateFloatx80NaN(a, b, status);
5009             }
5010             return a;
5011         }
5012         if ( bExp == 0 ) --expDiff;
5013         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5014         zExp = aExp;
5015     }
5016     else if ( expDiff < 0 ) {
5017         if ( bExp == 0x7FFF ) {
5018             if ((uint64_t)(bSig << 1)) {
5019                 return propagateFloatx80NaN(a, b, status);
5020             }
5021             return packFloatx80(zSign,
5022                                 floatx80_infinity_high,
5023                                 floatx80_infinity_low);
5024         }
5025         if ( aExp == 0 ) ++expDiff;
5026         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5027         zExp = bExp;
5028     }
5029     else {
5030         if ( aExp == 0x7FFF ) {
5031             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5032                 return propagateFloatx80NaN(a, b, status);
5033             }
5034             return a;
5035         }
5036         zSig1 = 0;
5037         zSig0 = aSig + bSig;
5038         if ( aExp == 0 ) {
5039             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5040             goto roundAndPack;
5041         }
5042         zExp = aExp;
5043         goto shiftRight1;
5044     }
5045     zSig0 = aSig + bSig;
5046     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5047  shiftRight1:
5048     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5049     zSig0 |= LIT64( 0x8000000000000000 );
5050     ++zExp;
5051  roundAndPack:
5052     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5053                                 zSign, zExp, zSig0, zSig1, status);
5054 }
5055 
5056 /*----------------------------------------------------------------------------
5057 | Returns the result of subtracting the absolute values of the extended
5058 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5059 | difference is negated before being returned.  `zSign' is ignored if the
5060 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5061 | Standard for Binary Floating-Point Arithmetic.
5062 *----------------------------------------------------------------------------*/
5063 
5064 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5065                                 float_status *status)
5066 {
5067     int32_t aExp, bExp, zExp;
5068     uint64_t aSig, bSig, zSig0, zSig1;
5069     int32_t expDiff;
5070 
5071     aSig = extractFloatx80Frac( a );
5072     aExp = extractFloatx80Exp( a );
5073     bSig = extractFloatx80Frac( b );
5074     bExp = extractFloatx80Exp( b );
5075     expDiff = aExp - bExp;
5076     if ( 0 < expDiff ) goto aExpBigger;
5077     if ( expDiff < 0 ) goto bExpBigger;
5078     if ( aExp == 0x7FFF ) {
5079         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5080             return propagateFloatx80NaN(a, b, status);
5081         }
5082         float_raise(float_flag_invalid, status);
5083         return floatx80_default_nan(status);
5084     }
5085     if ( aExp == 0 ) {
5086         aExp = 1;
5087         bExp = 1;
5088     }
5089     zSig1 = 0;
5090     if ( bSig < aSig ) goto aBigger;
5091     if ( aSig < bSig ) goto bBigger;
5092     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5093  bExpBigger:
5094     if ( bExp == 0x7FFF ) {
5095         if ((uint64_t)(bSig << 1)) {
5096             return propagateFloatx80NaN(a, b, status);
5097         }
5098         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5099                             floatx80_infinity_low);
5100     }
5101     if ( aExp == 0 ) ++expDiff;
5102     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5103  bBigger:
5104     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5105     zExp = bExp;
5106     zSign ^= 1;
5107     goto normalizeRoundAndPack;
5108  aExpBigger:
5109     if ( aExp == 0x7FFF ) {
5110         if ((uint64_t)(aSig << 1)) {
5111             return propagateFloatx80NaN(a, b, status);
5112         }
5113         return a;
5114     }
5115     if ( bExp == 0 ) --expDiff;
5116     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5117  aBigger:
5118     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5119     zExp = aExp;
5120  normalizeRoundAndPack:
5121     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5122                                          zSign, zExp, zSig0, zSig1, status);
5123 }
5124 
5125 /*----------------------------------------------------------------------------
5126 | Returns the result of adding the extended double-precision floating-point
5127 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5128 | Standard for Binary Floating-Point Arithmetic.
5129 *----------------------------------------------------------------------------*/
5130 
5131 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5132 {
5133     flag aSign, bSign;
5134 
5135     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5136         float_raise(float_flag_invalid, status);
5137         return floatx80_default_nan(status);
5138     }
5139     aSign = extractFloatx80Sign( a );
5140     bSign = extractFloatx80Sign( b );
5141     if ( aSign == bSign ) {
5142         return addFloatx80Sigs(a, b, aSign, status);
5143     }
5144     else {
5145         return subFloatx80Sigs(a, b, aSign, status);
5146     }
5147 
5148 }
5149 
5150 /*----------------------------------------------------------------------------
5151 | Returns the result of subtracting the extended double-precision floating-
5152 | point values `a' and `b'.  The operation is performed according to the
5153 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5154 *----------------------------------------------------------------------------*/
5155 
5156 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5157 {
5158     flag aSign, bSign;
5159 
5160     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5161         float_raise(float_flag_invalid, status);
5162         return floatx80_default_nan(status);
5163     }
5164     aSign = extractFloatx80Sign( a );
5165     bSign = extractFloatx80Sign( b );
5166     if ( aSign == bSign ) {
5167         return subFloatx80Sigs(a, b, aSign, status);
5168     }
5169     else {
5170         return addFloatx80Sigs(a, b, aSign, status);
5171     }
5172 
5173 }
5174 
5175 /*----------------------------------------------------------------------------
5176 | Returns the result of multiplying the extended double-precision floating-
5177 | point values `a' and `b'.  The operation is performed according to the
5178 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5179 *----------------------------------------------------------------------------*/
5180 
5181 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5182 {
5183     flag aSign, bSign, zSign;
5184     int32_t aExp, bExp, zExp;
5185     uint64_t aSig, bSig, zSig0, zSig1;
5186 
5187     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5188         float_raise(float_flag_invalid, status);
5189         return floatx80_default_nan(status);
5190     }
5191     aSig = extractFloatx80Frac( a );
5192     aExp = extractFloatx80Exp( a );
5193     aSign = extractFloatx80Sign( a );
5194     bSig = extractFloatx80Frac( b );
5195     bExp = extractFloatx80Exp( b );
5196     bSign = extractFloatx80Sign( b );
5197     zSign = aSign ^ bSign;
5198     if ( aExp == 0x7FFF ) {
5199         if (    (uint64_t) ( aSig<<1 )
5200              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5201             return propagateFloatx80NaN(a, b, status);
5202         }
5203         if ( ( bExp | bSig ) == 0 ) goto invalid;
5204         return packFloatx80(zSign, floatx80_infinity_high,
5205                                    floatx80_infinity_low);
5206     }
5207     if ( bExp == 0x7FFF ) {
5208         if ((uint64_t)(bSig << 1)) {
5209             return propagateFloatx80NaN(a, b, status);
5210         }
5211         if ( ( aExp | aSig ) == 0 ) {
5212  invalid:
5213             float_raise(float_flag_invalid, status);
5214             return floatx80_default_nan(status);
5215         }
5216         return packFloatx80(zSign, floatx80_infinity_high,
5217                                    floatx80_infinity_low);
5218     }
5219     if ( aExp == 0 ) {
5220         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5221         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5222     }
5223     if ( bExp == 0 ) {
5224         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5225         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5226     }
5227     zExp = aExp + bExp - 0x3FFE;
5228     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5229     if ( 0 < (int64_t) zSig0 ) {
5230         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5231         --zExp;
5232     }
5233     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5234                                 zSign, zExp, zSig0, zSig1, status);
5235 }
5236 
5237 /*----------------------------------------------------------------------------
5238 | Returns the result of dividing the extended double-precision floating-point
5239 | value `a' by the corresponding value `b'.  The operation is performed
5240 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5241 *----------------------------------------------------------------------------*/
5242 
5243 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5244 {
5245     flag aSign, bSign, zSign;
5246     int32_t aExp, bExp, zExp;
5247     uint64_t aSig, bSig, zSig0, zSig1;
5248     uint64_t rem0, rem1, rem2, term0, term1, term2;
5249 
5250     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5251         float_raise(float_flag_invalid, status);
5252         return floatx80_default_nan(status);
5253     }
5254     aSig = extractFloatx80Frac( a );
5255     aExp = extractFloatx80Exp( a );
5256     aSign = extractFloatx80Sign( a );
5257     bSig = extractFloatx80Frac( b );
5258     bExp = extractFloatx80Exp( b );
5259     bSign = extractFloatx80Sign( b );
5260     zSign = aSign ^ bSign;
5261     if ( aExp == 0x7FFF ) {
5262         if ((uint64_t)(aSig << 1)) {
5263             return propagateFloatx80NaN(a, b, status);
5264         }
5265         if ( bExp == 0x7FFF ) {
5266             if ((uint64_t)(bSig << 1)) {
5267                 return propagateFloatx80NaN(a, b, status);
5268             }
5269             goto invalid;
5270         }
5271         return packFloatx80(zSign, floatx80_infinity_high,
5272                                    floatx80_infinity_low);
5273     }
5274     if ( bExp == 0x7FFF ) {
5275         if ((uint64_t)(bSig << 1)) {
5276             return propagateFloatx80NaN(a, b, status);
5277         }
5278         return packFloatx80( zSign, 0, 0 );
5279     }
5280     if ( bExp == 0 ) {
5281         if ( bSig == 0 ) {
5282             if ( ( aExp | aSig ) == 0 ) {
5283  invalid:
5284                 float_raise(float_flag_invalid, status);
5285                 return floatx80_default_nan(status);
5286             }
5287             float_raise(float_flag_divbyzero, status);
5288             return packFloatx80(zSign, floatx80_infinity_high,
5289                                        floatx80_infinity_low);
5290         }
5291         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5292     }
5293     if ( aExp == 0 ) {
5294         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5295         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5296     }
5297     zExp = aExp - bExp + 0x3FFE;
5298     rem1 = 0;
5299     if ( bSig <= aSig ) {
5300         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5301         ++zExp;
5302     }
5303     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5304     mul64To128( bSig, zSig0, &term0, &term1 );
5305     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5306     while ( (int64_t) rem0 < 0 ) {
5307         --zSig0;
5308         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5309     }
5310     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5311     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5312         mul64To128( bSig, zSig1, &term1, &term2 );
5313         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5314         while ( (int64_t) rem1 < 0 ) {
5315             --zSig1;
5316             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5317         }
5318         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5319     }
5320     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5321                                 zSign, zExp, zSig0, zSig1, status);
5322 }
5323 
5324 /*----------------------------------------------------------------------------
5325 | Returns the remainder of the extended double-precision floating-point value
5326 | `a' with respect to the corresponding value `b'.  The operation is performed
5327 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5328 *----------------------------------------------------------------------------*/
5329 
5330 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5331 {
5332     flag aSign, zSign;
5333     int32_t aExp, bExp, expDiff;
5334     uint64_t aSig0, aSig1, bSig;
5335     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5336 
5337     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5338         float_raise(float_flag_invalid, status);
5339         return floatx80_default_nan(status);
5340     }
5341     aSig0 = extractFloatx80Frac( a );
5342     aExp = extractFloatx80Exp( a );
5343     aSign = extractFloatx80Sign( a );
5344     bSig = extractFloatx80Frac( b );
5345     bExp = extractFloatx80Exp( b );
5346     if ( aExp == 0x7FFF ) {
5347         if (    (uint64_t) ( aSig0<<1 )
5348              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5349             return propagateFloatx80NaN(a, b, status);
5350         }
5351         goto invalid;
5352     }
5353     if ( bExp == 0x7FFF ) {
5354         if ((uint64_t)(bSig << 1)) {
5355             return propagateFloatx80NaN(a, b, status);
5356         }
5357         return a;
5358     }
5359     if ( bExp == 0 ) {
5360         if ( bSig == 0 ) {
5361  invalid:
5362             float_raise(float_flag_invalid, status);
5363             return floatx80_default_nan(status);
5364         }
5365         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5366     }
5367     if ( aExp == 0 ) {
5368         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5369         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5370     }
5371     bSig |= LIT64( 0x8000000000000000 );
5372     zSign = aSign;
5373     expDiff = aExp - bExp;
5374     aSig1 = 0;
5375     if ( expDiff < 0 ) {
5376         if ( expDiff < -1 ) return a;
5377         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5378         expDiff = 0;
5379     }
5380     q = ( bSig <= aSig0 );
5381     if ( q ) aSig0 -= bSig;
5382     expDiff -= 64;
5383     while ( 0 < expDiff ) {
5384         q = estimateDiv128To64( aSig0, aSig1, bSig );
5385         q = ( 2 < q ) ? q - 2 : 0;
5386         mul64To128( bSig, q, &term0, &term1 );
5387         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5388         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5389         expDiff -= 62;
5390     }
5391     expDiff += 64;
5392     if ( 0 < expDiff ) {
5393         q = estimateDiv128To64( aSig0, aSig1, bSig );
5394         q = ( 2 < q ) ? q - 2 : 0;
5395         q >>= 64 - expDiff;
5396         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5397         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5398         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5399         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5400             ++q;
5401             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5402         }
5403     }
5404     else {
5405         term1 = 0;
5406         term0 = bSig;
5407     }
5408     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5409     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5410          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5411               && ( q & 1 ) )
5412        ) {
5413         aSig0 = alternateASig0;
5414         aSig1 = alternateASig1;
5415         zSign = ! zSign;
5416     }
5417     return
5418         normalizeRoundAndPackFloatx80(
5419             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5420 
5421 }
5422 
5423 /*----------------------------------------------------------------------------
5424 | Returns the square root of the extended double-precision floating-point
5425 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5426 | for Binary Floating-Point Arithmetic.
5427 *----------------------------------------------------------------------------*/
5428 
5429 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5430 {
5431     flag aSign;
5432     int32_t aExp, zExp;
5433     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5434     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5435 
5436     if (floatx80_invalid_encoding(a)) {
5437         float_raise(float_flag_invalid, status);
5438         return floatx80_default_nan(status);
5439     }
5440     aSig0 = extractFloatx80Frac( a );
5441     aExp = extractFloatx80Exp( a );
5442     aSign = extractFloatx80Sign( a );
5443     if ( aExp == 0x7FFF ) {
5444         if ((uint64_t)(aSig0 << 1)) {
5445             return propagateFloatx80NaN(a, a, status);
5446         }
5447         if ( ! aSign ) return a;
5448         goto invalid;
5449     }
5450     if ( aSign ) {
5451         if ( ( aExp | aSig0 ) == 0 ) return a;
5452  invalid:
5453         float_raise(float_flag_invalid, status);
5454         return floatx80_default_nan(status);
5455     }
5456     if ( aExp == 0 ) {
5457         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5458         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5459     }
5460     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5461     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5462     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5463     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5464     doubleZSig0 = zSig0<<1;
5465     mul64To128( zSig0, zSig0, &term0, &term1 );
5466     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5467     while ( (int64_t) rem0 < 0 ) {
5468         --zSig0;
5469         doubleZSig0 -= 2;
5470         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5471     }
5472     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5473     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5474         if ( zSig1 == 0 ) zSig1 = 1;
5475         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5476         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5477         mul64To128( zSig1, zSig1, &term2, &term3 );
5478         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5479         while ( (int64_t) rem1 < 0 ) {
5480             --zSig1;
5481             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5482             term3 |= 1;
5483             term2 |= doubleZSig0;
5484             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5485         }
5486         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5487     }
5488     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5489     zSig0 |= doubleZSig0;
5490     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5491                                 0, zExp, zSig0, zSig1, status);
5492 }
5493 
5494 /*----------------------------------------------------------------------------
5495 | Returns 1 if the extended double-precision floating-point value `a' is equal
5496 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5497 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5498 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5499 *----------------------------------------------------------------------------*/
5500 
5501 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5502 {
5503 
5504     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5505         || (extractFloatx80Exp(a) == 0x7FFF
5506             && (uint64_t) (extractFloatx80Frac(a) << 1))
5507         || (extractFloatx80Exp(b) == 0x7FFF
5508             && (uint64_t) (extractFloatx80Frac(b) << 1))
5509        ) {
5510         float_raise(float_flag_invalid, status);
5511         return 0;
5512     }
5513     return
5514            ( a.low == b.low )
5515         && (    ( a.high == b.high )
5516              || (    ( a.low == 0 )
5517                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5518            );
5519 
5520 }
5521 
5522 /*----------------------------------------------------------------------------
5523 | Returns 1 if the extended double-precision floating-point value `a' is
5524 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5525 | invalid exception is raised if either operand is a NaN.  The comparison is
5526 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5527 | Arithmetic.
5528 *----------------------------------------------------------------------------*/
5529 
5530 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5531 {
5532     flag aSign, bSign;
5533 
5534     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5535         || (extractFloatx80Exp(a) == 0x7FFF
5536             && (uint64_t) (extractFloatx80Frac(a) << 1))
5537         || (extractFloatx80Exp(b) == 0x7FFF
5538             && (uint64_t) (extractFloatx80Frac(b) << 1))
5539        ) {
5540         float_raise(float_flag_invalid, status);
5541         return 0;
5542     }
5543     aSign = extractFloatx80Sign( a );
5544     bSign = extractFloatx80Sign( b );
5545     if ( aSign != bSign ) {
5546         return
5547                aSign
5548             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5549                  == 0 );
5550     }
5551     return
5552           aSign ? le128( b.high, b.low, a.high, a.low )
5553         : le128( a.high, a.low, b.high, b.low );
5554 
5555 }
5556 
5557 /*----------------------------------------------------------------------------
5558 | Returns 1 if the extended double-precision floating-point value `a' is
5559 | less than the corresponding value `b', and 0 otherwise.  The invalid
5560 | exception is raised if either operand is a NaN.  The comparison is performed
5561 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5562 *----------------------------------------------------------------------------*/
5563 
5564 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5565 {
5566     flag aSign, bSign;
5567 
5568     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5569         || (extractFloatx80Exp(a) == 0x7FFF
5570             && (uint64_t) (extractFloatx80Frac(a) << 1))
5571         || (extractFloatx80Exp(b) == 0x7FFF
5572             && (uint64_t) (extractFloatx80Frac(b) << 1))
5573        ) {
5574         float_raise(float_flag_invalid, status);
5575         return 0;
5576     }
5577     aSign = extractFloatx80Sign( a );
5578     bSign = extractFloatx80Sign( b );
5579     if ( aSign != bSign ) {
5580         return
5581                aSign
5582             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5583                  != 0 );
5584     }
5585     return
5586           aSign ? lt128( b.high, b.low, a.high, a.low )
5587         : lt128( a.high, a.low, b.high, b.low );
5588 
5589 }
5590 
5591 /*----------------------------------------------------------------------------
5592 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5593 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5594 | either operand is a NaN.   The comparison is performed according to the
5595 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5596 *----------------------------------------------------------------------------*/
5597 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5598 {
5599     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5600         || (extractFloatx80Exp(a) == 0x7FFF
5601             && (uint64_t) (extractFloatx80Frac(a) << 1))
5602         || (extractFloatx80Exp(b) == 0x7FFF
5603             && (uint64_t) (extractFloatx80Frac(b) << 1))
5604        ) {
5605         float_raise(float_flag_invalid, status);
5606         return 1;
5607     }
5608     return 0;
5609 }
5610 
5611 /*----------------------------------------------------------------------------
5612 | Returns 1 if the extended double-precision floating-point value `a' is
5613 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5614 | cause an exception.  The comparison is performed according to the IEC/IEEE
5615 | Standard for Binary Floating-Point Arithmetic.
5616 *----------------------------------------------------------------------------*/
5617 
5618 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5619 {
5620 
5621     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5622         float_raise(float_flag_invalid, status);
5623         return 0;
5624     }
5625     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5626               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5627          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5628               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5629        ) {
5630         if (floatx80_is_signaling_nan(a, status)
5631          || floatx80_is_signaling_nan(b, status)) {
5632             float_raise(float_flag_invalid, status);
5633         }
5634         return 0;
5635     }
5636     return
5637            ( a.low == b.low )
5638         && (    ( a.high == b.high )
5639              || (    ( a.low == 0 )
5640                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5641            );
5642 
5643 }
5644 
5645 /*----------------------------------------------------------------------------
5646 | Returns 1 if the extended double-precision floating-point value `a' is less
5647 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5648 | do not cause an exception.  Otherwise, the comparison is performed according
5649 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5650 *----------------------------------------------------------------------------*/
5651 
5652 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5653 {
5654     flag aSign, bSign;
5655 
5656     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5657         float_raise(float_flag_invalid, status);
5658         return 0;
5659     }
5660     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5661               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5662          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5663               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5664        ) {
5665         if (floatx80_is_signaling_nan(a, status)
5666          || floatx80_is_signaling_nan(b, status)) {
5667             float_raise(float_flag_invalid, status);
5668         }
5669         return 0;
5670     }
5671     aSign = extractFloatx80Sign( a );
5672     bSign = extractFloatx80Sign( b );
5673     if ( aSign != bSign ) {
5674         return
5675                aSign
5676             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5677                  == 0 );
5678     }
5679     return
5680           aSign ? le128( b.high, b.low, a.high, a.low )
5681         : le128( a.high, a.low, b.high, b.low );
5682 
5683 }
5684 
5685 /*----------------------------------------------------------------------------
5686 | Returns 1 if the extended double-precision floating-point value `a' is less
5687 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5688 | an exception.  Otherwise, the comparison is performed according to the
5689 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5690 *----------------------------------------------------------------------------*/
5691 
5692 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5693 {
5694     flag aSign, bSign;
5695 
5696     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5697         float_raise(float_flag_invalid, status);
5698         return 0;
5699     }
5700     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5701               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5702          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5703               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5704        ) {
5705         if (floatx80_is_signaling_nan(a, status)
5706          || floatx80_is_signaling_nan(b, status)) {
5707             float_raise(float_flag_invalid, status);
5708         }
5709         return 0;
5710     }
5711     aSign = extractFloatx80Sign( a );
5712     bSign = extractFloatx80Sign( b );
5713     if ( aSign != bSign ) {
5714         return
5715                aSign
5716             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5717                  != 0 );
5718     }
5719     return
5720           aSign ? lt128( b.high, b.low, a.high, a.low )
5721         : lt128( a.high, a.low, b.high, b.low );
5722 
5723 }
5724 
5725 /*----------------------------------------------------------------------------
5726 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5727 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5728 | The comparison is performed according to the IEC/IEEE Standard for Binary
5729 | Floating-Point Arithmetic.
5730 *----------------------------------------------------------------------------*/
5731 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5732 {
5733     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5734         float_raise(float_flag_invalid, status);
5735         return 1;
5736     }
5737     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5738               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5739          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5740               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5741        ) {
5742         if (floatx80_is_signaling_nan(a, status)
5743          || floatx80_is_signaling_nan(b, status)) {
5744             float_raise(float_flag_invalid, status);
5745         }
5746         return 1;
5747     }
5748     return 0;
5749 }
5750 
5751 /*----------------------------------------------------------------------------
5752 | Returns the result of converting the quadruple-precision floating-point
5753 | value `a' to the 32-bit two's complement integer format.  The conversion
5754 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5755 | Arithmetic---which means in particular that the conversion is rounded
5756 | according to the current rounding mode.  If `a' is a NaN, the largest
5757 | positive integer is returned.  Otherwise, if the conversion overflows, the
5758 | largest integer with the same sign as `a' is returned.
5759 *----------------------------------------------------------------------------*/
5760 
5761 int32_t float128_to_int32(float128 a, float_status *status)
5762 {
5763     flag aSign;
5764     int32_t aExp, shiftCount;
5765     uint64_t aSig0, aSig1;
5766 
5767     aSig1 = extractFloat128Frac1( a );
5768     aSig0 = extractFloat128Frac0( a );
5769     aExp = extractFloat128Exp( a );
5770     aSign = extractFloat128Sign( a );
5771     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5772     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5773     aSig0 |= ( aSig1 != 0 );
5774     shiftCount = 0x4028 - aExp;
5775     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5776     return roundAndPackInt32(aSign, aSig0, status);
5777 
5778 }
5779 
5780 /*----------------------------------------------------------------------------
5781 | Returns the result of converting the quadruple-precision floating-point
5782 | value `a' to the 32-bit two's complement integer format.  The conversion
5783 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5784 | Arithmetic, except that the conversion is always rounded toward zero.  If
5785 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5786 | conversion overflows, the largest integer with the same sign as `a' is
5787 | returned.
5788 *----------------------------------------------------------------------------*/
5789 
5790 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5791 {
5792     flag aSign;
5793     int32_t aExp, shiftCount;
5794     uint64_t aSig0, aSig1, savedASig;
5795     int32_t z;
5796 
5797     aSig1 = extractFloat128Frac1( a );
5798     aSig0 = extractFloat128Frac0( a );
5799     aExp = extractFloat128Exp( a );
5800     aSign = extractFloat128Sign( a );
5801     aSig0 |= ( aSig1 != 0 );
5802     if ( 0x401E < aExp ) {
5803         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5804         goto invalid;
5805     }
5806     else if ( aExp < 0x3FFF ) {
5807         if (aExp || aSig0) {
5808             status->float_exception_flags |= float_flag_inexact;
5809         }
5810         return 0;
5811     }
5812     aSig0 |= LIT64( 0x0001000000000000 );
5813     shiftCount = 0x402F - aExp;
5814     savedASig = aSig0;
5815     aSig0 >>= shiftCount;
5816     z = aSig0;
5817     if ( aSign ) z = - z;
5818     if ( ( z < 0 ) ^ aSign ) {
5819  invalid:
5820         float_raise(float_flag_invalid, status);
5821         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5822     }
5823     if ( ( aSig0<<shiftCount ) != savedASig ) {
5824         status->float_exception_flags |= float_flag_inexact;
5825     }
5826     return z;
5827 
5828 }
5829 
5830 /*----------------------------------------------------------------------------
5831 | Returns the result of converting the quadruple-precision floating-point
5832 | value `a' to the 64-bit two's complement integer format.  The conversion
5833 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5834 | Arithmetic---which means in particular that the conversion is rounded
5835 | according to the current rounding mode.  If `a' is a NaN, the largest
5836 | positive integer is returned.  Otherwise, if the conversion overflows, the
5837 | largest integer with the same sign as `a' is returned.
5838 *----------------------------------------------------------------------------*/
5839 
5840 int64_t float128_to_int64(float128 a, float_status *status)
5841 {
5842     flag aSign;
5843     int32_t aExp, shiftCount;
5844     uint64_t aSig0, aSig1;
5845 
5846     aSig1 = extractFloat128Frac1( a );
5847     aSig0 = extractFloat128Frac0( a );
5848     aExp = extractFloat128Exp( a );
5849     aSign = extractFloat128Sign( a );
5850     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5851     shiftCount = 0x402F - aExp;
5852     if ( shiftCount <= 0 ) {
5853         if ( 0x403E < aExp ) {
5854             float_raise(float_flag_invalid, status);
5855             if (    ! aSign
5856                  || (    ( aExp == 0x7FFF )
5857                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5858                     )
5859                ) {
5860                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5861             }
5862             return (int64_t) LIT64( 0x8000000000000000 );
5863         }
5864         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5865     }
5866     else {
5867         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5868     }
5869     return roundAndPackInt64(aSign, aSig0, aSig1, status);
5870 
5871 }
5872 
5873 /*----------------------------------------------------------------------------
5874 | Returns the result of converting the quadruple-precision floating-point
5875 | value `a' to the 64-bit two's complement integer format.  The conversion
5876 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5877 | Arithmetic, except that the conversion is always rounded toward zero.
5878 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5879 | the conversion overflows, the largest integer with the same sign as `a' is
5880 | returned.
5881 *----------------------------------------------------------------------------*/
5882 
5883 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5884 {
5885     flag aSign;
5886     int32_t aExp, shiftCount;
5887     uint64_t aSig0, aSig1;
5888     int64_t z;
5889 
5890     aSig1 = extractFloat128Frac1( a );
5891     aSig0 = extractFloat128Frac0( a );
5892     aExp = extractFloat128Exp( a );
5893     aSign = extractFloat128Sign( a );
5894     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5895     shiftCount = aExp - 0x402F;
5896     if ( 0 < shiftCount ) {
5897         if ( 0x403E <= aExp ) {
5898             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5899             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5900                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5901                 if (aSig1) {
5902                     status->float_exception_flags |= float_flag_inexact;
5903                 }
5904             }
5905             else {
5906                 float_raise(float_flag_invalid, status);
5907                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5908                     return LIT64( 0x7FFFFFFFFFFFFFFF );
5909                 }
5910             }
5911             return (int64_t) LIT64( 0x8000000000000000 );
5912         }
5913         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5914         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5915             status->float_exception_flags |= float_flag_inexact;
5916         }
5917     }
5918     else {
5919         if ( aExp < 0x3FFF ) {
5920             if ( aExp | aSig0 | aSig1 ) {
5921                 status->float_exception_flags |= float_flag_inexact;
5922             }
5923             return 0;
5924         }
5925         z = aSig0>>( - shiftCount );
5926         if (    aSig1
5927              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5928             status->float_exception_flags |= float_flag_inexact;
5929         }
5930     }
5931     if ( aSign ) z = - z;
5932     return z;
5933 
5934 }
5935 
5936 /*----------------------------------------------------------------------------
5937 | Returns the result of converting the quadruple-precision floating-point value
5938 | `a' to the 64-bit unsigned integer format.  The conversion is
5939 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5940 | Arithmetic---which means in particular that the conversion is rounded
5941 | according to the current rounding mode.  If `a' is a NaN, the largest
5942 | positive integer is returned.  If the conversion overflows, the
5943 | largest unsigned integer is returned.  If 'a' is negative, the value is
5944 | rounded and zero is returned; negative values that do not round to zero
5945 | will raise the inexact exception.
5946 *----------------------------------------------------------------------------*/
5947 
5948 uint64_t float128_to_uint64(float128 a, float_status *status)
5949 {
5950     flag aSign;
5951     int aExp;
5952     int shiftCount;
5953     uint64_t aSig0, aSig1;
5954 
5955     aSig0 = extractFloat128Frac0(a);
5956     aSig1 = extractFloat128Frac1(a);
5957     aExp = extractFloat128Exp(a);
5958     aSign = extractFloat128Sign(a);
5959     if (aSign && (aExp > 0x3FFE)) {
5960         float_raise(float_flag_invalid, status);
5961         if (float128_is_any_nan(a)) {
5962             return LIT64(0xFFFFFFFFFFFFFFFF);
5963         } else {
5964             return 0;
5965         }
5966     }
5967     if (aExp) {
5968         aSig0 |= LIT64(0x0001000000000000);
5969     }
5970     shiftCount = 0x402F - aExp;
5971     if (shiftCount <= 0) {
5972         if (0x403E < aExp) {
5973             float_raise(float_flag_invalid, status);
5974             return LIT64(0xFFFFFFFFFFFFFFFF);
5975         }
5976         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5977     } else {
5978         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5979     }
5980     return roundAndPackUint64(aSign, aSig0, aSig1, status);
5981 }
5982 
5983 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5984 {
5985     uint64_t v;
5986     signed char current_rounding_mode = status->float_rounding_mode;
5987 
5988     set_float_rounding_mode(float_round_to_zero, status);
5989     v = float128_to_uint64(a, status);
5990     set_float_rounding_mode(current_rounding_mode, status);
5991 
5992     return v;
5993 }
5994 
5995 /*----------------------------------------------------------------------------
5996 | Returns the result of converting the quadruple-precision floating-point
5997 | value `a' to the 32-bit unsigned integer format.  The conversion
5998 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5999 | Arithmetic except that the conversion is always rounded toward zero.
6000 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6001 | if the conversion overflows, the largest unsigned integer is returned.
6002 | If 'a' is negative, the value is rounded and zero is returned; negative
6003 | values that do not round to zero will raise the inexact exception.
6004 *----------------------------------------------------------------------------*/
6005 
6006 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6007 {
6008     uint64_t v;
6009     uint32_t res;
6010     int old_exc_flags = get_float_exception_flags(status);
6011 
6012     v = float128_to_uint64_round_to_zero(a, status);
6013     if (v > 0xffffffff) {
6014         res = 0xffffffff;
6015     } else {
6016         return v;
6017     }
6018     set_float_exception_flags(old_exc_flags, status);
6019     float_raise(float_flag_invalid, status);
6020     return res;
6021 }
6022 
6023 /*----------------------------------------------------------------------------
6024 | Returns the result of converting the quadruple-precision floating-point
6025 | value `a' to the single-precision floating-point format.  The conversion
6026 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6027 | Arithmetic.
6028 *----------------------------------------------------------------------------*/
6029 
6030 float32 float128_to_float32(float128 a, float_status *status)
6031 {
6032     flag aSign;
6033     int32_t aExp;
6034     uint64_t aSig0, aSig1;
6035     uint32_t zSig;
6036 
6037     aSig1 = extractFloat128Frac1( a );
6038     aSig0 = extractFloat128Frac0( a );
6039     aExp = extractFloat128Exp( a );
6040     aSign = extractFloat128Sign( a );
6041     if ( aExp == 0x7FFF ) {
6042         if ( aSig0 | aSig1 ) {
6043             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6044         }
6045         return packFloat32( aSign, 0xFF, 0 );
6046     }
6047     aSig0 |= ( aSig1 != 0 );
6048     shift64RightJamming( aSig0, 18, &aSig0 );
6049     zSig = aSig0;
6050     if ( aExp || zSig ) {
6051         zSig |= 0x40000000;
6052         aExp -= 0x3F81;
6053     }
6054     return roundAndPackFloat32(aSign, aExp, zSig, status);
6055 
6056 }
6057 
6058 /*----------------------------------------------------------------------------
6059 | Returns the result of converting the quadruple-precision floating-point
6060 | value `a' to the double-precision floating-point format.  The conversion
6061 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6062 | Arithmetic.
6063 *----------------------------------------------------------------------------*/
6064 
6065 float64 float128_to_float64(float128 a, float_status *status)
6066 {
6067     flag aSign;
6068     int32_t aExp;
6069     uint64_t aSig0, aSig1;
6070 
6071     aSig1 = extractFloat128Frac1( a );
6072     aSig0 = extractFloat128Frac0( a );
6073     aExp = extractFloat128Exp( a );
6074     aSign = extractFloat128Sign( a );
6075     if ( aExp == 0x7FFF ) {
6076         if ( aSig0 | aSig1 ) {
6077             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6078         }
6079         return packFloat64( aSign, 0x7FF, 0 );
6080     }
6081     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6082     aSig0 |= ( aSig1 != 0 );
6083     if ( aExp || aSig0 ) {
6084         aSig0 |= LIT64( 0x4000000000000000 );
6085         aExp -= 0x3C01;
6086     }
6087     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6088 
6089 }
6090 
6091 /*----------------------------------------------------------------------------
6092 | Returns the result of converting the quadruple-precision floating-point
6093 | value `a' to the extended double-precision floating-point format.  The
6094 | conversion is performed according to the IEC/IEEE Standard for Binary
6095 | Floating-Point Arithmetic.
6096 *----------------------------------------------------------------------------*/
6097 
6098 floatx80 float128_to_floatx80(float128 a, float_status *status)
6099 {
6100     flag aSign;
6101     int32_t aExp;
6102     uint64_t aSig0, aSig1;
6103 
6104     aSig1 = extractFloat128Frac1( a );
6105     aSig0 = extractFloat128Frac0( a );
6106     aExp = extractFloat128Exp( a );
6107     aSign = extractFloat128Sign( a );
6108     if ( aExp == 0x7FFF ) {
6109         if ( aSig0 | aSig1 ) {
6110             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6111         }
6112         return packFloatx80(aSign, floatx80_infinity_high,
6113                                    floatx80_infinity_low);
6114     }
6115     if ( aExp == 0 ) {
6116         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6117         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6118     }
6119     else {
6120         aSig0 |= LIT64( 0x0001000000000000 );
6121     }
6122     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6123     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6124 
6125 }
6126 
6127 /*----------------------------------------------------------------------------
6128 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6129 | returns the result as a quadruple-precision floating-point value.  The
6130 | operation is performed according to the IEC/IEEE Standard for Binary
6131 | Floating-Point Arithmetic.
6132 *----------------------------------------------------------------------------*/
6133 
6134 float128 float128_round_to_int(float128 a, float_status *status)
6135 {
6136     flag aSign;
6137     int32_t aExp;
6138     uint64_t lastBitMask, roundBitsMask;
6139     float128 z;
6140 
6141     aExp = extractFloat128Exp( a );
6142     if ( 0x402F <= aExp ) {
6143         if ( 0x406F <= aExp ) {
6144             if (    ( aExp == 0x7FFF )
6145                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6146                ) {
6147                 return propagateFloat128NaN(a, a, status);
6148             }
6149             return a;
6150         }
6151         lastBitMask = 1;
6152         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6153         roundBitsMask = lastBitMask - 1;
6154         z = a;
6155         switch (status->float_rounding_mode) {
6156         case float_round_nearest_even:
6157             if ( lastBitMask ) {
6158                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6159                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6160             }
6161             else {
6162                 if ( (int64_t) z.low < 0 ) {
6163                     ++z.high;
6164                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6165                 }
6166             }
6167             break;
6168         case float_round_ties_away:
6169             if (lastBitMask) {
6170                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6171             } else {
6172                 if ((int64_t) z.low < 0) {
6173                     ++z.high;
6174                 }
6175             }
6176             break;
6177         case float_round_to_zero:
6178             break;
6179         case float_round_up:
6180             if (!extractFloat128Sign(z)) {
6181                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6182             }
6183             break;
6184         case float_round_down:
6185             if (extractFloat128Sign(z)) {
6186                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6187             }
6188             break;
6189         default:
6190             abort();
6191         }
6192         z.low &= ~ roundBitsMask;
6193     }
6194     else {
6195         if ( aExp < 0x3FFF ) {
6196             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6197             status->float_exception_flags |= float_flag_inexact;
6198             aSign = extractFloat128Sign( a );
6199             switch (status->float_rounding_mode) {
6200              case float_round_nearest_even:
6201                 if (    ( aExp == 0x3FFE )
6202                      && (   extractFloat128Frac0( a )
6203                           | extractFloat128Frac1( a ) )
6204                    ) {
6205                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6206                 }
6207                 break;
6208             case float_round_ties_away:
6209                 if (aExp == 0x3FFE) {
6210                     return packFloat128(aSign, 0x3FFF, 0, 0);
6211                 }
6212                 break;
6213              case float_round_down:
6214                 return
6215                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6216                     : packFloat128( 0, 0, 0, 0 );
6217              case float_round_up:
6218                 return
6219                       aSign ? packFloat128( 1, 0, 0, 0 )
6220                     : packFloat128( 0, 0x3FFF, 0, 0 );
6221             }
6222             return packFloat128( aSign, 0, 0, 0 );
6223         }
6224         lastBitMask = 1;
6225         lastBitMask <<= 0x402F - aExp;
6226         roundBitsMask = lastBitMask - 1;
6227         z.low = 0;
6228         z.high = a.high;
6229         switch (status->float_rounding_mode) {
6230         case float_round_nearest_even:
6231             z.high += lastBitMask>>1;
6232             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6233                 z.high &= ~ lastBitMask;
6234             }
6235             break;
6236         case float_round_ties_away:
6237             z.high += lastBitMask>>1;
6238             break;
6239         case float_round_to_zero:
6240             break;
6241         case float_round_up:
6242             if (!extractFloat128Sign(z)) {
6243                 z.high |= ( a.low != 0 );
6244                 z.high += roundBitsMask;
6245             }
6246             break;
6247         case float_round_down:
6248             if (extractFloat128Sign(z)) {
6249                 z.high |= (a.low != 0);
6250                 z.high += roundBitsMask;
6251             }
6252             break;
6253         default:
6254             abort();
6255         }
6256         z.high &= ~ roundBitsMask;
6257     }
6258     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6259         status->float_exception_flags |= float_flag_inexact;
6260     }
6261     return z;
6262 
6263 }
6264 
6265 /*----------------------------------------------------------------------------
6266 | Returns the result of adding the absolute values of the quadruple-precision
6267 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6268 | before being returned.  `zSign' is ignored if the result is a NaN.
6269 | The addition is performed according to the IEC/IEEE Standard for Binary
6270 | Floating-Point Arithmetic.
6271 *----------------------------------------------------------------------------*/
6272 
6273 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6274                                 float_status *status)
6275 {
6276     int32_t aExp, bExp, zExp;
6277     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6278     int32_t expDiff;
6279 
6280     aSig1 = extractFloat128Frac1( a );
6281     aSig0 = extractFloat128Frac0( a );
6282     aExp = extractFloat128Exp( a );
6283     bSig1 = extractFloat128Frac1( b );
6284     bSig0 = extractFloat128Frac0( b );
6285     bExp = extractFloat128Exp( b );
6286     expDiff = aExp - bExp;
6287     if ( 0 < expDiff ) {
6288         if ( aExp == 0x7FFF ) {
6289             if (aSig0 | aSig1) {
6290                 return propagateFloat128NaN(a, b, status);
6291             }
6292             return a;
6293         }
6294         if ( bExp == 0 ) {
6295             --expDiff;
6296         }
6297         else {
6298             bSig0 |= LIT64( 0x0001000000000000 );
6299         }
6300         shift128ExtraRightJamming(
6301             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6302         zExp = aExp;
6303     }
6304     else if ( expDiff < 0 ) {
6305         if ( bExp == 0x7FFF ) {
6306             if (bSig0 | bSig1) {
6307                 return propagateFloat128NaN(a, b, status);
6308             }
6309             return packFloat128( zSign, 0x7FFF, 0, 0 );
6310         }
6311         if ( aExp == 0 ) {
6312             ++expDiff;
6313         }
6314         else {
6315             aSig0 |= LIT64( 0x0001000000000000 );
6316         }
6317         shift128ExtraRightJamming(
6318             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6319         zExp = bExp;
6320     }
6321     else {
6322         if ( aExp == 0x7FFF ) {
6323             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6324                 return propagateFloat128NaN(a, b, status);
6325             }
6326             return a;
6327         }
6328         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6329         if ( aExp == 0 ) {
6330             if (status->flush_to_zero) {
6331                 if (zSig0 | zSig1) {
6332                     float_raise(float_flag_output_denormal, status);
6333                 }
6334                 return packFloat128(zSign, 0, 0, 0);
6335             }
6336             return packFloat128( zSign, 0, zSig0, zSig1 );
6337         }
6338         zSig2 = 0;
6339         zSig0 |= LIT64( 0x0002000000000000 );
6340         zExp = aExp;
6341         goto shiftRight1;
6342     }
6343     aSig0 |= LIT64( 0x0001000000000000 );
6344     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6345     --zExp;
6346     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6347     ++zExp;
6348  shiftRight1:
6349     shift128ExtraRightJamming(
6350         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6351  roundAndPack:
6352     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6353 
6354 }
6355 
6356 /*----------------------------------------------------------------------------
6357 | Returns the result of subtracting the absolute values of the quadruple-
6358 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6359 | difference is negated before being returned.  `zSign' is ignored if the
6360 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6361 | Standard for Binary Floating-Point Arithmetic.
6362 *----------------------------------------------------------------------------*/
6363 
6364 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6365                                 float_status *status)
6366 {
6367     int32_t aExp, bExp, zExp;
6368     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6369     int32_t expDiff;
6370 
6371     aSig1 = extractFloat128Frac1( a );
6372     aSig0 = extractFloat128Frac0( a );
6373     aExp = extractFloat128Exp( a );
6374     bSig1 = extractFloat128Frac1( b );
6375     bSig0 = extractFloat128Frac0( b );
6376     bExp = extractFloat128Exp( b );
6377     expDiff = aExp - bExp;
6378     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6379     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6380     if ( 0 < expDiff ) goto aExpBigger;
6381     if ( expDiff < 0 ) goto bExpBigger;
6382     if ( aExp == 0x7FFF ) {
6383         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6384             return propagateFloat128NaN(a, b, status);
6385         }
6386         float_raise(float_flag_invalid, status);
6387         return float128_default_nan(status);
6388     }
6389     if ( aExp == 0 ) {
6390         aExp = 1;
6391         bExp = 1;
6392     }
6393     if ( bSig0 < aSig0 ) goto aBigger;
6394     if ( aSig0 < bSig0 ) goto bBigger;
6395     if ( bSig1 < aSig1 ) goto aBigger;
6396     if ( aSig1 < bSig1 ) goto bBigger;
6397     return packFloat128(status->float_rounding_mode == float_round_down,
6398                         0, 0, 0);
6399  bExpBigger:
6400     if ( bExp == 0x7FFF ) {
6401         if (bSig0 | bSig1) {
6402             return propagateFloat128NaN(a, b, status);
6403         }
6404         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6405     }
6406     if ( aExp == 0 ) {
6407         ++expDiff;
6408     }
6409     else {
6410         aSig0 |= LIT64( 0x4000000000000000 );
6411     }
6412     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6413     bSig0 |= LIT64( 0x4000000000000000 );
6414  bBigger:
6415     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6416     zExp = bExp;
6417     zSign ^= 1;
6418     goto normalizeRoundAndPack;
6419  aExpBigger:
6420     if ( aExp == 0x7FFF ) {
6421         if (aSig0 | aSig1) {
6422             return propagateFloat128NaN(a, b, status);
6423         }
6424         return a;
6425     }
6426     if ( bExp == 0 ) {
6427         --expDiff;
6428     }
6429     else {
6430         bSig0 |= LIT64( 0x4000000000000000 );
6431     }
6432     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6433     aSig0 |= LIT64( 0x4000000000000000 );
6434  aBigger:
6435     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6436     zExp = aExp;
6437  normalizeRoundAndPack:
6438     --zExp;
6439     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6440                                          status);
6441 
6442 }
6443 
6444 /*----------------------------------------------------------------------------
6445 | Returns the result of adding the quadruple-precision floating-point values
6446 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6447 | for Binary Floating-Point Arithmetic.
6448 *----------------------------------------------------------------------------*/
6449 
6450 float128 float128_add(float128 a, float128 b, float_status *status)
6451 {
6452     flag aSign, bSign;
6453 
6454     aSign = extractFloat128Sign( a );
6455     bSign = extractFloat128Sign( b );
6456     if ( aSign == bSign ) {
6457         return addFloat128Sigs(a, b, aSign, status);
6458     }
6459     else {
6460         return subFloat128Sigs(a, b, aSign, status);
6461     }
6462 
6463 }
6464 
6465 /*----------------------------------------------------------------------------
6466 | Returns the result of subtracting the quadruple-precision floating-point
6467 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6468 | Standard for Binary Floating-Point Arithmetic.
6469 *----------------------------------------------------------------------------*/
6470 
6471 float128 float128_sub(float128 a, float128 b, float_status *status)
6472 {
6473     flag aSign, bSign;
6474 
6475     aSign = extractFloat128Sign( a );
6476     bSign = extractFloat128Sign( b );
6477     if ( aSign == bSign ) {
6478         return subFloat128Sigs(a, b, aSign, status);
6479     }
6480     else {
6481         return addFloat128Sigs(a, b, aSign, status);
6482     }
6483 
6484 }
6485 
6486 /*----------------------------------------------------------------------------
6487 | Returns the result of multiplying the quadruple-precision floating-point
6488 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6489 | Standard for Binary Floating-Point Arithmetic.
6490 *----------------------------------------------------------------------------*/
6491 
6492 float128 float128_mul(float128 a, float128 b, float_status *status)
6493 {
6494     flag aSign, bSign, zSign;
6495     int32_t aExp, bExp, zExp;
6496     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6497 
6498     aSig1 = extractFloat128Frac1( a );
6499     aSig0 = extractFloat128Frac0( a );
6500     aExp = extractFloat128Exp( a );
6501     aSign = extractFloat128Sign( a );
6502     bSig1 = extractFloat128Frac1( b );
6503     bSig0 = extractFloat128Frac0( b );
6504     bExp = extractFloat128Exp( b );
6505     bSign = extractFloat128Sign( b );
6506     zSign = aSign ^ bSign;
6507     if ( aExp == 0x7FFF ) {
6508         if (    ( aSig0 | aSig1 )
6509              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6510             return propagateFloat128NaN(a, b, status);
6511         }
6512         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6513         return packFloat128( zSign, 0x7FFF, 0, 0 );
6514     }
6515     if ( bExp == 0x7FFF ) {
6516         if (bSig0 | bSig1) {
6517             return propagateFloat128NaN(a, b, status);
6518         }
6519         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6520  invalid:
6521             float_raise(float_flag_invalid, status);
6522             return float128_default_nan(status);
6523         }
6524         return packFloat128( zSign, 0x7FFF, 0, 0 );
6525     }
6526     if ( aExp == 0 ) {
6527         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6528         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6529     }
6530     if ( bExp == 0 ) {
6531         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6532         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6533     }
6534     zExp = aExp + bExp - 0x4000;
6535     aSig0 |= LIT64( 0x0001000000000000 );
6536     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6537     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6538     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6539     zSig2 |= ( zSig3 != 0 );
6540     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6541         shift128ExtraRightJamming(
6542             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6543         ++zExp;
6544     }
6545     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6546 
6547 }
6548 
6549 /*----------------------------------------------------------------------------
6550 | Returns the result of dividing the quadruple-precision floating-point value
6551 | `a' by the corresponding value `b'.  The operation is performed according to
6552 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6553 *----------------------------------------------------------------------------*/
6554 
6555 float128 float128_div(float128 a, float128 b, float_status *status)
6556 {
6557     flag aSign, bSign, zSign;
6558     int32_t aExp, bExp, zExp;
6559     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6560     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6561 
6562     aSig1 = extractFloat128Frac1( a );
6563     aSig0 = extractFloat128Frac0( a );
6564     aExp = extractFloat128Exp( a );
6565     aSign = extractFloat128Sign( a );
6566     bSig1 = extractFloat128Frac1( b );
6567     bSig0 = extractFloat128Frac0( b );
6568     bExp = extractFloat128Exp( b );
6569     bSign = extractFloat128Sign( b );
6570     zSign = aSign ^ bSign;
6571     if ( aExp == 0x7FFF ) {
6572         if (aSig0 | aSig1) {
6573             return propagateFloat128NaN(a, b, status);
6574         }
6575         if ( bExp == 0x7FFF ) {
6576             if (bSig0 | bSig1) {
6577                 return propagateFloat128NaN(a, b, status);
6578             }
6579             goto invalid;
6580         }
6581         return packFloat128( zSign, 0x7FFF, 0, 0 );
6582     }
6583     if ( bExp == 0x7FFF ) {
6584         if (bSig0 | bSig1) {
6585             return propagateFloat128NaN(a, b, status);
6586         }
6587         return packFloat128( zSign, 0, 0, 0 );
6588     }
6589     if ( bExp == 0 ) {
6590         if ( ( bSig0 | bSig1 ) == 0 ) {
6591             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6592  invalid:
6593                 float_raise(float_flag_invalid, status);
6594                 return float128_default_nan(status);
6595             }
6596             float_raise(float_flag_divbyzero, status);
6597             return packFloat128( zSign, 0x7FFF, 0, 0 );
6598         }
6599         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6600     }
6601     if ( aExp == 0 ) {
6602         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6603         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6604     }
6605     zExp = aExp - bExp + 0x3FFD;
6606     shortShift128Left(
6607         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6608     shortShift128Left(
6609         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6610     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6611         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6612         ++zExp;
6613     }
6614     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6615     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6616     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6617     while ( (int64_t) rem0 < 0 ) {
6618         --zSig0;
6619         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6620     }
6621     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6622     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6623         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6624         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6625         while ( (int64_t) rem1 < 0 ) {
6626             --zSig1;
6627             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6628         }
6629         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6630     }
6631     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6632     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6633 
6634 }
6635 
6636 /*----------------------------------------------------------------------------
6637 | Returns the remainder of the quadruple-precision floating-point value `a'
6638 | with respect to the corresponding value `b'.  The operation is performed
6639 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6640 *----------------------------------------------------------------------------*/
6641 
6642 float128 float128_rem(float128 a, float128 b, float_status *status)
6643 {
6644     flag aSign, zSign;
6645     int32_t aExp, bExp, expDiff;
6646     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6647     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6648     int64_t sigMean0;
6649 
6650     aSig1 = extractFloat128Frac1( a );
6651     aSig0 = extractFloat128Frac0( a );
6652     aExp = extractFloat128Exp( a );
6653     aSign = extractFloat128Sign( a );
6654     bSig1 = extractFloat128Frac1( b );
6655     bSig0 = extractFloat128Frac0( b );
6656     bExp = extractFloat128Exp( b );
6657     if ( aExp == 0x7FFF ) {
6658         if (    ( aSig0 | aSig1 )
6659              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6660             return propagateFloat128NaN(a, b, status);
6661         }
6662         goto invalid;
6663     }
6664     if ( bExp == 0x7FFF ) {
6665         if (bSig0 | bSig1) {
6666             return propagateFloat128NaN(a, b, status);
6667         }
6668         return a;
6669     }
6670     if ( bExp == 0 ) {
6671         if ( ( bSig0 | bSig1 ) == 0 ) {
6672  invalid:
6673             float_raise(float_flag_invalid, status);
6674             return float128_default_nan(status);
6675         }
6676         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6677     }
6678     if ( aExp == 0 ) {
6679         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6680         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6681     }
6682     expDiff = aExp - bExp;
6683     if ( expDiff < -1 ) return a;
6684     shortShift128Left(
6685         aSig0 | LIT64( 0x0001000000000000 ),
6686         aSig1,
6687         15 - ( expDiff < 0 ),
6688         &aSig0,
6689         &aSig1
6690     );
6691     shortShift128Left(
6692         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6693     q = le128( bSig0, bSig1, aSig0, aSig1 );
6694     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6695     expDiff -= 64;
6696     while ( 0 < expDiff ) {
6697         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6698         q = ( 4 < q ) ? q - 4 : 0;
6699         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6700         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6701         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6702         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6703         expDiff -= 61;
6704     }
6705     if ( -64 < expDiff ) {
6706         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6707         q = ( 4 < q ) ? q - 4 : 0;
6708         q >>= - expDiff;
6709         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6710         expDiff += 52;
6711         if ( expDiff < 0 ) {
6712             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6713         }
6714         else {
6715             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6716         }
6717         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6718         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6719     }
6720     else {
6721         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6722         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6723     }
6724     do {
6725         alternateASig0 = aSig0;
6726         alternateASig1 = aSig1;
6727         ++q;
6728         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6729     } while ( 0 <= (int64_t) aSig0 );
6730     add128(
6731         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6732     if (    ( sigMean0 < 0 )
6733          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6734         aSig0 = alternateASig0;
6735         aSig1 = alternateASig1;
6736     }
6737     zSign = ( (int64_t) aSig0 < 0 );
6738     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6739     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6740                                          status);
6741 }
6742 
6743 /*----------------------------------------------------------------------------
6744 | Returns the square root of the quadruple-precision floating-point value `a'.
6745 | The operation is performed according to the IEC/IEEE Standard for Binary
6746 | Floating-Point Arithmetic.
6747 *----------------------------------------------------------------------------*/
6748 
6749 float128 float128_sqrt(float128 a, float_status *status)
6750 {
6751     flag aSign;
6752     int32_t aExp, zExp;
6753     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6754     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6755 
6756     aSig1 = extractFloat128Frac1( a );
6757     aSig0 = extractFloat128Frac0( a );
6758     aExp = extractFloat128Exp( a );
6759     aSign = extractFloat128Sign( a );
6760     if ( aExp == 0x7FFF ) {
6761         if (aSig0 | aSig1) {
6762             return propagateFloat128NaN(a, a, status);
6763         }
6764         if ( ! aSign ) return a;
6765         goto invalid;
6766     }
6767     if ( aSign ) {
6768         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6769  invalid:
6770         float_raise(float_flag_invalid, status);
6771         return float128_default_nan(status);
6772     }
6773     if ( aExp == 0 ) {
6774         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6775         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6776     }
6777     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6778     aSig0 |= LIT64( 0x0001000000000000 );
6779     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6780     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6781     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6782     doubleZSig0 = zSig0<<1;
6783     mul64To128( zSig0, zSig0, &term0, &term1 );
6784     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6785     while ( (int64_t) rem0 < 0 ) {
6786         --zSig0;
6787         doubleZSig0 -= 2;
6788         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6789     }
6790     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6791     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6792         if ( zSig1 == 0 ) zSig1 = 1;
6793         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6794         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6795         mul64To128( zSig1, zSig1, &term2, &term3 );
6796         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6797         while ( (int64_t) rem1 < 0 ) {
6798             --zSig1;
6799             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6800             term3 |= 1;
6801             term2 |= doubleZSig0;
6802             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6803         }
6804         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6805     }
6806     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6807     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6808 
6809 }
6810 
6811 /*----------------------------------------------------------------------------
6812 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6813 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6814 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6815 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6816 *----------------------------------------------------------------------------*/
6817 
6818 int float128_eq(float128 a, float128 b, float_status *status)
6819 {
6820 
6821     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6822               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6823          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6824               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6825        ) {
6826         float_raise(float_flag_invalid, status);
6827         return 0;
6828     }
6829     return
6830            ( a.low == b.low )
6831         && (    ( a.high == b.high )
6832              || (    ( a.low == 0 )
6833                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6834            );
6835 
6836 }
6837 
6838 /*----------------------------------------------------------------------------
6839 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6840 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6841 | exception is raised if either operand is a NaN.  The comparison is performed
6842 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6843 *----------------------------------------------------------------------------*/
6844 
6845 int float128_le(float128 a, float128 b, float_status *status)
6846 {
6847     flag aSign, bSign;
6848 
6849     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6850               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6851          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6852               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6853        ) {
6854         float_raise(float_flag_invalid, status);
6855         return 0;
6856     }
6857     aSign = extractFloat128Sign( a );
6858     bSign = extractFloat128Sign( b );
6859     if ( aSign != bSign ) {
6860         return
6861                aSign
6862             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6863                  == 0 );
6864     }
6865     return
6866           aSign ? le128( b.high, b.low, a.high, a.low )
6867         : le128( a.high, a.low, b.high, b.low );
6868 
6869 }
6870 
6871 /*----------------------------------------------------------------------------
6872 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6873 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6874 | raised if either operand is a NaN.  The comparison is performed according
6875 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6876 *----------------------------------------------------------------------------*/
6877 
6878 int float128_lt(float128 a, float128 b, float_status *status)
6879 {
6880     flag aSign, bSign;
6881 
6882     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6883               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6884          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6885               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6886        ) {
6887         float_raise(float_flag_invalid, status);
6888         return 0;
6889     }
6890     aSign = extractFloat128Sign( a );
6891     bSign = extractFloat128Sign( b );
6892     if ( aSign != bSign ) {
6893         return
6894                aSign
6895             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6896                  != 0 );
6897     }
6898     return
6899           aSign ? lt128( b.high, b.low, a.high, a.low )
6900         : lt128( a.high, a.low, b.high, b.low );
6901 
6902 }
6903 
6904 /*----------------------------------------------------------------------------
6905 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6906 | be compared, and 0 otherwise.  The invalid exception is raised if either
6907 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6908 | Standard for Binary Floating-Point Arithmetic.
6909 *----------------------------------------------------------------------------*/
6910 
6911 int float128_unordered(float128 a, float128 b, float_status *status)
6912 {
6913     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6914               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6915          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6916               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6917        ) {
6918         float_raise(float_flag_invalid, status);
6919         return 1;
6920     }
6921     return 0;
6922 }
6923 
6924 /*----------------------------------------------------------------------------
6925 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6926 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6927 | exception.  The comparison is performed according to the IEC/IEEE Standard
6928 | for Binary Floating-Point Arithmetic.
6929 *----------------------------------------------------------------------------*/
6930 
6931 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6932 {
6933 
6934     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6935               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6936          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6937               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6938        ) {
6939         if (float128_is_signaling_nan(a, status)
6940          || float128_is_signaling_nan(b, status)) {
6941             float_raise(float_flag_invalid, status);
6942         }
6943         return 0;
6944     }
6945     return
6946            ( a.low == b.low )
6947         && (    ( a.high == b.high )
6948              || (    ( a.low == 0 )
6949                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6950            );
6951 
6952 }
6953 
6954 /*----------------------------------------------------------------------------
6955 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6956 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6957 | cause an exception.  Otherwise, the comparison is performed according to the
6958 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6959 *----------------------------------------------------------------------------*/
6960 
6961 int float128_le_quiet(float128 a, float128 b, float_status *status)
6962 {
6963     flag aSign, bSign;
6964 
6965     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6966               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6967          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6968               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6969        ) {
6970         if (float128_is_signaling_nan(a, status)
6971          || float128_is_signaling_nan(b, status)) {
6972             float_raise(float_flag_invalid, status);
6973         }
6974         return 0;
6975     }
6976     aSign = extractFloat128Sign( a );
6977     bSign = extractFloat128Sign( b );
6978     if ( aSign != bSign ) {
6979         return
6980                aSign
6981             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6982                  == 0 );
6983     }
6984     return
6985           aSign ? le128( b.high, b.low, a.high, a.low )
6986         : le128( a.high, a.low, b.high, b.low );
6987 
6988 }
6989 
6990 /*----------------------------------------------------------------------------
6991 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6992 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6993 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6994 | Standard for Binary Floating-Point Arithmetic.
6995 *----------------------------------------------------------------------------*/
6996 
6997 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6998 {
6999     flag aSign, bSign;
7000 
7001     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7002               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7003          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7004               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7005        ) {
7006         if (float128_is_signaling_nan(a, status)
7007          || float128_is_signaling_nan(b, status)) {
7008             float_raise(float_flag_invalid, status);
7009         }
7010         return 0;
7011     }
7012     aSign = extractFloat128Sign( a );
7013     bSign = extractFloat128Sign( b );
7014     if ( aSign != bSign ) {
7015         return
7016                aSign
7017             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7018                  != 0 );
7019     }
7020     return
7021           aSign ? lt128( b.high, b.low, a.high, a.low )
7022         : lt128( a.high, a.low, b.high, b.low );
7023 
7024 }
7025 
7026 /*----------------------------------------------------------------------------
7027 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7028 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7029 | comparison is performed according to the IEC/IEEE Standard for Binary
7030 | Floating-Point Arithmetic.
7031 *----------------------------------------------------------------------------*/
7032 
7033 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7034 {
7035     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7036               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7037          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7038               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7039        ) {
7040         if (float128_is_signaling_nan(a, status)
7041          || float128_is_signaling_nan(b, status)) {
7042             float_raise(float_flag_invalid, status);
7043         }
7044         return 1;
7045     }
7046     return 0;
7047 }
7048 
7049 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7050                                             int is_quiet, float_status *status)
7051 {
7052     flag aSign, bSign;
7053 
7054     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7055         float_raise(float_flag_invalid, status);
7056         return float_relation_unordered;
7057     }
7058     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7059           ( extractFloatx80Frac( a )<<1 ) ) ||
7060         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7061           ( extractFloatx80Frac( b )<<1 ) )) {
7062         if (!is_quiet ||
7063             floatx80_is_signaling_nan(a, status) ||
7064             floatx80_is_signaling_nan(b, status)) {
7065             float_raise(float_flag_invalid, status);
7066         }
7067         return float_relation_unordered;
7068     }
7069     aSign = extractFloatx80Sign( a );
7070     bSign = extractFloatx80Sign( b );
7071     if ( aSign != bSign ) {
7072 
7073         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7074              ( ( a.low | b.low ) == 0 ) ) {
7075             /* zero case */
7076             return float_relation_equal;
7077         } else {
7078             return 1 - (2 * aSign);
7079         }
7080     } else {
7081         if (a.low == b.low && a.high == b.high) {
7082             return float_relation_equal;
7083         } else {
7084             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7085         }
7086     }
7087 }
7088 
7089 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7090 {
7091     return floatx80_compare_internal(a, b, 0, status);
7092 }
7093 
7094 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7095 {
7096     return floatx80_compare_internal(a, b, 1, status);
7097 }
7098 
7099 static inline int float128_compare_internal(float128 a, float128 b,
7100                                             int is_quiet, float_status *status)
7101 {
7102     flag aSign, bSign;
7103 
7104     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7105           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7106         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7107           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7108         if (!is_quiet ||
7109             float128_is_signaling_nan(a, status) ||
7110             float128_is_signaling_nan(b, status)) {
7111             float_raise(float_flag_invalid, status);
7112         }
7113         return float_relation_unordered;
7114     }
7115     aSign = extractFloat128Sign( a );
7116     bSign = extractFloat128Sign( b );
7117     if ( aSign != bSign ) {
7118         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7119             /* zero case */
7120             return float_relation_equal;
7121         } else {
7122             return 1 - (2 * aSign);
7123         }
7124     } else {
7125         if (a.low == b.low && a.high == b.high) {
7126             return float_relation_equal;
7127         } else {
7128             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7129         }
7130     }
7131 }
7132 
7133 int float128_compare(float128 a, float128 b, float_status *status)
7134 {
7135     return float128_compare_internal(a, b, 0, status);
7136 }
7137 
7138 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7139 {
7140     return float128_compare_internal(a, b, 1, status);
7141 }
7142 
7143 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7144 {
7145     flag aSign;
7146     int32_t aExp;
7147     uint64_t aSig;
7148 
7149     if (floatx80_invalid_encoding(a)) {
7150         float_raise(float_flag_invalid, status);
7151         return floatx80_default_nan(status);
7152     }
7153     aSig = extractFloatx80Frac( a );
7154     aExp = extractFloatx80Exp( a );
7155     aSign = extractFloatx80Sign( a );
7156 
7157     if ( aExp == 0x7FFF ) {
7158         if ( aSig<<1 ) {
7159             return propagateFloatx80NaN(a, a, status);
7160         }
7161         return a;
7162     }
7163 
7164     if (aExp == 0) {
7165         if (aSig == 0) {
7166             return a;
7167         }
7168         aExp++;
7169     }
7170 
7171     if (n > 0x10000) {
7172         n = 0x10000;
7173     } else if (n < -0x10000) {
7174         n = -0x10000;
7175     }
7176 
7177     aExp += n;
7178     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7179                                          aSign, aExp, aSig, 0, status);
7180 }
7181 
7182 float128 float128_scalbn(float128 a, int n, float_status *status)
7183 {
7184     flag aSign;
7185     int32_t aExp;
7186     uint64_t aSig0, aSig1;
7187 
7188     aSig1 = extractFloat128Frac1( a );
7189     aSig0 = extractFloat128Frac0( a );
7190     aExp = extractFloat128Exp( a );
7191     aSign = extractFloat128Sign( a );
7192     if ( aExp == 0x7FFF ) {
7193         if ( aSig0 | aSig1 ) {
7194             return propagateFloat128NaN(a, a, status);
7195         }
7196         return a;
7197     }
7198     if (aExp != 0) {
7199         aSig0 |= LIT64( 0x0001000000000000 );
7200     } else if (aSig0 == 0 && aSig1 == 0) {
7201         return a;
7202     } else {
7203         aExp++;
7204     }
7205 
7206     if (n > 0x10000) {
7207         n = 0x10000;
7208     } else if (n < -0x10000) {
7209         n = -0x10000;
7210     }
7211 
7212     aExp += n - 1;
7213     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7214                                          , status);
7215 
7216 }
7217