xref: /qemu/fpu/softfloat.c (revision c953da8f0be5e026d1c9128660736d72294feb3e)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
88 
89 /* We only need stdlib for abort() */
90 
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations.  (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "fpu/softfloat-macros.h"
97 
98 /*----------------------------------------------------------------------------
99 | Returns the fraction bits of the half-precision floating-point value `a'.
100 *----------------------------------------------------------------------------*/
101 
102 static inline uint32_t extractFloat16Frac(float16 a)
103 {
104     return float16_val(a) & 0x3ff;
105 }
106 
107 /*----------------------------------------------------------------------------
108 | Returns the exponent bits of the half-precision floating-point value `a'.
109 *----------------------------------------------------------------------------*/
110 
111 static inline int extractFloat16Exp(float16 a)
112 {
113     return (float16_val(a) >> 10) & 0x1f;
114 }
115 
116 /*----------------------------------------------------------------------------
117 | Returns the fraction bits of the single-precision floating-point value `a'.
118 *----------------------------------------------------------------------------*/
119 
120 static inline uint32_t extractFloat32Frac(float32 a)
121 {
122     return float32_val(a) & 0x007FFFFF;
123 }
124 
125 /*----------------------------------------------------------------------------
126 | Returns the exponent bits of the single-precision floating-point value `a'.
127 *----------------------------------------------------------------------------*/
128 
129 static inline int extractFloat32Exp(float32 a)
130 {
131     return (float32_val(a) >> 23) & 0xFF;
132 }
133 
134 /*----------------------------------------------------------------------------
135 | Returns the sign bit of the single-precision floating-point value `a'.
136 *----------------------------------------------------------------------------*/
137 
138 static inline flag extractFloat32Sign(float32 a)
139 {
140     return float32_val(a) >> 31;
141 }
142 
143 /*----------------------------------------------------------------------------
144 | Returns the fraction bits of the double-precision floating-point value `a'.
145 *----------------------------------------------------------------------------*/
146 
147 static inline uint64_t extractFloat64Frac(float64 a)
148 {
149     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
150 }
151 
152 /*----------------------------------------------------------------------------
153 | Returns the exponent bits of the double-precision floating-point value `a'.
154 *----------------------------------------------------------------------------*/
155 
156 static inline int extractFloat64Exp(float64 a)
157 {
158     return (float64_val(a) >> 52) & 0x7FF;
159 }
160 
161 /*----------------------------------------------------------------------------
162 | Returns the sign bit of the double-precision floating-point value `a'.
163 *----------------------------------------------------------------------------*/
164 
165 static inline flag extractFloat64Sign(float64 a)
166 {
167     return float64_val(a) >> 63;
168 }
169 
/*
 * Classification of a floating point number.  The NaN classes are
 * deliberately listed last: float_class_qnan and everything after it
 * is a NaN, so "cls >= float_class_qnan" matches any NaN.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified = 0,
    float_class_zero         = 1,
    float_class_normal       = 2,
    float_class_inf          = 3,
    float_class_qnan         = 4,  /* all NaNs from here */
    float_class_snan         = 5,
} FloatClass;
183 
184 /* Simple helpers for checking if, or what kind of, NaN we have */
185 static inline __attribute__((unused)) bool is_nan(FloatClass c)
186 {
187     return unlikely(c >= float_class_qnan);
188 }
189 
190 static inline __attribute__((unused)) bool is_snan(FloatClass c)
191 {
192     return c == float_class_snan;
193 }
194 
195 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
196 {
197     return c == float_class_qnan;
198 }
199 
200 /*
201  * Structure holding all of the decomposed parts of a float. The
202  * exponent is unbiased and the fraction is normalized. All
203  * calculations are done with a 64 bit fraction and then rounded as
204  * appropriate for the final format.
205  *
206  * Thanks to the packed FloatClass a decent compiler should be able to
207  * fit the whole structure into registers and avoid using the stack
208  * for parameter passing.
209  */
210 
211 typedef struct {
212     uint64_t frac;
213     int32_t  exp;
214     FloatClass cls;
215     bool sign;
216 } FloatParts;
217 
/*
 * Layout of a canonical 64-bit fraction: the implicit leading one sits
 * at bit DECOMPOSED_BINARY_POINT, and the bit directly above it is
 * tested to detect a carry out of the fraction.
 */
#define DECOMPOSED_BINARY_POINT    (64 - 2)
#define DECOMPOSED_IMPLICIT_BIT    (1ULL << DECOMPOSED_BINARY_POINT)
#define DECOMPOSED_OVERFLOW_BIT    (1ULL << (DECOMPOSED_BINARY_POINT + 1))
221 
/*
 * All of the relevant parameters for a format.
 *
 * The first group describes the layout; the second group is derived
 * from the size of the fraction (see FLOAT_PARAMS); the last group
 * holds optional modifiers.
 */
typedef struct {
    int exp_size;            /* size of the exponent field */
    int exp_bias;            /* offset applied to the exponent field */
    int exp_max;             /* maximum normalised exponent */
    int frac_size;           /* size of the fraction field */
    int frac_shift;          /* shift to normalise the fraction with
                                DECOMPOSED_BINARY_POINT */

    uint64_t frac_lsb;       /* least significant bit of the fraction */
    uint64_t frac_lsbm1;     /* bit below the least significant bit
                                (for rounding) */
    uint64_t round_mask;     /* mask used for rounding */
    uint64_t roundeven_mask; /* mask used for round-to-even */

    bool arm_althp;          /* handle ARM Alternative Half Precision */
} FloatFmt;
247 
/*
 * Expand the FloatFmt fields derived from the size of the exponent (E)
 * and of the fraction (F), as a list of designated initializers.
 *
 * Both arguments are parenthesized wherever they are used, so that an
 * expression argument (e.g. "5 + 5") is not re-associated by operator
 * precedence inside the shifts and subtractions.  Each argument is
 * expanded several times and must therefore have no side effects.
 */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size       = (E),                                           \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                         \
    .exp_max        = (1 << (E)) - 1,                                \
    .frac_size      = (F),                                           \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                 \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),       \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1), \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1, \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1
259 
260 static const FloatFmt float16_params = {
261     FLOAT_PARAMS(5, 10)
262 };
263 
264 static const FloatFmt float16_params_ahp = {
265     FLOAT_PARAMS(5, 10),
266     .arm_althp = true
267 };
268 
269 static const FloatFmt float32_params = {
270     FLOAT_PARAMS(8, 23)
271 };
272 
273 static const FloatFmt float64_params = {
274     FLOAT_PARAMS(11, 52)
275 };
276 
277 /* Unpack a float to parts, but do not canonicalize.  */
278 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
279 {
280     const int sign_pos = fmt.frac_size + fmt.exp_size;
281 
282     return (FloatParts) {
283         .cls = float_class_unclassified,
284         .sign = extract64(raw, sign_pos, 1),
285         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
286         .frac = extract64(raw, 0, fmt.frac_size),
287     };
288 }
289 
290 static inline FloatParts float16_unpack_raw(float16 f)
291 {
292     return unpack_raw(float16_params, f);
293 }
294 
295 static inline FloatParts float32_unpack_raw(float32 f)
296 {
297     return unpack_raw(float32_params, f);
298 }
299 
300 static inline FloatParts float64_unpack_raw(float64 f)
301 {
302     return unpack_raw(float64_params, f);
303 }
304 
305 /* Pack a float from parts, but do not canonicalize.  */
306 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
307 {
308     const int sign_pos = fmt.frac_size + fmt.exp_size;
309     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
310     return deposit64(ret, sign_pos, 1, p.sign);
311 }
312 
313 static inline float16 float16_pack_raw(FloatParts p)
314 {
315     return make_float16(pack_raw(float16_params, p));
316 }
317 
318 static inline float32 float32_pack_raw(FloatParts p)
319 {
320     return make_float32(pack_raw(float32_params, p));
321 }
322 
323 static inline float64 float64_pack_raw(FloatParts p)
324 {
325     return make_float64(pack_raw(float64_params, p));
326 }
327 
328 /*----------------------------------------------------------------------------
329 | Functions and definitions to determine:  (1) whether tininess for underflow
330 | is detected before or after rounding by default, (2) what (if anything)
331 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
332 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
333 | are propagated from function inputs to output.  These details are target-
334 | specific.
335 *----------------------------------------------------------------------------*/
336 #include "softfloat-specialize.h"
337 
338 /* Canonicalize EXP and FRAC, setting CLS.  */
339 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
340                                float_status *status)
341 {
342     if (part.exp == parm->exp_max && !parm->arm_althp) {
343         if (part.frac == 0) {
344             part.cls = float_class_inf;
345         } else {
346             part.frac <<= parm->frac_shift;
347             part.cls = (parts_is_snan_frac(part.frac, status)
348                         ? float_class_snan : float_class_qnan);
349         }
350     } else if (part.exp == 0) {
351         if (likely(part.frac == 0)) {
352             part.cls = float_class_zero;
353         } else if (status->flush_inputs_to_zero) {
354             float_raise(float_flag_input_denormal, status);
355             part.cls = float_class_zero;
356             part.frac = 0;
357         } else {
358             int shift = clz64(part.frac) - 1;
359             part.cls = float_class_normal;
360             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
361             part.frac <<= shift;
362         }
363     } else {
364         part.cls = float_class_normal;
365         part.exp -= parm->exp_bias;
366         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
367     }
368     return part;
369 }
370 
371 /* Round and uncanonicalize a floating-point number by parts. There
372  * are FRAC_SHIFT bits that may require rounding at the bottom of the
373  * fraction; these bits will be removed. The exponent will be biased
374  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
375  */
376 
377 static FloatParts round_canonical(FloatParts p, float_status *s,
378                                   const FloatFmt *parm)
379 {
380     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
381     const uint64_t round_mask = parm->round_mask;
382     const uint64_t roundeven_mask = parm->roundeven_mask;
383     const int exp_max = parm->exp_max;
384     const int frac_shift = parm->frac_shift;
385     uint64_t frac, inc;
386     int exp, flags = 0;
387     bool overflow_norm;
388 
389     frac = p.frac;
390     exp = p.exp;
391 
392     switch (p.cls) {
393     case float_class_normal:
394         switch (s->float_rounding_mode) {
395         case float_round_nearest_even:
396             overflow_norm = false;
397             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
398             break;
399         case float_round_ties_away:
400             overflow_norm = false;
401             inc = frac_lsbm1;
402             break;
403         case float_round_to_zero:
404             overflow_norm = true;
405             inc = 0;
406             break;
407         case float_round_up:
408             inc = p.sign ? 0 : round_mask;
409             overflow_norm = p.sign;
410             break;
411         case float_round_down:
412             inc = p.sign ? round_mask : 0;
413             overflow_norm = !p.sign;
414             break;
415         default:
416             g_assert_not_reached();
417         }
418 
419         exp += parm->exp_bias;
420         if (likely(exp > 0)) {
421             if (frac & round_mask) {
422                 flags |= float_flag_inexact;
423                 frac += inc;
424                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
425                     frac >>= 1;
426                     exp++;
427                 }
428             }
429             frac >>= frac_shift;
430 
431             if (parm->arm_althp) {
432                 /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
433                 if (unlikely(exp > exp_max)) {
434                     /* Overflow.  Return the maximum normal.  */
435                     flags = float_flag_invalid;
436                     exp = exp_max;
437                     frac = -1;
438                 }
439             } else if (unlikely(exp >= exp_max)) {
440                 flags |= float_flag_overflow | float_flag_inexact;
441                 if (overflow_norm) {
442                     exp = exp_max - 1;
443                     frac = -1;
444                 } else {
445                     p.cls = float_class_inf;
446                     goto do_inf;
447                 }
448             }
449         } else if (s->flush_to_zero) {
450             flags |= float_flag_output_denormal;
451             p.cls = float_class_zero;
452             goto do_zero;
453         } else {
454             bool is_tiny = (s->float_detect_tininess
455                             == float_tininess_before_rounding)
456                         || (exp < 0)
457                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
458 
459             shift64RightJamming(frac, 1 - exp, &frac);
460             if (frac & round_mask) {
461                 /* Need to recompute round-to-even.  */
462                 if (s->float_rounding_mode == float_round_nearest_even) {
463                     inc = ((frac & roundeven_mask) != frac_lsbm1
464                            ? frac_lsbm1 : 0);
465                 }
466                 flags |= float_flag_inexact;
467                 frac += inc;
468             }
469 
470             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
471             frac >>= frac_shift;
472 
473             if (is_tiny && (flags & float_flag_inexact)) {
474                 flags |= float_flag_underflow;
475             }
476             if (exp == 0 && frac == 0) {
477                 p.cls = float_class_zero;
478             }
479         }
480         break;
481 
482     case float_class_zero:
483     do_zero:
484         exp = 0;
485         frac = 0;
486         break;
487 
488     case float_class_inf:
489     do_inf:
490         assert(!parm->arm_althp);
491         exp = exp_max;
492         frac = 0;
493         break;
494 
495     case float_class_qnan:
496     case float_class_snan:
497         assert(!parm->arm_althp);
498         exp = exp_max;
499         frac >>= parm->frac_shift;
500         break;
501 
502     default:
503         g_assert_not_reached();
504     }
505 
506     float_raise(flags, s);
507     p.exp = exp;
508     p.frac = frac;
509     return p;
510 }
511 
512 /* Explicit FloatFmt version */
513 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
514                                             const FloatFmt *params)
515 {
516     return canonicalize(float16_unpack_raw(f), params, s);
517 }
518 
519 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
520 {
521     return float16a_unpack_canonical(f, s, &float16_params);
522 }
523 
524 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
525                                              const FloatFmt *params)
526 {
527     return float16_pack_raw(round_canonical(p, s, params));
528 }
529 
530 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
531 {
532     return float16a_round_pack_canonical(p, s, &float16_params);
533 }
534 
535 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
536 {
537     return canonicalize(float32_unpack_raw(f), &float32_params, s);
538 }
539 
540 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
541 {
542     return float32_pack_raw(round_canonical(p, s, &float32_params));
543 }
544 
545 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
546 {
547     return canonicalize(float64_unpack_raw(f), &float64_params, s);
548 }
549 
550 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
551 {
552     return float64_pack_raw(round_canonical(p, s, &float64_params));
553 }
554 
555 static FloatParts return_nan(FloatParts a, float_status *s)
556 {
557     switch (a.cls) {
558     case float_class_snan:
559         s->float_exception_flags |= float_flag_invalid;
560         a = parts_silence_nan(a, s);
561         /* fall through */
562     case float_class_qnan:
563         if (s->default_nan_mode) {
564             return parts_default_nan(s);
565         }
566         break;
567 
568     default:
569         g_assert_not_reached();
570     }
571     return a;
572 }
573 
574 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
575 {
576     if (is_snan(a.cls) || is_snan(b.cls)) {
577         s->float_exception_flags |= float_flag_invalid;
578     }
579 
580     if (s->default_nan_mode) {
581         return parts_default_nan(s);
582     } else {
583         if (pickNaN(a.cls, b.cls,
584                     a.frac > b.frac ||
585                     (a.frac == b.frac && a.sign < b.sign))) {
586             a = b;
587         }
588         if (is_snan(a.cls)) {
589             return parts_silence_nan(a, s);
590         }
591     }
592     return a;
593 }
594 
595 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
596                                   bool inf_zero, float_status *s)
597 {
598     int which;
599 
600     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
601         s->float_exception_flags |= float_flag_invalid;
602     }
603 
604     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
605 
606     if (s->default_nan_mode) {
607         /* Note that this check is after pickNaNMulAdd so that function
608          * has an opportunity to set the Invalid flag.
609          */
610         which = 3;
611     }
612 
613     switch (which) {
614     case 0:
615         break;
616     case 1:
617         a = b;
618         break;
619     case 2:
620         a = c;
621         break;
622     case 3:
623         return parts_default_nan(s);
624     default:
625         g_assert_not_reached();
626     }
627 
628     if (is_snan(a.cls)) {
629         return parts_silence_nan(a, s);
630     }
631     return a;
632 }
633 
634 /*
635  * Returns the result of adding or subtracting the values of the
636  * floating-point values `a' and `b'. The operation is performed
637  * according to the IEC/IEEE Standard for Binary Floating-Point
638  * Arithmetic.
639  */
640 
641 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
642                                 float_status *s)
643 {
644     bool a_sign = a.sign;
645     bool b_sign = b.sign ^ subtract;
646 
647     if (a_sign != b_sign) {
648         /* Subtraction */
649 
650         if (a.cls == float_class_normal && b.cls == float_class_normal) {
651             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
652                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
653                 a.frac = a.frac - b.frac;
654             } else {
655                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
656                 a.frac = b.frac - a.frac;
657                 a.exp = b.exp;
658                 a_sign ^= 1;
659             }
660 
661             if (a.frac == 0) {
662                 a.cls = float_class_zero;
663                 a.sign = s->float_rounding_mode == float_round_down;
664             } else {
665                 int shift = clz64(a.frac) - 1;
666                 a.frac = a.frac << shift;
667                 a.exp = a.exp - shift;
668                 a.sign = a_sign;
669             }
670             return a;
671         }
672         if (is_nan(a.cls) || is_nan(b.cls)) {
673             return pick_nan(a, b, s);
674         }
675         if (a.cls == float_class_inf) {
676             if (b.cls == float_class_inf) {
677                 float_raise(float_flag_invalid, s);
678                 return parts_default_nan(s);
679             }
680             return a;
681         }
682         if (a.cls == float_class_zero && b.cls == float_class_zero) {
683             a.sign = s->float_rounding_mode == float_round_down;
684             return a;
685         }
686         if (a.cls == float_class_zero || b.cls == float_class_inf) {
687             b.sign = a_sign ^ 1;
688             return b;
689         }
690         if (b.cls == float_class_zero) {
691             return a;
692         }
693     } else {
694         /* Addition */
695         if (a.cls == float_class_normal && b.cls == float_class_normal) {
696             if (a.exp > b.exp) {
697                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
698             } else if (a.exp < b.exp) {
699                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
700                 a.exp = b.exp;
701             }
702             a.frac += b.frac;
703             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
704                 shift64RightJamming(a.frac, 1, &a.frac);
705                 a.exp += 1;
706             }
707             return a;
708         }
709         if (is_nan(a.cls) || is_nan(b.cls)) {
710             return pick_nan(a, b, s);
711         }
712         if (a.cls == float_class_inf || b.cls == float_class_zero) {
713             return a;
714         }
715         if (b.cls == float_class_inf || a.cls == float_class_zero) {
716             b.sign = b_sign;
717             return b;
718         }
719     }
720     g_assert_not_reached();
721 }
722 
723 /*
724  * Returns the result of adding or subtracting the floating-point
725  * values `a' and `b'. The operation is performed according to the
726  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
727  */
728 
729 float16  __attribute__((flatten)) float16_add(float16 a, float16 b,
730                                               float_status *status)
731 {
732     FloatParts pa = float16_unpack_canonical(a, status);
733     FloatParts pb = float16_unpack_canonical(b, status);
734     FloatParts pr = addsub_floats(pa, pb, false, status);
735 
736     return float16_round_pack_canonical(pr, status);
737 }
738 
739 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
740                                              float_status *status)
741 {
742     FloatParts pa = float32_unpack_canonical(a, status);
743     FloatParts pb = float32_unpack_canonical(b, status);
744     FloatParts pr = addsub_floats(pa, pb, false, status);
745 
746     return float32_round_pack_canonical(pr, status);
747 }
748 
749 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
750                                              float_status *status)
751 {
752     FloatParts pa = float64_unpack_canonical(a, status);
753     FloatParts pb = float64_unpack_canonical(b, status);
754     FloatParts pr = addsub_floats(pa, pb, false, status);
755 
756     return float64_round_pack_canonical(pr, status);
757 }
758 
759 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
760                                              float_status *status)
761 {
762     FloatParts pa = float16_unpack_canonical(a, status);
763     FloatParts pb = float16_unpack_canonical(b, status);
764     FloatParts pr = addsub_floats(pa, pb, true, status);
765 
766     return float16_round_pack_canonical(pr, status);
767 }
768 
769 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
770                                              float_status *status)
771 {
772     FloatParts pa = float32_unpack_canonical(a, status);
773     FloatParts pb = float32_unpack_canonical(b, status);
774     FloatParts pr = addsub_floats(pa, pb, true, status);
775 
776     return float32_round_pack_canonical(pr, status);
777 }
778 
779 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
780                                              float_status *status)
781 {
782     FloatParts pa = float64_unpack_canonical(a, status);
783     FloatParts pb = float64_unpack_canonical(b, status);
784     FloatParts pr = addsub_floats(pa, pb, true, status);
785 
786     return float64_round_pack_canonical(pr, status);
787 }
788 
789 /*
790  * Returns the result of multiplying the floating-point values `a' and
791  * `b'. The operation is performed according to the IEC/IEEE Standard
792  * for Binary Floating-Point Arithmetic.
793  */
794 
795 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
796 {
797     bool sign = a.sign ^ b.sign;
798 
799     if (a.cls == float_class_normal && b.cls == float_class_normal) {
800         uint64_t hi, lo;
801         int exp = a.exp + b.exp;
802 
803         mul64To128(a.frac, b.frac, &hi, &lo);
804         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
805         if (lo & DECOMPOSED_OVERFLOW_BIT) {
806             shift64RightJamming(lo, 1, &lo);
807             exp += 1;
808         }
809 
810         /* Re-use a */
811         a.exp = exp;
812         a.sign = sign;
813         a.frac = lo;
814         return a;
815     }
816     /* handle all the NaN cases */
817     if (is_nan(a.cls) || is_nan(b.cls)) {
818         return pick_nan(a, b, s);
819     }
820     /* Inf * Zero == NaN */
821     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
822         (a.cls == float_class_zero && b.cls == float_class_inf)) {
823         s->float_exception_flags |= float_flag_invalid;
824         return parts_default_nan(s);
825     }
826     /* Multiply by 0 or Inf */
827     if (a.cls == float_class_inf || a.cls == float_class_zero) {
828         a.sign = sign;
829         return a;
830     }
831     if (b.cls == float_class_inf || b.cls == float_class_zero) {
832         b.sign = sign;
833         return b;
834     }
835     g_assert_not_reached();
836 }
837 
838 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
839                                              float_status *status)
840 {
841     FloatParts pa = float16_unpack_canonical(a, status);
842     FloatParts pb = float16_unpack_canonical(b, status);
843     FloatParts pr = mul_floats(pa, pb, status);
844 
845     return float16_round_pack_canonical(pr, status);
846 }
847 
848 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
849                                              float_status *status)
850 {
851     FloatParts pa = float32_unpack_canonical(a, status);
852     FloatParts pb = float32_unpack_canonical(b, status);
853     FloatParts pr = mul_floats(pa, pb, status);
854 
855     return float32_round_pack_canonical(pr, status);
856 }
857 
858 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
859                                              float_status *status)
860 {
861     FloatParts pa = float64_unpack_canonical(a, status);
862     FloatParts pb = float64_unpack_canonical(b, status);
863     FloatParts pr = mul_floats(pa, pb, status);
864 
865     return float64_round_pack_canonical(pr, status);
866 }
867 
868 /*
869  * Returns the result of multiplying the floating-point values `a' and
870  * `b' then adding 'c', with no intermediate rounding step after the
871  * multiplication. The operation is performed according to the
872  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
873  * The flags argument allows the caller to select negation of the
874  * addend, the intermediate product, or the final result. (The
875  * difference between this and having the caller do a separate
876  * negation is that negating externally will flip the sign bit on
877  * NaNs.)
878  */
879 
/*
 * Core of the fused multiply-add, working on decomposed operands.
 *
 * a, b:  the factors of the product
 * c:     the addend
 * flags: float_muladd_negate_c, float_muladd_negate_product,
 *        float_muladd_negate_result and float_muladd_halve_result are
 *        the bits examined here
 * s:     status block; float_flag_invalid may be raised, and the rounding
 *        mode is consulted to choose the sign of an exact zero result
 *
 * The returned FloatParts has not been rounded; the float*_muladd
 * wrappers hand it to float*_round_pack_canonical().
 */
static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
                                int flags, float_status *s)
{
    /* True exactly when one factor is an infinity and the other a zero:
     * the two class bits together must equal the {inf, zero} mask.
     */
    bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
                    ((1 << float_class_inf) | (1 << float_class_zero));
    bool p_sign;
    /* XORed into the sign of every non-NaN result returned below */
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
        return pick_nan_muladd(a, b, c, inf_zero, s);
    }

    /* No NaN operand: inf * zero is invalid and yields the default NaN */
    if (inf_zero) {
        s->float_exception_flags |= float_flag_invalid;
        return parts_default_nan(s);
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Classify the product a * b without computing it yet */
    if (a.cls == float_class_inf || b.cls == float_class_inf) {
        p_class = float_class_inf;
    } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    /* Infinite addend: invalid only if the product is an infinity of
     * the opposite sign, otherwise the result is c's infinity.
     */
    if (c.cls == float_class_inf) {
        if (p_class == float_class_inf && p_sign != c.sign) {
            s->float_exception_flags |= float_flag_invalid;
            return parts_default_nan(s);
        } else {
            a.cls = float_class_inf;
            a.sign = c.sign ^ sign_flip;
            return a;
        }
    }

    /* Infinite product with a finite addend */
    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    /* Zero product: the result is c, except that for a zero c with a
     * sign differing from the product's the sign is negative only in
     * round-down mode.  A non-zero c still honours halve_result here
     * because this path returns before the common halving below.
     */
    if (p_class == float_class_zero) {
        if (c.cls == float_class_zero) {
            if (p_sign != c.sign) {
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
     * result.
     */
    mul64To128(a.frac, b.frac, &hi, &lo);
    /* binary point now at bit 124 */

    /* check for overflow */
    if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
        shift128RightJamming(hi, lo, 1, &hi, &lo);
        p_exp += 1;
    }

    /* + add/sub */
    if (c.cls == float_class_zero) {
        /* move binary point back to 62 */
        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
    } else {
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                /* The product's exponent does not exceed c's: one shift
                 * both aligns the product with c and drops it into the
                 * low word, so c.frac can be added directly.
                 */
                shift128RightJamming(hi, lo,
                                     DECOMPOSED_BINARY_POINT - exp_diff,
                                     &hi, &lo);
                lo += c.frac;
                p_exp = c.exp;
            } else {
                uint64_t c_hi, c_lo;
                /* shift c to the same binary point as the product (124) */
                c_hi = c.frac >> 2;
                c_lo = 0;
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                add128(hi, lo, c_hi, c_lo, &hi, &lo);
                /* move binary point back to 62 */
                shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
            }

            /* A carry out of the addition needs one renormalising shift */
            if (lo & DECOMPOSED_OVERFLOW_BIT) {
                shift64RightJamming(lo, 1, &lo);
                p_exp += 1;
            }

        } else {
            /* Subtraction */
            uint64_t c_hi, c_lo;
            /* make C binary point match product at bit 124 */
            c_hi = c.frac >> 2;
            c_lo = 0;

            if (exp_diff <= 0) {
                /* Align the product to c's exponent.  Only when the
                 * exponents are equal and the product is at least as
                 * large as c is c subtracted from the product; in every
                 * other case here the product is subtracted from c, so
                 * the result takes the opposite sign and c's exponent.
                 */
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                /* Product has the larger exponent: align c down to it */
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            if (hi == 0 && lo == 0) {
                /* Exact cancellation: the zero is negative only in
                 * round-down mode (before any result negation).
                 */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                int shift;
                /* Count the leading zeros of the 128-bit difference */
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Normalizing to a binary point of 124 is the
                   correct adjust for the exponent.  However since we're
                   shifting, we might as well put the binary point back
                   at 62 where we really want it.  Therefore shift as
                   if we're leaving 1 bit at the top of the word, but
                   adjust the exponent as if we're leaving 3 bits.  */
                shift -= 1;
                if (shift >= 64) {
                    lo = lo << (shift - 64);
                } else {
                    hi = (hi << shift) | (lo >> (64 - shift));
                    /* The upper word becomes the fraction; bit 0 is set
                     * if any of the remaining low bits are non-zero.
                     */
                    lo = hi | ((lo << shift) != 0);
                }
                p_exp -= shift - 2;
            }
        }
    }

    /* Halving the result is a decrement of the exponent */
    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = lo;

    return a;
}
1070 
1071 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1072                                                 int flags, float_status *status)
1073 {
1074     FloatParts pa = float16_unpack_canonical(a, status);
1075     FloatParts pb = float16_unpack_canonical(b, status);
1076     FloatParts pc = float16_unpack_canonical(c, status);
1077     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1078 
1079     return float16_round_pack_canonical(pr, status);
1080 }
1081 
1082 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1083                                                 int flags, float_status *status)
1084 {
1085     FloatParts pa = float32_unpack_canonical(a, status);
1086     FloatParts pb = float32_unpack_canonical(b, status);
1087     FloatParts pc = float32_unpack_canonical(c, status);
1088     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1089 
1090     return float32_round_pack_canonical(pr, status);
1091 }
1092 
1093 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1094                                                 int flags, float_status *status)
1095 {
1096     FloatParts pa = float64_unpack_canonical(a, status);
1097     FloatParts pb = float64_unpack_canonical(b, status);
1098     FloatParts pc = float64_unpack_canonical(c, status);
1099     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1100 
1101     return float64_round_pack_canonical(pr, status);
1102 }
1103 
1104 /*
1105  * Returns the result of dividing the floating-point value `a' by the
1106  * corresponding value `b'. The operation is performed according to
1107  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1108  */
1109 
1110 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1111 {
1112     bool sign = a.sign ^ b.sign;
1113 
1114     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1115         uint64_t temp_lo, temp_hi;
1116         int exp = a.exp - b.exp;
1117         if (a.frac < b.frac) {
1118             exp -= 1;
1119             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1120                               &temp_hi, &temp_lo);
1121         } else {
1122             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1123                               &temp_hi, &temp_lo);
1124         }
1125         /* LSB of quot is set if inexact which roundandpack will use
1126          * to set flags. Yet again we re-use a for the result */
1127         a.frac = div128To64(temp_lo, temp_hi, b.frac);
1128         a.sign = sign;
1129         a.exp = exp;
1130         return a;
1131     }
1132     /* handle all the NaN cases */
1133     if (is_nan(a.cls) || is_nan(b.cls)) {
1134         return pick_nan(a, b, s);
1135     }
1136     /* 0/0 or Inf/Inf */
1137     if (a.cls == b.cls
1138         &&
1139         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1140         s->float_exception_flags |= float_flag_invalid;
1141         return parts_default_nan(s);
1142     }
1143     /* Inf / x or 0 / x */
1144     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1145         a.sign = sign;
1146         return a;
1147     }
1148     /* Div 0 => Inf */
1149     if (b.cls == float_class_zero) {
1150         s->float_exception_flags |= float_flag_divbyzero;
1151         a.cls = float_class_inf;
1152         a.sign = sign;
1153         return a;
1154     }
1155     /* Div by Inf */
1156     if (b.cls == float_class_inf) {
1157         a.cls = float_class_zero;
1158         a.sign = sign;
1159         return a;
1160     }
1161     g_assert_not_reached();
1162 }
1163 
1164 float16 float16_div(float16 a, float16 b, float_status *status)
1165 {
1166     FloatParts pa = float16_unpack_canonical(a, status);
1167     FloatParts pb = float16_unpack_canonical(b, status);
1168     FloatParts pr = div_floats(pa, pb, status);
1169 
1170     return float16_round_pack_canonical(pr, status);
1171 }
1172 
1173 float32 float32_div(float32 a, float32 b, float_status *status)
1174 {
1175     FloatParts pa = float32_unpack_canonical(a, status);
1176     FloatParts pb = float32_unpack_canonical(b, status);
1177     FloatParts pr = div_floats(pa, pb, status);
1178 
1179     return float32_round_pack_canonical(pr, status);
1180 }
1181 
1182 float64 float64_div(float64 a, float64 b, float_status *status)
1183 {
1184     FloatParts pa = float64_unpack_canonical(a, status);
1185     FloatParts pb = float64_unpack_canonical(b, status);
1186     FloatParts pr = div_floats(pa, pb, status);
1187 
1188     return float64_round_pack_canonical(pr, status);
1189 }
1190 
1191 /*
1192  * Float to Float conversions
1193  *
1194  * Returns the result of converting one float format to another. The
1195  * conversion is performed according to the IEC/IEEE Standard for
1196  * Binary Floating-Point Arithmetic.
1197  *
1198  * The float_to_float helper only needs to take care of raising
1199  * invalid exceptions and handling the conversion on NaNs.
1200  */
1201 
1202 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1203                                  float_status *s)
1204 {
1205     if (dstf->arm_althp) {
1206         switch (a.cls) {
1207         case float_class_qnan:
1208         case float_class_snan:
1209             /* There is no NaN in the destination format.  Raise Invalid
1210              * and return a zero with the sign of the input NaN.
1211              */
1212             s->float_exception_flags |= float_flag_invalid;
1213             a.cls = float_class_zero;
1214             a.frac = 0;
1215             a.exp = 0;
1216             break;
1217 
1218         case float_class_inf:
1219             /* There is no Inf in the destination format.  Raise Invalid
1220              * and return the maximum normal with the correct sign.
1221              */
1222             s->float_exception_flags |= float_flag_invalid;
1223             a.cls = float_class_normal;
1224             a.exp = dstf->exp_max;
1225             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1226             break;
1227 
1228         default:
1229             break;
1230         }
1231     } else if (is_nan(a.cls)) {
1232         if (is_snan(a.cls)) {
1233             s->float_exception_flags |= float_flag_invalid;
1234             a = parts_silence_nan(a, s);
1235         }
1236         if (s->default_nan_mode) {
1237             return parts_default_nan(s);
1238         }
1239     }
1240     return a;
1241 }
1242 
1243 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1244 {
1245     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1246     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1247     FloatParts pr = float_to_float(p, &float32_params, s);
1248     return float32_round_pack_canonical(pr, s);
1249 }
1250 
1251 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1252 {
1253     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1254     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1255     FloatParts pr = float_to_float(p, &float64_params, s);
1256     return float64_round_pack_canonical(pr, s);
1257 }
1258 
1259 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1260 {
1261     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1262     FloatParts p = float32_unpack_canonical(a, s);
1263     FloatParts pr = float_to_float(p, fmt16, s);
1264     return float16a_round_pack_canonical(pr, s, fmt16);
1265 }
1266 
1267 float64 float32_to_float64(float32 a, float_status *s)
1268 {
1269     FloatParts p = float32_unpack_canonical(a, s);
1270     FloatParts pr = float_to_float(p, &float64_params, s);
1271     return float64_round_pack_canonical(pr, s);
1272 }
1273 
1274 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1275 {
1276     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1277     FloatParts p = float64_unpack_canonical(a, s);
1278     FloatParts pr = float_to_float(p, fmt16, s);
1279     return float16a_round_pack_canonical(pr, s, fmt16);
1280 }
1281 
1282 float32 float64_to_float32(float64 a, float_status *s)
1283 {
1284     FloatParts p = float64_unpack_canonical(a, s);
1285     FloatParts pr = float_to_float(p, &float32_params, s);
1286     return float32_round_pack_canonical(pr, s);
1287 }
1288 
1289 /*
1290  * Rounds the floating-point value `a' to an integer, and returns the
1291  * result as a floating-point value. The operation is performed
1292  * according to the IEC/IEEE Standard for Binary Floating-Point
1293  * Arithmetic.
1294  */
1295 
/*
 * Round the decomposed value `a' to an integral value, leaving it in
 * decomposed form.
 *
 * a:     value to round
 * rmode: one of the float_round_* modes handled in the switches below;
 *        any other value hits g_assert_not_reached()
 * scale: added to the exponent of a normal input (after being clamped to
 *        [-0x10000, 0x10000]) before rounding
 * s:     status block; float_flag_inexact is raised when non-zero
 *        fractional bits are discarded
 *
 * NaN inputs are handed to return_nan(); zeros and infinities are
 * returned unchanged.
 */
static FloatParts round_to_int(FloatParts a, int rmode,
                               int scale, float_status *s)
{
    switch (a.cls) {
    case float_class_qnan:
    case float_class_snan:
        return return_nan(a, s);

    case float_class_zero:
    case float_class_inf:
        /* already "integral" */
        break;

    case float_class_normal:
        /* Clamp the scale before applying it to the exponent */
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        a.exp += scale;

        if (a.exp >= DECOMPOSED_BINARY_POINT) {
            /* already integral */
            break;
        }
        if (a.exp < 0) {
            /* `one' records whether the value rounds to one in
             * magnitude rather than to zero.
             */
            bool one;
            /* all fractional */
            s->float_exception_flags |= float_flag_inexact;
            switch (rmode) {
            case float_round_nearest_even:
                /* Only exponent -1 can reach one; a fraction equal to
                 * DECOMPOSED_IMPLICIT_BIT does not round up here.
                 */
                one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_ties_away:
                /* As above, but a fraction equal to
                 * DECOMPOSED_IMPLICIT_BIT also rounds up.
                 */
                one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_to_zero:
                one = false;
                break;
            case float_round_up:
                one = !a.sign;
                break;
            case float_round_down:
                one = a.sign;
                break;
            default:
                g_assert_not_reached();
            }

            if (one) {
                /* Fraction of just the implicit bit with exponent 0;
                 * the sign is left as it was.
                 */
                a.frac = DECOMPOSED_IMPLICIT_BIT;
                a.exp = 0;
            } else {
                a.cls = float_class_zero;
            }
        } else {
            /* frac_lsb:      DECOMPOSED_IMPLICIT_BIT moved down by a.exp
             * frac_lsbm1:    the bit directly below frac_lsb
             * rnd_even_mask: frac_lsb and every bit below it
             * rnd_mask:      every bit below frac_lsb (the bits that are
             *                cleared after rounding)
             */
            uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
            uint64_t frac_lsbm1 = frac_lsb >> 1;
            uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
            uint64_t rnd_mask = rnd_even_mask >> 1;
            uint64_t inc;

            switch (rmode) {
            case float_round_nearest_even:
                /* Add frac_lsbm1 unless frac_lsb is clear and the bits
                 * below it are exactly frac_lsbm1, in which case nothing
                 * is added.
                 */
                inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
                break;
            case float_round_ties_away:
                inc = frac_lsbm1;
                break;
            case float_round_to_zero:
                inc = 0;
                break;
            case float_round_up:
                inc = a.sign ? 0 : rnd_mask;
                break;
            case float_round_down:
                inc = a.sign ? rnd_mask : 0;
                break;
            default:
                g_assert_not_reached();
            }

            /* Nothing changes unless some bit below frac_lsb is set */
            if (a.frac & rnd_mask) {
                s->float_exception_flags |= float_flag_inexact;
                a.frac += inc;
                a.frac &= ~rnd_mask;
                /* The increment may carry into the overflow bit;
                 * shift right once and bump the exponent.
                 */
                if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
                    a.frac >>= 1;
                    a.exp++;
                }
            }
        }
        break;
    default:
        g_assert_not_reached();
    }
    return a;
}
1390 
1391 float16 float16_round_to_int(float16 a, float_status *s)
1392 {
1393     FloatParts pa = float16_unpack_canonical(a, s);
1394     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1395     return float16_round_pack_canonical(pr, s);
1396 }
1397 
1398 float32 float32_round_to_int(float32 a, float_status *s)
1399 {
1400     FloatParts pa = float32_unpack_canonical(a, s);
1401     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1402     return float32_round_pack_canonical(pr, s);
1403 }
1404 
1405 float64 float64_round_to_int(float64 a, float_status *s)
1406 {
1407     FloatParts pa = float64_unpack_canonical(a, s);
1408     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1409     return float64_round_pack_canonical(pr, s);
1410 }
1411 
1412 /*
1413  * Returns the result of converting the floating-point value `a' to
1414  * the two's complement integer format. The conversion is performed
1415  * according to the IEC/IEEE Standard for Binary Floating-Point
1416  * Arithmetic---which means in particular that the conversion is
1417  * rounded according to the current rounding mode. If `a' is a NaN,
1418  * the largest positive integer is returned. Otherwise, if the
1419  * conversion overflows, the largest integer with the same sign as `a'
1420  * is returned.
1421 */
1422 
1423 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
1424                                      int64_t min, int64_t max,
1425                                      float_status *s)
1426 {
1427     uint64_t r;
1428     int orig_flags = get_float_exception_flags(s);
1429     FloatParts p = round_to_int(in, rmode, scale, s);
1430 
1431     switch (p.cls) {
1432     case float_class_snan:
1433     case float_class_qnan:
1434         s->float_exception_flags = orig_flags | float_flag_invalid;
1435         return max;
1436     case float_class_inf:
1437         s->float_exception_flags = orig_flags | float_flag_invalid;
1438         return p.sign ? min : max;
1439     case float_class_zero:
1440         return 0;
1441     case float_class_normal:
1442         if (p.exp < DECOMPOSED_BINARY_POINT) {
1443             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1444         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1445             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1446         } else {
1447             r = UINT64_MAX;
1448         }
1449         if (p.sign) {
1450             if (r <= -(uint64_t) min) {
1451                 return -r;
1452             } else {
1453                 s->float_exception_flags = orig_flags | float_flag_invalid;
1454                 return min;
1455             }
1456         } else {
1457             if (r <= max) {
1458                 return r;
1459             } else {
1460                 s->float_exception_flags = orig_flags | float_flag_invalid;
1461                 return max;
1462             }
1463         }
1464     default:
1465         g_assert_not_reached();
1466     }
1467 }
1468 
1469 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
1470                                 float_status *s)
1471 {
1472     return round_to_int_and_pack(float16_unpack_canonical(a, s),
1473                                  rmode, scale, INT16_MIN, INT16_MAX, s);
1474 }
1475 
1476 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
1477                                 float_status *s)
1478 {
1479     return round_to_int_and_pack(float16_unpack_canonical(a, s),
1480                                  rmode, scale, INT32_MIN, INT32_MAX, s);
1481 }
1482 
1483 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
1484                                 float_status *s)
1485 {
1486     return round_to_int_and_pack(float16_unpack_canonical(a, s),
1487                                  rmode, scale, INT64_MIN, INT64_MAX, s);
1488 }
1489 
1490 int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale,
1491                                 float_status *s)
1492 {
1493     return round_to_int_and_pack(float32_unpack_canonical(a, s),
1494                                  rmode, scale, INT16_MIN, INT16_MAX, s);
1495 }
1496 
1497 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale,
1498                                 float_status *s)
1499 {
1500     return round_to_int_and_pack(float32_unpack_canonical(a, s),
1501                                  rmode, scale, INT32_MIN, INT32_MAX, s);
1502 }
1503 
1504 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale,
1505                                 float_status *s)
1506 {
1507     return round_to_int_and_pack(float32_unpack_canonical(a, s),
1508                                  rmode, scale, INT64_MIN, INT64_MAX, s);
1509 }
1510 
1511 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale,
1512                                 float_status *s)
1513 {
1514     return round_to_int_and_pack(float64_unpack_canonical(a, s),
1515                                  rmode, scale, INT16_MIN, INT16_MAX, s);
1516 }
1517 
1518 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale,
1519                                 float_status *s)
1520 {
1521     return round_to_int_and_pack(float64_unpack_canonical(a, s),
1522                                  rmode, scale, INT32_MIN, INT32_MAX, s);
1523 }
1524 
1525 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale,
1526                                 float_status *s)
1527 {
1528     return round_to_int_and_pack(float64_unpack_canonical(a, s),
1529                                  rmode, scale, INT64_MIN, INT64_MAX, s);
1530 }
1531 
1532 int16_t float16_to_int16(float16 a, float_status *s)
1533 {
1534     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
1535 }
1536 
1537 int32_t float16_to_int32(float16 a, float_status *s)
1538 {
1539     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
1540 }
1541 
1542 int64_t float16_to_int64(float16 a, float_status *s)
1543 {
1544     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
1545 }
1546 
1547 int16_t float32_to_int16(float32 a, float_status *s)
1548 {
1549     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
1550 }
1551 
1552 int32_t float32_to_int32(float32 a, float_status *s)
1553 {
1554     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
1555 }
1556 
1557 int64_t float32_to_int64(float32 a, float_status *s)
1558 {
1559     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
1560 }
1561 
1562 int16_t float64_to_int16(float64 a, float_status *s)
1563 {
1564     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
1565 }
1566 
1567 int32_t float64_to_int32(float64 a, float_status *s)
1568 {
1569     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
1570 }
1571 
1572 int64_t float64_to_int64(float64 a, float_status *s)
1573 {
1574     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
1575 }
1576 
1577 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
1578 {
1579     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
1580 }
1581 
1582 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
1583 {
1584     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
1585 }
1586 
1587 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
1588 {
1589     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
1590 }
1591 
1592 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
1593 {
1594     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
1595 }
1596 
1597 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
1598 {
1599     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
1600 }
1601 
1602 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
1603 {
1604     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
1605 }
1606 
1607 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
1608 {
1609     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
1610 }
1611 
1612 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
1613 {
1614     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
1615 }
1616 
1617 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
1618 {
1619     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
1620 }
1621 
1622 /*
1623  *  Returns the result of converting the floating-point value `a' to
1624  *  the unsigned integer format. The conversion is performed according
1625  *  to the IEC/IEEE Standard for Binary Floating-Point
1626  *  Arithmetic---which means in particular that the conversion is
1627  *  rounded according to the current rounding mode. If `a' is a NaN,
1628  *  the largest unsigned integer is returned. Otherwise, if the
 *  conversion overflows, the largest unsigned integer is returned. If
 *  `a' is negative, the result is rounded and zero is returned;
 *  negative values that do not round to zero raise the invalid
 *  exception flag.
1633  */
1634 
1635 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale,
1636                                        uint64_t max, float_status *s)
1637 {
1638     int orig_flags = get_float_exception_flags(s);
1639     FloatParts p = round_to_int(in, rmode, scale, s);
1640     uint64_t r;
1641 
1642     switch (p.cls) {
1643     case float_class_snan:
1644     case float_class_qnan:
1645         s->float_exception_flags = orig_flags | float_flag_invalid;
1646         return max;
1647     case float_class_inf:
1648         s->float_exception_flags = orig_flags | float_flag_invalid;
1649         return p.sign ? 0 : max;
1650     case float_class_zero:
1651         return 0;
1652     case float_class_normal:
1653         if (p.sign) {
1654             s->float_exception_flags = orig_flags | float_flag_invalid;
1655             return 0;
1656         }
1657 
1658         if (p.exp < DECOMPOSED_BINARY_POINT) {
1659             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1660         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1661             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1662         } else {
1663             s->float_exception_flags = orig_flags | float_flag_invalid;
1664             return max;
1665         }
1666 
1667         /* For uint64 this will never trip, but if p.exp is too large
1668          * to shift a decomposed fraction we shall have exited via the
1669          * 3rd leg above.
1670          */
1671         if (r > max) {
1672             s->float_exception_flags = orig_flags | float_flag_invalid;
1673             return max;
1674         }
1675         return r;
1676     default:
1677         g_assert_not_reached();
1678     }
1679 }
1680 
1681 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale,
1682                                   float_status *s)
1683 {
1684     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
1685                                   rmode, scale, UINT16_MAX, s);
1686 }
1687 
1688 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale,
1689                                   float_status *s)
1690 {
1691     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
1692                                   rmode, scale, UINT32_MAX, s);
1693 }
1694 
1695 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale,
1696                                   float_status *s)
1697 {
1698     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
1699                                   rmode, scale, UINT64_MAX, s);
1700 }
1701 
1702 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale,
1703                                   float_status *s)
1704 {
1705     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
1706                                   rmode, scale, UINT16_MAX, s);
1707 }
1708 
1709 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale,
1710                                   float_status *s)
1711 {
1712     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
1713                                   rmode, scale, UINT32_MAX, s);
1714 }
1715 
1716 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale,
1717                                   float_status *s)
1718 {
1719     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
1720                                   rmode, scale, UINT64_MAX, s);
1721 }
1722 
1723 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale,
1724                                   float_status *s)
1725 {
1726     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
1727                                   rmode, scale, UINT16_MAX, s);
1728 }
1729 
1730 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale,
1731                                   float_status *s)
1732 {
1733     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
1734                                   rmode, scale, UINT32_MAX, s);
1735 }
1736 
1737 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale,
1738                                   float_status *s)
1739 {
1740     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
1741                                   rmode, scale, UINT64_MAX, s);
1742 }
1743 
1744 uint16_t float16_to_uint16(float16 a, float_status *s)
1745 {
1746     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
1747 }
1748 
1749 uint32_t float16_to_uint32(float16 a, float_status *s)
1750 {
1751     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
1752 }
1753 
1754 uint64_t float16_to_uint64(float16 a, float_status *s)
1755 {
1756     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
1757 }
1758 
1759 uint16_t float32_to_uint16(float32 a, float_status *s)
1760 {
1761     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
1762 }
1763 
1764 uint32_t float32_to_uint32(float32 a, float_status *s)
1765 {
1766     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
1767 }
1768 
1769 uint64_t float32_to_uint64(float32 a, float_status *s)
1770 {
1771     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
1772 }
1773 
1774 uint16_t float64_to_uint16(float64 a, float_status *s)
1775 {
1776     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
1777 }
1778 
1779 uint32_t float64_to_uint32(float64 a, float_status *s)
1780 {
1781     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
1782 }
1783 
1784 uint64_t float64_to_uint64(float64 a, float_status *s)
1785 {
1786     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
1787 }
1788 
1789 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
1790 {
1791     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
1792 }
1793 
1794 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
1795 {
1796     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
1797 }
1798 
1799 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
1800 {
1801     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
1802 }
1803 
1804 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
1805 {
1806     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
1807 }
1808 
1809 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
1810 {
1811     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
1812 }
1813 
1814 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
1815 {
1816     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
1817 }
1818 
1819 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
1820 {
1821     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
1822 }
1823 
1824 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
1825 {
1826     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
1827 }
1828 
1829 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
1830 {
1831     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
1832 }
1833 
1834 /*
1835  * Integer to float conversions
1836  *
1837  * Returns the result of converting the two's complement integer `a'
1838  * to the floating-point format. The conversion is performed according
1839  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1840  */
1841 
1842 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
1843 {
1844     FloatParts r = { .sign = false };
1845 
1846     if (a == 0) {
1847         r.cls = float_class_zero;
1848     } else {
1849         uint64_t f = a;
1850         int shift;
1851 
1852         r.cls = float_class_normal;
1853         if (a < 0) {
1854             f = -f;
1855             r.sign = true;
1856         }
1857         shift = clz64(f) - 1;
1858         scale = MIN(MAX(scale, -0x10000), 0x10000);
1859 
1860         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
1861         r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
1862     }
1863 
1864     return r;
1865 }
1866 
1867 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
1868 {
1869     FloatParts pa = int_to_float(a, scale, status);
1870     return float16_round_pack_canonical(pa, status);
1871 }
1872 
1873 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
1874 {
1875     return int64_to_float16_scalbn(a, scale, status);
1876 }
1877 
1878 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
1879 {
1880     return int64_to_float16_scalbn(a, scale, status);
1881 }
1882 
1883 float16 int64_to_float16(int64_t a, float_status *status)
1884 {
1885     return int64_to_float16_scalbn(a, 0, status);
1886 }
1887 
1888 float16 int32_to_float16(int32_t a, float_status *status)
1889 {
1890     return int64_to_float16_scalbn(a, 0, status);
1891 }
1892 
1893 float16 int16_to_float16(int16_t a, float_status *status)
1894 {
1895     return int64_to_float16_scalbn(a, 0, status);
1896 }
1897 
1898 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
1899 {
1900     FloatParts pa = int_to_float(a, scale, status);
1901     return float32_round_pack_canonical(pa, status);
1902 }
1903 
1904 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
1905 {
1906     return int64_to_float32_scalbn(a, scale, status);
1907 }
1908 
1909 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
1910 {
1911     return int64_to_float32_scalbn(a, scale, status);
1912 }
1913 
1914 float32 int64_to_float32(int64_t a, float_status *status)
1915 {
1916     return int64_to_float32_scalbn(a, 0, status);
1917 }
1918 
1919 float32 int32_to_float32(int32_t a, float_status *status)
1920 {
1921     return int64_to_float32_scalbn(a, 0, status);
1922 }
1923 
1924 float32 int16_to_float32(int16_t a, float_status *status)
1925 {
1926     return int64_to_float32_scalbn(a, 0, status);
1927 }
1928 
1929 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
1930 {
1931     FloatParts pa = int_to_float(a, scale, status);
1932     return float64_round_pack_canonical(pa, status);
1933 }
1934 
1935 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
1936 {
1937     return int64_to_float64_scalbn(a, scale, status);
1938 }
1939 
1940 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
1941 {
1942     return int64_to_float64_scalbn(a, scale, status);
1943 }
1944 
1945 float64 int64_to_float64(int64_t a, float_status *status)
1946 {
1947     return int64_to_float64_scalbn(a, 0, status);
1948 }
1949 
1950 float64 int32_to_float64(int32_t a, float_status *status)
1951 {
1952     return int64_to_float64_scalbn(a, 0, status);
1953 }
1954 
1955 float64 int16_to_float64(int16_t a, float_status *status)
1956 {
1957     return int64_to_float64_scalbn(a, 0, status);
1958 }
1959 
1960 
1961 /*
1962  * Unsigned Integer to float conversions
1963  *
1964  * Returns the result of converting the unsigned integer `a' to the
1965  * floating-point format. The conversion is performed according to the
1966  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1967  */
1968 
1969 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
1970 {
1971     FloatParts r = { .sign = false };
1972 
1973     if (a == 0) {
1974         r.cls = float_class_zero;
1975     } else {
1976         scale = MIN(MAX(scale, -0x10000), 0x10000);
1977         r.cls = float_class_normal;
1978         if ((int64_t)a < 0) {
1979             r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
1980             shift64RightJamming(a, 1, &a);
1981             r.frac = a;
1982         } else {
1983             int shift = clz64(a) - 1;
1984             r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
1985             r.frac = a << shift;
1986         }
1987     }
1988 
1989     return r;
1990 }
1991 
1992 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
1993 {
1994     FloatParts pa = uint_to_float(a, scale, status);
1995     return float16_round_pack_canonical(pa, status);
1996 }
1997 
1998 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
1999 {
2000     return uint64_to_float16_scalbn(a, scale, status);
2001 }
2002 
2003 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2004 {
2005     return uint64_to_float16_scalbn(a, scale, status);
2006 }
2007 
2008 float16 uint64_to_float16(uint64_t a, float_status *status)
2009 {
2010     return uint64_to_float16_scalbn(a, 0, status);
2011 }
2012 
2013 float16 uint32_to_float16(uint32_t a, float_status *status)
2014 {
2015     return uint64_to_float16_scalbn(a, 0, status);
2016 }
2017 
2018 float16 uint16_to_float16(uint16_t a, float_status *status)
2019 {
2020     return uint64_to_float16_scalbn(a, 0, status);
2021 }
2022 
2023 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2024 {
2025     FloatParts pa = uint_to_float(a, scale, status);
2026     return float32_round_pack_canonical(pa, status);
2027 }
2028 
2029 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2030 {
2031     return uint64_to_float32_scalbn(a, scale, status);
2032 }
2033 
2034 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2035 {
2036     return uint64_to_float32_scalbn(a, scale, status);
2037 }
2038 
2039 float32 uint64_to_float32(uint64_t a, float_status *status)
2040 {
2041     return uint64_to_float32_scalbn(a, 0, status);
2042 }
2043 
2044 float32 uint32_to_float32(uint32_t a, float_status *status)
2045 {
2046     return uint64_to_float32_scalbn(a, 0, status);
2047 }
2048 
2049 float32 uint16_to_float32(uint16_t a, float_status *status)
2050 {
2051     return uint64_to_float32_scalbn(a, 0, status);
2052 }
2053 
2054 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2055 {
2056     FloatParts pa = uint_to_float(a, scale, status);
2057     return float64_round_pack_canonical(pa, status);
2058 }
2059 
2060 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2061 {
2062     return uint64_to_float64_scalbn(a, scale, status);
2063 }
2064 
2065 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2066 {
2067     return uint64_to_float64_scalbn(a, scale, status);
2068 }
2069 
2070 float64 uint64_to_float64(uint64_t a, float_status *status)
2071 {
2072     return uint64_to_float64_scalbn(a, 0, status);
2073 }
2074 
2075 float64 uint32_to_float64(uint32_t a, float_status *status)
2076 {
2077     return uint64_to_float64_scalbn(a, 0, status);
2078 }
2079 
2080 float64 uint16_to_float64(uint16_t a, float_status *status)
2081 {
2082     return uint64_to_float64_scalbn(a, 0, status);
2083 }
2084 
2085 /* Float Min/Max */
2086 /* min() and max() functions. These can't be implemented as
2087  * 'compare and pick one input' because that would mishandle
2088  * NaNs and +0 vs -0.
2089  *
2090  * minnum() and maxnum() functions. These are similar to the min()
2091  * and max() functions but if one of the arguments is a QNaN and
2092  * the other is numerical then the numerical argument is returned.
2093  * SNaNs will get quietened before being returned.
 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
 * and maxNum() operations. min() and max() are the typical min/max
 * semantics provided by many CPUs which predate that specification.
 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from IEEE 754-2008.
2100  */
2101 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2102                                 bool ieee, bool ismag, float_status *s)
2103 {
2104     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2105         if (ieee) {
2106             /* Takes two floating-point values `a' and `b', one of
2107              * which is a NaN, and returns the appropriate NaN
2108              * result. If either `a' or `b' is a signaling NaN,
2109              * the invalid exception is raised.
2110              */
2111             if (is_snan(a.cls) || is_snan(b.cls)) {
2112                 return pick_nan(a, b, s);
2113             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2114                 return b;
2115             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2116                 return a;
2117             }
2118         }
2119         return pick_nan(a, b, s);
2120     } else {
2121         int a_exp, b_exp;
2122 
2123         switch (a.cls) {
2124         case float_class_normal:
2125             a_exp = a.exp;
2126             break;
2127         case float_class_inf:
2128             a_exp = INT_MAX;
2129             break;
2130         case float_class_zero:
2131             a_exp = INT_MIN;
2132             break;
2133         default:
2134             g_assert_not_reached();
2135             break;
2136         }
2137         switch (b.cls) {
2138         case float_class_normal:
2139             b_exp = b.exp;
2140             break;
2141         case float_class_inf:
2142             b_exp = INT_MAX;
2143             break;
2144         case float_class_zero:
2145             b_exp = INT_MIN;
2146             break;
2147         default:
2148             g_assert_not_reached();
2149             break;
2150         }
2151 
2152         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2153             bool a_less = a_exp < b_exp;
2154             if (a_exp == b_exp) {
2155                 a_less = a.frac < b.frac;
2156             }
2157             return a_less ^ ismin ? b : a;
2158         }
2159 
2160         if (a.sign == b.sign) {
2161             bool a_less = a_exp < b_exp;
2162             if (a_exp == b_exp) {
2163                 a_less = a.frac < b.frac;
2164             }
2165             return a.sign ^ a_less ^ ismin ? b : a;
2166         } else {
2167             return a.sign ^ ismin ? b : a;
2168         }
2169     }
2170 }
2171 
2172 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
2173 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
2174                                      float_status *s)                   \
2175 {                                                                       \
2176     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2177     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2178     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
2179                                                                         \
2180     return float ## sz ## _round_pack_canonical(pr, s);                 \
2181 }
2182 
2183 MINMAX(16, min, true, false, false)
2184 MINMAX(16, minnum, true, true, false)
2185 MINMAX(16, minnummag, true, true, true)
2186 MINMAX(16, max, false, false, false)
2187 MINMAX(16, maxnum, false, true, false)
2188 MINMAX(16, maxnummag, false, true, true)
2189 
2190 MINMAX(32, min, true, false, false)
2191 MINMAX(32, minnum, true, true, false)
2192 MINMAX(32, minnummag, true, true, true)
2193 MINMAX(32, max, false, false, false)
2194 MINMAX(32, maxnum, false, true, false)
2195 MINMAX(32, maxnummag, false, true, true)
2196 
2197 MINMAX(64, min, true, false, false)
2198 MINMAX(64, minnum, true, true, false)
2199 MINMAX(64, minnummag, true, true, true)
2200 MINMAX(64, max, false, false, false)
2201 MINMAX(64, maxnum, false, true, false)
2202 MINMAX(64, maxnummag, false, true, true)
2203 
2204 #undef MINMAX
2205 
2206 /* Floating point compare */
2207 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2208                           float_status *s)
2209 {
2210     if (is_nan(a.cls) || is_nan(b.cls)) {
2211         if (!is_quiet ||
2212             a.cls == float_class_snan ||
2213             b.cls == float_class_snan) {
2214             s->float_exception_flags |= float_flag_invalid;
2215         }
2216         return float_relation_unordered;
2217     }
2218 
2219     if (a.cls == float_class_zero) {
2220         if (b.cls == float_class_zero) {
2221             return float_relation_equal;
2222         }
2223         return b.sign ? float_relation_greater : float_relation_less;
2224     } else if (b.cls == float_class_zero) {
2225         return a.sign ? float_relation_less : float_relation_greater;
2226     }
2227 
2228     /* The only really important thing about infinity is its sign. If
2229      * both are infinities the sign marks the smallest of the two.
2230      */
2231     if (a.cls == float_class_inf) {
2232         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2233             return float_relation_equal;
2234         }
2235         return a.sign ? float_relation_less : float_relation_greater;
2236     } else if (b.cls == float_class_inf) {
2237         return b.sign ? float_relation_greater : float_relation_less;
2238     }
2239 
2240     if (a.sign != b.sign) {
2241         return a.sign ? float_relation_less : float_relation_greater;
2242     }
2243 
2244     if (a.exp == b.exp) {
2245         if (a.frac == b.frac) {
2246             return float_relation_equal;
2247         }
2248         if (a.sign) {
2249             return a.frac > b.frac ?
2250                 float_relation_less : float_relation_greater;
2251         } else {
2252             return a.frac > b.frac ?
2253                 float_relation_greater : float_relation_less;
2254         }
2255     } else {
2256         if (a.sign) {
2257             return a.exp > b.exp ? float_relation_less : float_relation_greater;
2258         } else {
2259             return a.exp > b.exp ? float_relation_greater : float_relation_less;
2260         }
2261     }
2262 }
2263 
2264 #define COMPARE(sz)                                                     \
2265 int float ## sz ## _compare(float ## sz a, float ## sz b,               \
2266                             float_status *s)                            \
2267 {                                                                       \
2268     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2269     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2270     return compare_floats(pa, pb, false, s);                            \
2271 }                                                                       \
2272 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b,         \
2273                                   float_status *s)                      \
2274 {                                                                       \
2275     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2276     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2277     return compare_floats(pa, pb, true, s);                             \
2278 }
2279 
2280 COMPARE(16)
2281 COMPARE(32)
2282 COMPARE(64)
2283 
2284 #undef COMPARE
2285 
2286 /* Multiply A by 2 raised to the power N.  */
2287 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
2288 {
2289     if (unlikely(is_nan(a.cls))) {
2290         return return_nan(a, s);
2291     }
2292     if (a.cls == float_class_normal) {
2293         /* The largest float type (even though not supported by FloatParts)
2294          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
2295          * still allows rounding to infinity, without allowing overflow
2296          * within the int32_t that backs FloatParts.exp.
2297          */
2298         n = MIN(MAX(n, -0x10000), 0x10000);
2299         a.exp += n;
2300     }
2301     return a;
2302 }
2303 
2304 float16 float16_scalbn(float16 a, int n, float_status *status)
2305 {
2306     FloatParts pa = float16_unpack_canonical(a, status);
2307     FloatParts pr = scalbn_decomposed(pa, n, status);
2308     return float16_round_pack_canonical(pr, status);
2309 }
2310 
2311 float32 float32_scalbn(float32 a, int n, float_status *status)
2312 {
2313     FloatParts pa = float32_unpack_canonical(a, status);
2314     FloatParts pr = scalbn_decomposed(pa, n, status);
2315     return float32_round_pack_canonical(pr, status);
2316 }
2317 
2318 float64 float64_scalbn(float64 a, int n, float_status *status)
2319 {
2320     FloatParts pa = float64_unpack_canonical(a, status);
2321     FloatParts pr = scalbn_decomposed(pa, n, status);
2322     return float64_round_pack_canonical(pr, status);
2323 }
2324 
2325 /*
2326  * Square Root
2327  *
2328  * The old softfloat code did an approximation step before zeroing in
 * on the final result. However, for simplicity we just compute the
2330  * square root by iterating down from the implicit bit to enough extra
2331  * bits to ensure we get a correctly rounded result.
2332  *
2333  * This does mean however the calculation is slower than before,
2334  * especially for 64 bit floats.
2335  */
2336 
2337 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
2338 {
2339     uint64_t a_frac, r_frac, s_frac;
2340     int bit, last_bit;
2341 
2342     if (is_nan(a.cls)) {
2343         return return_nan(a, s);
2344     }
2345     if (a.cls == float_class_zero) {
2346         return a;  /* sqrt(+-0) = +-0 */
2347     }
2348     if (a.sign) {
2349         s->float_exception_flags |= float_flag_invalid;
2350         return parts_default_nan(s);
2351     }
2352     if (a.cls == float_class_inf) {
2353         return a;  /* sqrt(+inf) = +inf */
2354     }
2355 
2356     assert(a.cls == float_class_normal);
2357 
2358     /* We need two overflow bits at the top. Adding room for that is a
2359      * right shift. If the exponent is odd, we can discard the low bit
2360      * by multiplying the fraction by 2; that's a left shift. Combine
2361      * those and we shift right if the exponent is even.
2362      */
2363     a_frac = a.frac;
2364     if (!(a.exp & 1)) {
2365         a_frac >>= 1;
2366     }
2367     a.exp >>= 1;
2368 
2369     /* Bit-by-bit computation of sqrt.  */
2370     r_frac = 0;
2371     s_frac = 0;
2372 
2373     /* Iterate from implicit bit down to the 3 extra bits to compute a
2374      * properly rounded result. Remember we've inserted one more bit
2375      * at the top, so these positions are one less.
2376      */
2377     bit = DECOMPOSED_BINARY_POINT - 1;
2378     last_bit = MAX(p->frac_shift - 4, 0);
2379     do {
2380         uint64_t q = 1ULL << bit;
2381         uint64_t t_frac = s_frac + q;
2382         if (t_frac <= a_frac) {
2383             s_frac = t_frac + q;
2384             a_frac -= t_frac;
2385             r_frac += q;
2386         }
2387         a_frac <<= 1;
2388     } while (--bit >= last_bit);
2389 
2390     /* Undo the right shift done above. If there is any remaining
2391      * fraction, the result is inexact. Set the sticky bit.
2392      */
2393     a.frac = (r_frac << 1) + (a_frac != 0);
2394 
2395     return a;
2396 }
2397 
2398 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
2399 {
2400     FloatParts pa = float16_unpack_canonical(a, status);
2401     FloatParts pr = sqrt_float(pa, status, &float16_params);
2402     return float16_round_pack_canonical(pr, status);
2403 }
2404 
2405 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
2406 {
2407     FloatParts pa = float32_unpack_canonical(a, status);
2408     FloatParts pr = sqrt_float(pa, status, &float32_params);
2409     return float32_round_pack_canonical(pr, status);
2410 }
2411 
2412 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
2413 {
2414     FloatParts pa = float64_unpack_canonical(a, status);
2415     FloatParts pr = sqrt_float(pa, status, &float64_params);
2416     return float64_round_pack_canonical(pr, status);
2417 }
2418 
2419 /*----------------------------------------------------------------------------
2420 | The pattern for a default generated NaN.
2421 *----------------------------------------------------------------------------*/
2422 
2423 float16 float16_default_nan(float_status *status)
2424 {
2425     FloatParts p = parts_default_nan(status);
2426     p.frac >>= float16_params.frac_shift;
2427     return float16_pack_raw(p);
2428 }
2429 
2430 float32 float32_default_nan(float_status *status)
2431 {
2432     FloatParts p = parts_default_nan(status);
2433     p.frac >>= float32_params.frac_shift;
2434     return float32_pack_raw(p);
2435 }
2436 
2437 float64 float64_default_nan(float_status *status)
2438 {
2439     FloatParts p = parts_default_nan(status);
2440     p.frac >>= float64_params.frac_shift;
2441     return float64_pack_raw(p);
2442 }
2443 
2444 float128 float128_default_nan(float_status *status)
2445 {
2446     FloatParts p = parts_default_nan(status);
2447     float128 r;
2448 
2449     /* Extrapolate from the choices made by parts_default_nan to fill
2450      * in the quad-floating format.  If the low bit is set, assume we
2451      * want to set all non-snan bits.
2452      */
2453     r.low = -(p.frac & 1);
2454     r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
2455     r.high |= LIT64(0x7FFF000000000000);
2456     r.high |= (uint64_t)p.sign << 63;
2457 
2458     return r;
2459 }
2460 
2461 /*----------------------------------------------------------------------------
2462 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
2463 *----------------------------------------------------------------------------*/
2464 
2465 float16 float16_silence_nan(float16 a, float_status *status)
2466 {
2467     FloatParts p = float16_unpack_raw(a);
2468     p.frac <<= float16_params.frac_shift;
2469     p = parts_silence_nan(p, status);
2470     p.frac >>= float16_params.frac_shift;
2471     return float16_pack_raw(p);
2472 }
2473 
2474 float32 float32_silence_nan(float32 a, float_status *status)
2475 {
2476     FloatParts p = float32_unpack_raw(a);
2477     p.frac <<= float32_params.frac_shift;
2478     p = parts_silence_nan(p, status);
2479     p.frac >>= float32_params.frac_shift;
2480     return float32_pack_raw(p);
2481 }
2482 
2483 float64 float64_silence_nan(float64 a, float_status *status)
2484 {
2485     FloatParts p = float64_unpack_raw(a);
2486     p.frac <<= float64_params.frac_shift;
2487     p = parts_silence_nan(p, status);
2488     p.frac >>= float64_params.frac_shift;
2489     return float64_pack_raw(p);
2490 }
2491 
2492 /*----------------------------------------------------------------------------
2493 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2494 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2495 | input.  If `zSign' is 1, the input is negated before being converted to an
2496 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
2497 | is simply rounded to an integer, with the inexact exception raised if the
2498 | input cannot be represented exactly as an integer.  However, if the fixed-
2499 | point input is too large, the invalid exception is raised and the largest
2500 | positive or negative integer is returned.
2501 *----------------------------------------------------------------------------*/
2502 
2503 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2504 {
2505     int8_t roundingMode;
2506     flag roundNearestEven;
2507     int8_t roundIncrement, roundBits;
2508     int32_t z;
2509 
2510     roundingMode = status->float_rounding_mode;
2511     roundNearestEven = ( roundingMode == float_round_nearest_even );
2512     switch (roundingMode) {
2513     case float_round_nearest_even:
2514     case float_round_ties_away:
2515         roundIncrement = 0x40;
2516         break;
2517     case float_round_to_zero:
2518         roundIncrement = 0;
2519         break;
2520     case float_round_up:
2521         roundIncrement = zSign ? 0 : 0x7f;
2522         break;
2523     case float_round_down:
2524         roundIncrement = zSign ? 0x7f : 0;
2525         break;
2526     default:
2527         abort();
2528     }
2529     roundBits = absZ & 0x7F;
2530     absZ = ( absZ + roundIncrement )>>7;
2531     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2532     z = absZ;
2533     if ( zSign ) z = - z;
2534     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2535         float_raise(float_flag_invalid, status);
2536         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2537     }
2538     if (roundBits) {
2539         status->float_exception_flags |= float_flag_inexact;
2540     }
2541     return z;
2542 
2543 }
2544 
2545 /*----------------------------------------------------------------------------
2546 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2547 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2548 | and returns the properly rounded 64-bit integer corresponding to the input.
2549 | If `zSign' is 1, the input is negated before being converted to an integer.
2550 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2551 | the inexact exception raised if the input cannot be represented exactly as
2552 | an integer.  However, if the fixed-point input is too large, the invalid
2553 | exception is raised and the largest positive or negative integer is
2554 | returned.
2555 *----------------------------------------------------------------------------*/
2556 
2557 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2558                                float_status *status)
2559 {
2560     int8_t roundingMode;
2561     flag roundNearestEven, increment;
2562     int64_t z;
2563 
2564     roundingMode = status->float_rounding_mode;
2565     roundNearestEven = ( roundingMode == float_round_nearest_even );
2566     switch (roundingMode) {
2567     case float_round_nearest_even:
2568     case float_round_ties_away:
2569         increment = ((int64_t) absZ1 < 0);
2570         break;
2571     case float_round_to_zero:
2572         increment = 0;
2573         break;
2574     case float_round_up:
2575         increment = !zSign && absZ1;
2576         break;
2577     case float_round_down:
2578         increment = zSign && absZ1;
2579         break;
2580     default:
2581         abort();
2582     }
2583     if ( increment ) {
2584         ++absZ0;
2585         if ( absZ0 == 0 ) goto overflow;
2586         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2587     }
2588     z = absZ0;
2589     if ( zSign ) z = - z;
2590     if ( z && ( ( z < 0 ) ^ zSign ) ) {
2591  overflow:
2592         float_raise(float_flag_invalid, status);
2593         return
2594               zSign ? (int64_t) LIT64( 0x8000000000000000 )
2595             : LIT64( 0x7FFFFFFFFFFFFFFF );
2596     }
2597     if (absZ1) {
2598         status->float_exception_flags |= float_flag_inexact;
2599     }
2600     return z;
2601 
2602 }
2603 
2604 /*----------------------------------------------------------------------------
2605 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2606 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2607 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2608 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
2609 | with the inexact exception raised if the input cannot be represented exactly
2610 | as an integer.  However, if the fixed-point input is too large, the invalid
2611 | exception is raised and the largest unsigned integer is returned.
2612 *----------------------------------------------------------------------------*/
2613 
2614 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2615                                 uint64_t absZ1, float_status *status)
2616 {
2617     int8_t roundingMode;
2618     flag roundNearestEven, increment;
2619 
2620     roundingMode = status->float_rounding_mode;
2621     roundNearestEven = (roundingMode == float_round_nearest_even);
2622     switch (roundingMode) {
2623     case float_round_nearest_even:
2624     case float_round_ties_away:
2625         increment = ((int64_t)absZ1 < 0);
2626         break;
2627     case float_round_to_zero:
2628         increment = 0;
2629         break;
2630     case float_round_up:
2631         increment = !zSign && absZ1;
2632         break;
2633     case float_round_down:
2634         increment = zSign && absZ1;
2635         break;
2636     default:
2637         abort();
2638     }
2639     if (increment) {
2640         ++absZ0;
2641         if (absZ0 == 0) {
2642             float_raise(float_flag_invalid, status);
2643             return LIT64(0xFFFFFFFFFFFFFFFF);
2644         }
2645         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2646     }
2647 
2648     if (zSign && absZ0) {
2649         float_raise(float_flag_invalid, status);
2650         return 0;
2651     }
2652 
2653     if (absZ1) {
2654         status->float_exception_flags |= float_flag_inexact;
2655     }
2656     return absZ0;
2657 }
2658 
2659 /*----------------------------------------------------------------------------
2660 | If `a' is denormal and we are in flush-to-zero mode then set the
2661 | input-denormal exception and return zero. Otherwise just return the value.
2662 *----------------------------------------------------------------------------*/
2663 float32 float32_squash_input_denormal(float32 a, float_status *status)
2664 {
2665     if (status->flush_inputs_to_zero) {
2666         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2667             float_raise(float_flag_input_denormal, status);
2668             return make_float32(float32_val(a) & 0x80000000);
2669         }
2670     }
2671     return a;
2672 }
2673 
/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision significand `aSig' by shifting
| it left until its leading one sits in bit 23 (the shift is the leading-zero
| count minus 8).  The shifted significand is stored through `zSigPtr' and
| the matching exponent, 1 minus the shift distance, through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    const int8_t lshift = countLeadingZeros32(aSig) - 8;

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
2691 
2692 /*----------------------------------------------------------------------------
2693 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2694 | and significand `zSig', and returns the proper single-precision floating-
2695 | point value corresponding to the abstract input.  Ordinarily, the abstract
2696 | value is simply rounded and packed into the single-precision format, with
2697 | the inexact exception raised if the abstract input cannot be represented
2698 | exactly.  However, if the abstract value is too large, the overflow and
2699 | inexact exceptions are raised and an infinity or maximal finite value is
2700 | returned.  If the abstract value is too small, the input value is rounded to
2701 | a subnormal number, and the underflow and inexact exceptions are raised if
2702 | the abstract input cannot be represented exactly as a subnormal single-
2703 | precision floating-point number.
2704 |     The input significand `zSig' has its binary point between bits 30
2705 | and 29, which is 7 bits to the left of the usual location.  This shifted
2706 | significand must be normalized or smaller.  If `zSig' is not normalized,
2707 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2708 | and it must not require rounding.  In the usual case that `zSig' is
2709 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2710 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2711 | Binary Floating-Point Arithmetic.
2712 *----------------------------------------------------------------------------*/
2713 
2714 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2715                                    float_status *status)
2716 {
2717     int8_t roundingMode;
2718     flag roundNearestEven;
2719     int8_t roundIncrement, roundBits;
2720     flag isTiny;
2721 
2722     roundingMode = status->float_rounding_mode;
2723     roundNearestEven = ( roundingMode == float_round_nearest_even );
2724     switch (roundingMode) {
2725     case float_round_nearest_even:
2726     case float_round_ties_away:
2727         roundIncrement = 0x40;
2728         break;
2729     case float_round_to_zero:
2730         roundIncrement = 0;
2731         break;
2732     case float_round_up:
2733         roundIncrement = zSign ? 0 : 0x7f;
2734         break;
2735     case float_round_down:
2736         roundIncrement = zSign ? 0x7f : 0;
2737         break;
2738     default:
2739         abort();
2740         break;
2741     }
2742     roundBits = zSig & 0x7F;
2743     if ( 0xFD <= (uint16_t) zExp ) {
2744         if (    ( 0xFD < zExp )
2745              || (    ( zExp == 0xFD )
2746                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2747            ) {
2748             float_raise(float_flag_overflow | float_flag_inexact, status);
2749             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2750         }
2751         if ( zExp < 0 ) {
2752             if (status->flush_to_zero) {
2753                 float_raise(float_flag_output_denormal, status);
2754                 return packFloat32(zSign, 0, 0);
2755             }
2756             isTiny =
2757                 (status->float_detect_tininess
2758                  == float_tininess_before_rounding)
2759                 || ( zExp < -1 )
2760                 || ( zSig + roundIncrement < 0x80000000 );
2761             shift32RightJamming( zSig, - zExp, &zSig );
2762             zExp = 0;
2763             roundBits = zSig & 0x7F;
2764             if (isTiny && roundBits) {
2765                 float_raise(float_flag_underflow, status);
2766             }
2767         }
2768     }
2769     if (roundBits) {
2770         status->float_exception_flags |= float_flag_inexact;
2771     }
2772     zSig = ( zSig + roundIncrement )>>7;
2773     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2774     if ( zSig == 0 ) zExp = 0;
2775     return packFloat32( zSign, zExp, zSig );
2776 
2777 }
2778 
2779 /*----------------------------------------------------------------------------
2780 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2781 | and significand `zSig', and returns the proper single-precision floating-
2782 | point value corresponding to the abstract input.  This routine is just like
2783 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2784 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2785 | floating-point exponent.
2786 *----------------------------------------------------------------------------*/
2787 
2788 static float32
2789  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2790                               float_status *status)
2791 {
2792     int8_t shiftCount;
2793 
2794     shiftCount = countLeadingZeros32( zSig ) - 1;
2795     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2796                                status);
2797 
2798 }
2799 
2800 /*----------------------------------------------------------------------------
2801 | If `a' is denormal and we are in flush-to-zero mode then set the
2802 | input-denormal exception and return zero. Otherwise just return the value.
2803 *----------------------------------------------------------------------------*/
2804 float64 float64_squash_input_denormal(float64 a, float_status *status)
2805 {
2806     if (status->flush_inputs_to_zero) {
2807         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2808             float_raise(float_flag_input_denormal, status);
2809             return make_float64(float64_val(a) & (1ULL << 63));
2810         }
2811     }
2812     return a;
2813 }
2814 
/*----------------------------------------------------------------------------
| Normalizes the subnormal double-precision significand `aSig' by shifting
| it left until its leading one sits in bit 52 (the shift is the leading-zero
| count minus 11).  The shifted significand is stored through `zSigPtr' and
| the matching exponent, 1 minus the shift distance, through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    const int8_t lshift = countLeadingZeros64(aSig) - 11;

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
2832 
2833 /*----------------------------------------------------------------------------
2834 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2835 | double-precision floating-point value, returning the result.  After being
2836 | shifted into the proper positions, the three fields are simply added
2837 | together to form the result.  This means that any integer portion of `zSig'
2838 | will be added into the exponent.  Since a properly normalized significand
2839 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2840 | than the desired result exponent whenever `zSig' is a complete, normalized
2841 | significand.
2842 *----------------------------------------------------------------------------*/
2843 
2844 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2845 {
2846 
2847     return make_float64(
2848         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2849 
2850 }
2851 
2852 /*----------------------------------------------------------------------------
2853 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2854 | and significand `zSig', and returns the proper double-precision floating-
2855 | point value corresponding to the abstract input.  Ordinarily, the abstract
2856 | value is simply rounded and packed into the double-precision format, with
2857 | the inexact exception raised if the abstract input cannot be represented
2858 | exactly.  However, if the abstract value is too large, the overflow and
2859 | inexact exceptions are raised and an infinity or maximal finite value is
2860 | returned.  If the abstract value is too small, the input value is rounded to
2861 | a subnormal number, and the underflow and inexact exceptions are raised if
2862 | the abstract input cannot be represented exactly as a subnormal double-
2863 | precision floating-point number.
2864 |     The input significand `zSig' has its binary point between bits 62
2865 | and 61, which is 10 bits to the left of the usual location.  This shifted
2866 | significand must be normalized or smaller.  If `zSig' is not normalized,
2867 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2868 | and it must not require rounding.  In the usual case that `zSig' is
2869 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2870 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2871 | Binary Floating-Point Arithmetic.
2872 *----------------------------------------------------------------------------*/
2873 
2874 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2875                                    float_status *status)
2876 {
2877     int8_t roundingMode;
2878     flag roundNearestEven;
2879     int roundIncrement, roundBits;
2880     flag isTiny;
2881 
2882     roundingMode = status->float_rounding_mode;
2883     roundNearestEven = ( roundingMode == float_round_nearest_even );
2884     switch (roundingMode) {
2885     case float_round_nearest_even:
2886     case float_round_ties_away:
2887         roundIncrement = 0x200;
2888         break;
2889     case float_round_to_zero:
2890         roundIncrement = 0;
2891         break;
2892     case float_round_up:
2893         roundIncrement = zSign ? 0 : 0x3ff;
2894         break;
2895     case float_round_down:
2896         roundIncrement = zSign ? 0x3ff : 0;
2897         break;
2898     case float_round_to_odd:
2899         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2900         break;
2901     default:
2902         abort();
2903     }
2904     roundBits = zSig & 0x3FF;
2905     if ( 0x7FD <= (uint16_t) zExp ) {
2906         if (    ( 0x7FD < zExp )
2907              || (    ( zExp == 0x7FD )
2908                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2909            ) {
2910             bool overflow_to_inf = roundingMode != float_round_to_odd &&
2911                                    roundIncrement != 0;
2912             float_raise(float_flag_overflow | float_flag_inexact, status);
2913             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2914         }
2915         if ( zExp < 0 ) {
2916             if (status->flush_to_zero) {
2917                 float_raise(float_flag_output_denormal, status);
2918                 return packFloat64(zSign, 0, 0);
2919             }
2920             isTiny =
2921                    (status->float_detect_tininess
2922                     == float_tininess_before_rounding)
2923                 || ( zExp < -1 )
2924                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2925             shift64RightJamming( zSig, - zExp, &zSig );
2926             zExp = 0;
2927             roundBits = zSig & 0x3FF;
2928             if (isTiny && roundBits) {
2929                 float_raise(float_flag_underflow, status);
2930             }
2931             if (roundingMode == float_round_to_odd) {
2932                 /*
2933                  * For round-to-odd case, the roundIncrement depends on
2934                  * zSig which just changed.
2935                  */
2936                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2937             }
2938         }
2939     }
2940     if (roundBits) {
2941         status->float_exception_flags |= float_flag_inexact;
2942     }
2943     zSig = ( zSig + roundIncrement )>>10;
2944     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2945     if ( zSig == 0 ) zExp = 0;
2946     return packFloat64( zSign, zExp, zSig );
2947 
2948 }
2949 
2950 /*----------------------------------------------------------------------------
2951 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2952 | and significand `zSig', and returns the proper double-precision floating-
2953 | point value corresponding to the abstract input.  This routine is just like
2954 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2955 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2956 | floating-point exponent.
2957 *----------------------------------------------------------------------------*/
2958 
2959 static float64
2960  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2961                               float_status *status)
2962 {
2963     int8_t shiftCount;
2964 
2965     shiftCount = countLeadingZeros64( zSig ) - 1;
2966     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2967                                status);
2968 
2969 }
2970 
/*----------------------------------------------------------------------------
| Normalizes the subnormal extended double-precision significand `aSig' by
| shifting it left by its leading-zero count, so that the leading one lands
| in bit 63.  The shifted significand is stored through `zSigPtr' and the
| matching exponent, 1 minus the shift distance, through `zExpPtr'.
*----------------------------------------------------------------------------*/

void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    const int8_t lshift = countLeadingZeros64(aSig);

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
2987 
2988 /*----------------------------------------------------------------------------
2989 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2990 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2991 | and returns the proper extended double-precision floating-point value
2992 | corresponding to the abstract input.  Ordinarily, the abstract value is
2993 | rounded and packed into the extended double-precision format, with the
2994 | inexact exception raised if the abstract input cannot be represented
2995 | exactly.  However, if the abstract value is too large, the overflow and
2996 | inexact exceptions are raised and an infinity or maximal finite value is
2997 | returned.  If the abstract value is too small, the input value is rounded to
2998 | a subnormal number, and the underflow and inexact exceptions are raised if
2999 | the abstract input cannot be represented exactly as a subnormal extended
3000 | double-precision floating-point number.
3001 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
3002 | number of bits as single or double precision, respectively.  Otherwise, the
3003 | result is rounded to the full precision of the extended double-precision
3004 | format.
3005 |     The input significand must be normalized or smaller.  If the input
3006 | significand is not normalized, `zExp' must be 0; in that case, the result
3007 | returned is a subnormal number, and it must not require rounding.  The
3008 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3009 | Floating-Point Arithmetic.
3010 *----------------------------------------------------------------------------*/
3011 
3012 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
3013                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
3014                               float_status *status)
3015 {
3016     int8_t roundingMode;
3017     flag roundNearestEven, increment, isTiny;
3018     int64_t roundIncrement, roundMask, roundBits;
3019 
3020     roundingMode = status->float_rounding_mode;
3021     roundNearestEven = ( roundingMode == float_round_nearest_even );
3022     if ( roundingPrecision == 80 ) goto precision80;
3023     if ( roundingPrecision == 64 ) {
3024         roundIncrement = LIT64( 0x0000000000000400 );
3025         roundMask = LIT64( 0x00000000000007FF );
3026     }
3027     else if ( roundingPrecision == 32 ) {
3028         roundIncrement = LIT64( 0x0000008000000000 );
3029         roundMask = LIT64( 0x000000FFFFFFFFFF );
3030     }
3031     else {
3032         goto precision80;
3033     }
3034     zSig0 |= ( zSig1 != 0 );
3035     switch (roundingMode) {
3036     case float_round_nearest_even:
3037     case float_round_ties_away:
3038         break;
3039     case float_round_to_zero:
3040         roundIncrement = 0;
3041         break;
3042     case float_round_up:
3043         roundIncrement = zSign ? 0 : roundMask;
3044         break;
3045     case float_round_down:
3046         roundIncrement = zSign ? roundMask : 0;
3047         break;
3048     default:
3049         abort();
3050     }
3051     roundBits = zSig0 & roundMask;
3052     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3053         if (    ( 0x7FFE < zExp )
3054              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
3055            ) {
3056             goto overflow;
3057         }
3058         if ( zExp <= 0 ) {
3059             if (status->flush_to_zero) {
3060                 float_raise(float_flag_output_denormal, status);
3061                 return packFloatx80(zSign, 0, 0);
3062             }
3063             isTiny =
3064                    (status->float_detect_tininess
3065                     == float_tininess_before_rounding)
3066                 || ( zExp < 0 )
3067                 || ( zSig0 <= zSig0 + roundIncrement );
3068             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
3069             zExp = 0;
3070             roundBits = zSig0 & roundMask;
3071             if (isTiny && roundBits) {
3072                 float_raise(float_flag_underflow, status);
3073             }
3074             if (roundBits) {
3075                 status->float_exception_flags |= float_flag_inexact;
3076             }
3077             zSig0 += roundIncrement;
3078             if ( (int64_t) zSig0 < 0 ) zExp = 1;
3079             roundIncrement = roundMask + 1;
3080             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3081                 roundMask |= roundIncrement;
3082             }
3083             zSig0 &= ~ roundMask;
3084             return packFloatx80( zSign, zExp, zSig0 );
3085         }
3086     }
3087     if (roundBits) {
3088         status->float_exception_flags |= float_flag_inexact;
3089     }
3090     zSig0 += roundIncrement;
3091     if ( zSig0 < roundIncrement ) {
3092         ++zExp;
3093         zSig0 = LIT64( 0x8000000000000000 );
3094     }
3095     roundIncrement = roundMask + 1;
3096     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3097         roundMask |= roundIncrement;
3098     }
3099     zSig0 &= ~ roundMask;
3100     if ( zSig0 == 0 ) zExp = 0;
3101     return packFloatx80( zSign, zExp, zSig0 );
3102  precision80:
3103     switch (roundingMode) {
3104     case float_round_nearest_even:
3105     case float_round_ties_away:
3106         increment = ((int64_t)zSig1 < 0);
3107         break;
3108     case float_round_to_zero:
3109         increment = 0;
3110         break;
3111     case float_round_up:
3112         increment = !zSign && zSig1;
3113         break;
3114     case float_round_down:
3115         increment = zSign && zSig1;
3116         break;
3117     default:
3118         abort();
3119     }
3120     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3121         if (    ( 0x7FFE < zExp )
3122              || (    ( zExp == 0x7FFE )
3123                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
3124                   && increment
3125                 )
3126            ) {
3127             roundMask = 0;
3128  overflow:
3129             float_raise(float_flag_overflow | float_flag_inexact, status);
3130             if (    ( roundingMode == float_round_to_zero )
3131                  || ( zSign && ( roundingMode == float_round_up ) )
3132                  || ( ! zSign && ( roundingMode == float_round_down ) )
3133                ) {
3134                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
3135             }
3136             return packFloatx80(zSign,
3137                                 floatx80_infinity_high,
3138                                 floatx80_infinity_low);
3139         }
3140         if ( zExp <= 0 ) {
3141             isTiny =
3142                    (status->float_detect_tininess
3143                     == float_tininess_before_rounding)
3144                 || ( zExp < 0 )
3145                 || ! increment
3146                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
3147             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
3148             zExp = 0;
3149             if (isTiny && zSig1) {
3150                 float_raise(float_flag_underflow, status);
3151             }
3152             if (zSig1) {
3153                 status->float_exception_flags |= float_flag_inexact;
3154             }
3155             switch (roundingMode) {
3156             case float_round_nearest_even:
3157             case float_round_ties_away:
3158                 increment = ((int64_t)zSig1 < 0);
3159                 break;
3160             case float_round_to_zero:
3161                 increment = 0;
3162                 break;
3163             case float_round_up:
3164                 increment = !zSign && zSig1;
3165                 break;
3166             case float_round_down:
3167                 increment = zSign && zSig1;
3168                 break;
3169             default:
3170                 abort();
3171             }
3172             if ( increment ) {
3173                 ++zSig0;
3174                 zSig0 &=
3175                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3176                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
3177             }
3178             return packFloatx80( zSign, zExp, zSig0 );
3179         }
3180     }
3181     if (zSig1) {
3182         status->float_exception_flags |= float_flag_inexact;
3183     }
3184     if ( increment ) {
3185         ++zSig0;
3186         if ( zSig0 == 0 ) {
3187             ++zExp;
3188             zSig0 = LIT64( 0x8000000000000000 );
3189         }
3190         else {
3191             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3192         }
3193     }
3194     else {
3195         if ( zSig0 == 0 ) zExp = 0;
3196     }
3197     return packFloatx80( zSign, zExp, zSig0 );
3198 
3199 }
3200 
3201 /*----------------------------------------------------------------------------
3202 | Takes an abstract floating-point value having sign `zSign', exponent
3203 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
3204 | and returns the proper extended double-precision floating-point value
3205 | corresponding to the abstract input.  This routine is just like
3206 | `roundAndPackFloatx80' except that the input significand does not have to be
3207 | normalized.
3208 *----------------------------------------------------------------------------*/
3209 
3210 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
3211                                        flag zSign, int32_t zExp,
3212                                        uint64_t zSig0, uint64_t zSig1,
3213                                        float_status *status)
3214 {
3215     int8_t shiftCount;
3216 
3217     if ( zSig0 == 0 ) {
3218         zSig0 = zSig1;
3219         zSig1 = 0;
3220         zExp -= 64;
3221     }
3222     shiftCount = countLeadingZeros64( zSig0 );
3223     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3224     zExp -= shiftCount;
3225     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
3226                                 zSig0, zSig1, status);
3227 
3228 }
3229 
3230 /*----------------------------------------------------------------------------
3231 | Returns the least-significant 64 fraction bits of the quadruple-precision
3232 | floating-point value `a'.
3233 *----------------------------------------------------------------------------*/
3234 
3235 static inline uint64_t extractFloat128Frac1( float128 a )
3236 {
3237 
3238     return a.low;
3239 
3240 }
3241 
3242 /*----------------------------------------------------------------------------
3243 | Returns the most-significant 48 fraction bits of the quadruple-precision
3244 | floating-point value `a'.
3245 *----------------------------------------------------------------------------*/
3246 
3247 static inline uint64_t extractFloat128Frac0( float128 a )
3248 {
3249 
3250     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
3251 
3252 }
3253 
3254 /*----------------------------------------------------------------------------
3255 | Returns the exponent bits of the quadruple-precision floating-point value
3256 | `a'.
3257 *----------------------------------------------------------------------------*/
3258 
3259 static inline int32_t extractFloat128Exp( float128 a )
3260 {
3261 
3262     return ( a.high>>48 ) & 0x7FFF;
3263 
3264 }
3265 
3266 /*----------------------------------------------------------------------------
3267 | Returns the sign bit of the quadruple-precision floating-point value `a'.
3268 *----------------------------------------------------------------------------*/
3269 
3270 static inline flag extractFloat128Sign( float128 a )
3271 {
3272 
3273     return a.high>>63;
3274 
3275 }
3276 
/*----------------------------------------------------------------------------
| Normalizes the subnormal quadruple-precision significand formed by the
| concatenation of `aSig0' and `aSig1'.  The normalized exponent is stored
| through `zExpPtr', the most significant 49 bits of the normalized
| significand through `zSig0Ptr', and the least significant 64 bits through
| `zSig1Ptr'.
|     The shift distance is the leading-zero count of the first nonzero word
| minus 15.  When `aSig0' is zero the significand comes from `aSig1' alone,
| which may have to move right (negative shift distance) to fit the high
| word.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t lshift;

    if (aSig0 != 0) {
        /* Leading one is in the high word: a plain 128-bit left shift. */
        lshift = countLeadingZeros64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, lshift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - lshift;
        return;
    }

    /* High word empty: build the result from the low word only. */
    lshift = countLeadingZeros64(aSig1) - 15;
    if (lshift >= 0) {
        *zSig0Ptr = aSig1 << lshift;
        *zSig1Ptr = 0;
    } else {
        /* Fewer than 15 leading zeros: the value straddles both words. */
        *zSig0Ptr = aSig1 >> (-lshift);
        *zSig1Ptr = aSig1 << (lshift & 63);
    }
    *zExpPtr = -lshift - 63;
}
3317 
3318 /*----------------------------------------------------------------------------
3319 | Packs the sign `zSign', the exponent `zExp', and the significand formed
3320 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
3321 | floating-point value, returning the result.  After being shifted into the
3322 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
3323 | added together to form the most significant 32 bits of the result.  This
3324 | means that any integer portion of `zSig0' will be added into the exponent.
3325 | Since a properly normalized significand will have an integer portion equal
3326 | to 1, the `zExp' input should be 1 less than the desired result exponent
3327 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
3328 | significand.
3329 *----------------------------------------------------------------------------*/
3330 
3331 static inline float128
3332  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
3333 {
3334     float128 z;
3335 
3336     z.low = zSig1;
3337     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
3338     return z;
3339 
3340 }
3341 
3342 /*----------------------------------------------------------------------------
3343 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3344 | and extended significand formed by the concatenation of `zSig0', `zSig1',
3345 | and `zSig2', and returns the proper quadruple-precision floating-point value
3346 | corresponding to the abstract input.  Ordinarily, the abstract value is
3347 | simply rounded and packed into the quadruple-precision format, with the
3348 | inexact exception raised if the abstract input cannot be represented
3349 | exactly.  However, if the abstract value is too large, the overflow and
3350 | inexact exceptions are raised and an infinity or maximal finite value is
3351 | returned.  If the abstract value is too small, the input value is rounded to
3352 | a subnormal number, and the underflow and inexact exceptions are raised if
3353 | the abstract input cannot be represented exactly as a subnormal quadruple-
3354 | precision floating-point number.
3355 |     The input significand must be normalized or smaller.  If the input
3356 | significand is not normalized, `zExp' must be 0; in that case, the result
3357 | returned is a subnormal number, and it must not require rounding.  In the
3358 | usual case that the input significand is normalized, `zExp' must be 1 less
3359 | than the ``true'' floating-point exponent.  The handling of underflow and
3360 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3361 *----------------------------------------------------------------------------*/
3362 
3363 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
3364                                      uint64_t zSig0, uint64_t zSig1,
3365                                      uint64_t zSig2, float_status *status)
3366 {
3367     int8_t roundingMode;
3368     flag roundNearestEven, increment, isTiny;
3369 
3370     roundingMode = status->float_rounding_mode;
3371     roundNearestEven = ( roundingMode == float_round_nearest_even );
3372     switch (roundingMode) {
3373     case float_round_nearest_even:
3374     case float_round_ties_away:
3375         increment = ((int64_t)zSig2 < 0);
3376         break;
3377     case float_round_to_zero:
3378         increment = 0;
3379         break;
3380     case float_round_up:
3381         increment = !zSign && zSig2;
3382         break;
3383     case float_round_down:
3384         increment = zSign && zSig2;
3385         break;
3386     case float_round_to_odd:
3387         increment = !(zSig1 & 0x1) && zSig2;
3388         break;
3389     default:
3390         abort();
3391     }
3392     if ( 0x7FFD <= (uint32_t) zExp ) {
3393         if (    ( 0x7FFD < zExp )
3394              || (    ( zExp == 0x7FFD )
3395                   && eq128(
3396                          LIT64( 0x0001FFFFFFFFFFFF ),
3397                          LIT64( 0xFFFFFFFFFFFFFFFF ),
3398                          zSig0,
3399                          zSig1
3400                      )
3401                   && increment
3402                 )
3403            ) {
3404             float_raise(float_flag_overflow | float_flag_inexact, status);
3405             if (    ( roundingMode == float_round_to_zero )
3406                  || ( zSign && ( roundingMode == float_round_up ) )
3407                  || ( ! zSign && ( roundingMode == float_round_down ) )
3408                  || (roundingMode == float_round_to_odd)
3409                ) {
3410                 return
3411                     packFloat128(
3412                         zSign,
3413                         0x7FFE,
3414                         LIT64( 0x0000FFFFFFFFFFFF ),
3415                         LIT64( 0xFFFFFFFFFFFFFFFF )
3416                     );
3417             }
3418             return packFloat128( zSign, 0x7FFF, 0, 0 );
3419         }
3420         if ( zExp < 0 ) {
3421             if (status->flush_to_zero) {
3422                 float_raise(float_flag_output_denormal, status);
3423                 return packFloat128(zSign, 0, 0, 0);
3424             }
3425             isTiny =
3426                    (status->float_detect_tininess
3427                     == float_tininess_before_rounding)
3428                 || ( zExp < -1 )
3429                 || ! increment
3430                 || lt128(
3431                        zSig0,
3432                        zSig1,
3433                        LIT64( 0x0001FFFFFFFFFFFF ),
3434                        LIT64( 0xFFFFFFFFFFFFFFFF )
3435                    );
3436             shift128ExtraRightJamming(
3437                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
3438             zExp = 0;
3439             if (isTiny && zSig2) {
3440                 float_raise(float_flag_underflow, status);
3441             }
3442             switch (roundingMode) {
3443             case float_round_nearest_even:
3444             case float_round_ties_away:
3445                 increment = ((int64_t)zSig2 < 0);
3446                 break;
3447             case float_round_to_zero:
3448                 increment = 0;
3449                 break;
3450             case float_round_up:
3451                 increment = !zSign && zSig2;
3452                 break;
3453             case float_round_down:
3454                 increment = zSign && zSig2;
3455                 break;
3456             case float_round_to_odd:
3457                 increment = !(zSig1 & 0x1) && zSig2;
3458                 break;
3459             default:
3460                 abort();
3461             }
3462         }
3463     }
3464     if (zSig2) {
3465         status->float_exception_flags |= float_flag_inexact;
3466     }
3467     if ( increment ) {
3468         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
3469         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
3470     }
3471     else {
3472         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
3473     }
3474     return packFloat128( zSign, zExp, zSig0, zSig1 );
3475 
3476 }
3477 
3478 /*----------------------------------------------------------------------------
3479 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3480 | and significand formed by the concatenation of `zSig0' and `zSig1', and
3481 | returns the proper quadruple-precision floating-point value corresponding
3482 | to the abstract input.  This routine is just like `roundAndPackFloat128'
3483 | except that the input significand has fewer bits and does not have to be
3484 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
3485 | point exponent.
3486 *----------------------------------------------------------------------------*/
3487 
3488 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3489                                               uint64_t zSig0, uint64_t zSig1,
3490                                               float_status *status)
3491 {
3492     int8_t shiftCount;
3493     uint64_t zSig2;
3494 
3495     if ( zSig0 == 0 ) {
3496         zSig0 = zSig1;
3497         zSig1 = 0;
3498         zExp -= 64;
3499     }
3500     shiftCount = countLeadingZeros64( zSig0 ) - 15;
3501     if ( 0 <= shiftCount ) {
3502         zSig2 = 0;
3503         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3504     }
3505     else {
3506         shift128ExtraRightJamming(
3507             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3508     }
3509     zExp -= shiftCount;
3510     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3511 
3512 }
3513 
3514 
3515 /*----------------------------------------------------------------------------
3516 | Returns the result of converting the 32-bit two's complement integer `a'
3517 | to the extended double-precision floating-point format.  The conversion
3518 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3519 | Arithmetic.
3520 *----------------------------------------------------------------------------*/
3521 
3522 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3523 {
3524     flag zSign;
3525     uint32_t absA;
3526     int8_t shiftCount;
3527     uint64_t zSig;
3528 
3529     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3530     zSign = ( a < 0 );
3531     absA = zSign ? - a : a;
3532     shiftCount = countLeadingZeros32( absA ) + 32;
3533     zSig = absA;
3534     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3535 
3536 }
3537 
3538 /*----------------------------------------------------------------------------
3539 | Returns the result of converting the 32-bit two's complement integer `a' to
3540 | the quadruple-precision floating-point format.  The conversion is performed
3541 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3542 *----------------------------------------------------------------------------*/
3543 
3544 float128 int32_to_float128(int32_t a, float_status *status)
3545 {
3546     flag zSign;
3547     uint32_t absA;
3548     int8_t shiftCount;
3549     uint64_t zSig0;
3550 
3551     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3552     zSign = ( a < 0 );
3553     absA = zSign ? - a : a;
3554     shiftCount = countLeadingZeros32( absA ) + 17;
3555     zSig0 = absA;
3556     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3557 
3558 }
3559 
3560 /*----------------------------------------------------------------------------
3561 | Returns the result of converting the 64-bit two's complement integer `a'
3562 | to the extended double-precision floating-point format.  The conversion
3563 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3564 | Arithmetic.
3565 *----------------------------------------------------------------------------*/
3566 
3567 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3568 {
3569     flag zSign;
3570     uint64_t absA;
3571     int8_t shiftCount;
3572 
3573     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3574     zSign = ( a < 0 );
3575     absA = zSign ? - a : a;
3576     shiftCount = countLeadingZeros64( absA );
3577     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3578 
3579 }
3580 
3581 /*----------------------------------------------------------------------------
3582 | Returns the result of converting the 64-bit two's complement integer `a' to
3583 | the quadruple-precision floating-point format.  The conversion is performed
3584 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3585 *----------------------------------------------------------------------------*/
3586 
3587 float128 int64_to_float128(int64_t a, float_status *status)
3588 {
3589     flag zSign;
3590     uint64_t absA;
3591     int8_t shiftCount;
3592     int32_t zExp;
3593     uint64_t zSig0, zSig1;
3594 
3595     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3596     zSign = ( a < 0 );
3597     absA = zSign ? - a : a;
3598     shiftCount = countLeadingZeros64( absA ) + 49;
3599     zExp = 0x406E - shiftCount;
3600     if ( 64 <= shiftCount ) {
3601         zSig1 = 0;
3602         zSig0 = absA;
3603         shiftCount -= 64;
3604     }
3605     else {
3606         zSig1 = absA;
3607         zSig0 = 0;
3608     }
3609     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3610     return packFloat128( zSign, zExp, zSig0, zSig1 );
3611 
3612 }
3613 
3614 /*----------------------------------------------------------------------------
3615 | Returns the result of converting the 64-bit unsigned integer `a'
3616 | to the quadruple-precision floating-point format.  The conversion is performed
3617 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3618 *----------------------------------------------------------------------------*/
3619 
3620 float128 uint64_to_float128(uint64_t a, float_status *status)
3621 {
3622     if (a == 0) {
3623         return float128_zero;
3624     }
3625     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
3626 }
3627 
3628 /*----------------------------------------------------------------------------
3629 | Returns the result of converting the single-precision floating-point value
3630 | `a' to the extended double-precision floating-point format.  The conversion
3631 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3632 | Arithmetic.
3633 *----------------------------------------------------------------------------*/
3634 
3635 floatx80 float32_to_floatx80(float32 a, float_status *status)
3636 {
3637     flag aSign;
3638     int aExp;
3639     uint32_t aSig;
3640 
3641     a = float32_squash_input_denormal(a, status);
3642     aSig = extractFloat32Frac( a );
3643     aExp = extractFloat32Exp( a );
3644     aSign = extractFloat32Sign( a );
3645     if ( aExp == 0xFF ) {
3646         if (aSig) {
3647             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3648         }
3649         return packFloatx80(aSign,
3650                             floatx80_infinity_high,
3651                             floatx80_infinity_low);
3652     }
3653     if ( aExp == 0 ) {
3654         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3655         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3656     }
3657     aSig |= 0x00800000;
3658     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3659 
3660 }
3661 
3662 /*----------------------------------------------------------------------------
3663 | Returns the result of converting the single-precision floating-point value
3664 | `a' to the quadruple-precision floating-point format.  The conversion is
3665 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3666 | Arithmetic.
3667 *----------------------------------------------------------------------------*/
3668 
3669 float128 float32_to_float128(float32 a, float_status *status)
3670 {
3671     flag aSign;
3672     int aExp;
3673     uint32_t aSig;
3674 
3675     a = float32_squash_input_denormal(a, status);
3676     aSig = extractFloat32Frac( a );
3677     aExp = extractFloat32Exp( a );
3678     aSign = extractFloat32Sign( a );
3679     if ( aExp == 0xFF ) {
3680         if (aSig) {
3681             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3682         }
3683         return packFloat128( aSign, 0x7FFF, 0, 0 );
3684     }
3685     if ( aExp == 0 ) {
3686         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3687         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3688         --aExp;
3689     }
3690     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3691 
3692 }
3693 
3694 /*----------------------------------------------------------------------------
3695 | Returns the remainder of the single-precision floating-point value `a'
3696 | with respect to the corresponding value `b'.  The operation is performed
3697 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3698 *----------------------------------------------------------------------------*/
3699 
3700 float32 float32_rem(float32 a, float32 b, float_status *status)
3701 {
3702     flag aSign, zSign;
3703     int aExp, bExp, expDiff;
3704     uint32_t aSig, bSig;
3705     uint32_t q;
3706     uint64_t aSig64, bSig64, q64;
3707     uint32_t alternateASig;
3708     int32_t sigMean;
3709     a = float32_squash_input_denormal(a, status);
3710     b = float32_squash_input_denormal(b, status);
3711 
3712     aSig = extractFloat32Frac( a );
3713     aExp = extractFloat32Exp( a );
3714     aSign = extractFloat32Sign( a );
3715     bSig = extractFloat32Frac( b );
3716     bExp = extractFloat32Exp( b );
3717     if ( aExp == 0xFF ) {
3718         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3719             return propagateFloat32NaN(a, b, status);
3720         }
3721         float_raise(float_flag_invalid, status);
3722         return float32_default_nan(status);
3723     }
3724     if ( bExp == 0xFF ) {
3725         if (bSig) {
3726             return propagateFloat32NaN(a, b, status);
3727         }
3728         return a;
3729     }
3730     if ( bExp == 0 ) {
3731         if ( bSig == 0 ) {
3732             float_raise(float_flag_invalid, status);
3733             return float32_default_nan(status);
3734         }
3735         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3736     }
3737     if ( aExp == 0 ) {
3738         if ( aSig == 0 ) return a;
3739         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3740     }
3741     expDiff = aExp - bExp;
3742     aSig |= 0x00800000;
3743     bSig |= 0x00800000;
3744     if ( expDiff < 32 ) {
3745         aSig <<= 8;
3746         bSig <<= 8;
3747         if ( expDiff < 0 ) {
3748             if ( expDiff < -1 ) return a;
3749             aSig >>= 1;
3750         }
3751         q = ( bSig <= aSig );
3752         if ( q ) aSig -= bSig;
3753         if ( 0 < expDiff ) {
3754             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3755             q >>= 32 - expDiff;
3756             bSig >>= 2;
3757             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3758         }
3759         else {
3760             aSig >>= 2;
3761             bSig >>= 2;
3762         }
3763     }
3764     else {
3765         if ( bSig <= aSig ) aSig -= bSig;
3766         aSig64 = ( (uint64_t) aSig )<<40;
3767         bSig64 = ( (uint64_t) bSig )<<40;
3768         expDiff -= 64;
3769         while ( 0 < expDiff ) {
3770             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3771             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3772             aSig64 = - ( ( bSig * q64 )<<38 );
3773             expDiff -= 62;
3774         }
3775         expDiff += 64;
3776         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3777         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3778         q = q64>>( 64 - expDiff );
3779         bSig <<= 6;
3780         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3781     }
3782     do {
3783         alternateASig = aSig;
3784         ++q;
3785         aSig -= bSig;
3786     } while ( 0 <= (int32_t) aSig );
3787     sigMean = aSig + alternateASig;
3788     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3789         aSig = alternateASig;
3790     }
3791     zSign = ( (int32_t) aSig < 0 );
3792     if ( zSign ) aSig = - aSig;
3793     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3794 }
3795 
3796 
3797 
3798 /*----------------------------------------------------------------------------
3799 | Returns the binary exponential of the single-precision floating-point value
3800 | `a'. The operation is performed according to the IEC/IEEE Standard for
3801 | Binary Floating-Point Arithmetic.
3802 |
3803 | Uses the following identities:
3804 |
3805 | 1. -------------------------------------------------------------------------
3806 |      x    x*ln(2)
3807 |     2  = e
3808 |
3809 | 2. -------------------------------------------------------------------------
3810 |                      2     3     4     5           n
3811 |      x        x     x     x     x     x           x
3812 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3813 |               1!    2!    3!    4!    5!          n!
3814 *----------------------------------------------------------------------------*/
3815 
3816 static const float64 float32_exp2_coefficients[15] =
3817 {
3818     const_float64( 0x3ff0000000000000ll ), /*  1 */
3819     const_float64( 0x3fe0000000000000ll ), /*  2 */
3820     const_float64( 0x3fc5555555555555ll ), /*  3 */
3821     const_float64( 0x3fa5555555555555ll ), /*  4 */
3822     const_float64( 0x3f81111111111111ll ), /*  5 */
3823     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
3824     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
3825     const_float64( 0x3efa01a01a01a01all ), /*  8 */
3826     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
3827     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3828     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3829     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3830     const_float64( 0x3de6124613a86d09ll ), /* 13 */
3831     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3832     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3833 };
3834 
3835 float32 float32_exp2(float32 a, float_status *status)
3836 {
3837     flag aSign;
3838     int aExp;
3839     uint32_t aSig;
3840     float64 r, x, xn;
3841     int i;
3842     a = float32_squash_input_denormal(a, status);
3843 
3844     aSig = extractFloat32Frac( a );
3845     aExp = extractFloat32Exp( a );
3846     aSign = extractFloat32Sign( a );
3847 
3848     if ( aExp == 0xFF) {
3849         if (aSig) {
3850             return propagateFloat32NaN(a, float32_zero, status);
3851         }
3852         return (aSign) ? float32_zero : a;
3853     }
3854     if (aExp == 0) {
3855         if (aSig == 0) return float32_one;
3856     }
3857 
3858     float_raise(float_flag_inexact, status);
3859 
3860     /* ******************************* */
3861     /* using float64 for approximation */
3862     /* ******************************* */
3863     x = float32_to_float64(a, status);
3864     x = float64_mul(x, float64_ln2, status);
3865 
3866     xn = x;
3867     r = float64_one;
3868     for (i = 0 ; i < 15 ; i++) {
3869         float64 f;
3870 
3871         f = float64_mul(xn, float32_exp2_coefficients[i], status);
3872         r = float64_add(r, f, status);
3873 
3874         xn = float64_mul(xn, x, status);
3875     }
3876 
3877     return float64_to_float32(r, status);
3878 }
3879 
3880 /*----------------------------------------------------------------------------
3881 | Returns the binary log of the single-precision floating-point value `a'.
3882 | The operation is performed according to the IEC/IEEE Standard for Binary
3883 | Floating-Point Arithmetic.
3884 *----------------------------------------------------------------------------*/
3885 float32 float32_log2(float32 a, float_status *status)
3886 {
3887     flag aSign, zSign;
3888     int aExp;
3889     uint32_t aSig, zSig, i;
3890 
3891     a = float32_squash_input_denormal(a, status);
3892     aSig = extractFloat32Frac( a );
3893     aExp = extractFloat32Exp( a );
3894     aSign = extractFloat32Sign( a );
3895 
3896     if ( aExp == 0 ) {
3897         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3898         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3899     }
3900     if ( aSign ) {
3901         float_raise(float_flag_invalid, status);
3902         return float32_default_nan(status);
3903     }
3904     if ( aExp == 0xFF ) {
3905         if (aSig) {
3906             return propagateFloat32NaN(a, float32_zero, status);
3907         }
3908         return a;
3909     }
3910 
3911     aExp -= 0x7F;
3912     aSig |= 0x00800000;
3913     zSign = aExp < 0;
3914     zSig = aExp << 23;
3915 
3916     for (i = 1 << 22; i > 0; i >>= 1) {
3917         aSig = ( (uint64_t)aSig * aSig ) >> 23;
3918         if ( aSig & 0x01000000 ) {
3919             aSig >>= 1;
3920             zSig |= i;
3921         }
3922     }
3923 
3924     if ( zSign )
3925         zSig = -zSig;
3926 
3927     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3928 }
3929 
3930 /*----------------------------------------------------------------------------
3931 | Returns 1 if the single-precision floating-point value `a' is equal to
3932 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3933 | raised if either operand is a NaN.  Otherwise, the comparison is performed
3934 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3935 *----------------------------------------------------------------------------*/
3936 
3937 int float32_eq(float32 a, float32 b, float_status *status)
3938 {
3939     uint32_t av, bv;
3940     a = float32_squash_input_denormal(a, status);
3941     b = float32_squash_input_denormal(b, status);
3942 
3943     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3944          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3945        ) {
3946         float_raise(float_flag_invalid, status);
3947         return 0;
3948     }
3949     av = float32_val(a);
3950     bv = float32_val(b);
3951     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3952 }
3953 
3954 /*----------------------------------------------------------------------------
3955 | Returns 1 if the single-precision floating-point value `a' is less than
3956 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
3957 | exception is raised if either operand is a NaN.  The comparison is performed
3958 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3959 *----------------------------------------------------------------------------*/
3960 
3961 int float32_le(float32 a, float32 b, float_status *status)
3962 {
3963     flag aSign, bSign;
3964     uint32_t av, bv;
3965     a = float32_squash_input_denormal(a, status);
3966     b = float32_squash_input_denormal(b, status);
3967 
3968     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3969          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3970        ) {
3971         float_raise(float_flag_invalid, status);
3972         return 0;
3973     }
3974     aSign = extractFloat32Sign( a );
3975     bSign = extractFloat32Sign( b );
3976     av = float32_val(a);
3977     bv = float32_val(b);
3978     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3979     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3980 
3981 }
3982 
3983 /*----------------------------------------------------------------------------
3984 | Returns 1 if the single-precision floating-point value `a' is less than
3985 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3986 | raised if either operand is a NaN.  The comparison is performed according
3987 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3988 *----------------------------------------------------------------------------*/
3989 
3990 int float32_lt(float32 a, float32 b, float_status *status)
3991 {
3992     flag aSign, bSign;
3993     uint32_t av, bv;
3994     a = float32_squash_input_denormal(a, status);
3995     b = float32_squash_input_denormal(b, status);
3996 
3997     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3998          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3999        ) {
4000         float_raise(float_flag_invalid, status);
4001         return 0;
4002     }
4003     aSign = extractFloat32Sign( a );
4004     bSign = extractFloat32Sign( b );
4005     av = float32_val(a);
4006     bv = float32_val(b);
4007     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4008     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4009 
4010 }
4011 
4012 /*----------------------------------------------------------------------------
4013 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4014 | be compared, and 0 otherwise.  The invalid exception is raised if either
4015 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4016 | Standard for Binary Floating-Point Arithmetic.
4017 *----------------------------------------------------------------------------*/
4018 
4019 int float32_unordered(float32 a, float32 b, float_status *status)
4020 {
4021     a = float32_squash_input_denormal(a, status);
4022     b = float32_squash_input_denormal(b, status);
4023 
4024     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4025          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4026        ) {
4027         float_raise(float_flag_invalid, status);
4028         return 1;
4029     }
4030     return 0;
4031 }
4032 
4033 /*----------------------------------------------------------------------------
4034 | Returns 1 if the single-precision floating-point value `a' is equal to
4035 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4036 | exception.  The comparison is performed according to the IEC/IEEE Standard
4037 | for Binary Floating-Point Arithmetic.
4038 *----------------------------------------------------------------------------*/
4039 
4040 int float32_eq_quiet(float32 a, float32 b, float_status *status)
4041 {
4042     a = float32_squash_input_denormal(a, status);
4043     b = float32_squash_input_denormal(b, status);
4044 
4045     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4046          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4047        ) {
4048         if (float32_is_signaling_nan(a, status)
4049          || float32_is_signaling_nan(b, status)) {
4050             float_raise(float_flag_invalid, status);
4051         }
4052         return 0;
4053     }
4054     return ( float32_val(a) == float32_val(b) ) ||
4055             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
4056 }
4057 
4058 /*----------------------------------------------------------------------------
4059 | Returns 1 if the single-precision floating-point value `a' is less than or
4060 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4061 | cause an exception.  Otherwise, the comparison is performed according to the
4062 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4063 *----------------------------------------------------------------------------*/
4064 
4065 int float32_le_quiet(float32 a, float32 b, float_status *status)
4066 {
4067     flag aSign, bSign;
4068     uint32_t av, bv;
4069     a = float32_squash_input_denormal(a, status);
4070     b = float32_squash_input_denormal(b, status);
4071 
4072     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4073          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4074        ) {
4075         if (float32_is_signaling_nan(a, status)
4076          || float32_is_signaling_nan(b, status)) {
4077             float_raise(float_flag_invalid, status);
4078         }
4079         return 0;
4080     }
4081     aSign = extractFloat32Sign( a );
4082     bSign = extractFloat32Sign( b );
4083     av = float32_val(a);
4084     bv = float32_val(b);
4085     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4086     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4087 
4088 }
4089 
4090 /*----------------------------------------------------------------------------
4091 | Returns 1 if the single-precision floating-point value `a' is less than
4092 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4093 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4094 | Standard for Binary Floating-Point Arithmetic.
4095 *----------------------------------------------------------------------------*/
4096 
4097 int float32_lt_quiet(float32 a, float32 b, float_status *status)
4098 {
4099     flag aSign, bSign;
4100     uint32_t av, bv;
4101     a = float32_squash_input_denormal(a, status);
4102     b = float32_squash_input_denormal(b, status);
4103 
4104     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4105          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4106        ) {
4107         if (float32_is_signaling_nan(a, status)
4108          || float32_is_signaling_nan(b, status)) {
4109             float_raise(float_flag_invalid, status);
4110         }
4111         return 0;
4112     }
4113     aSign = extractFloat32Sign( a );
4114     bSign = extractFloat32Sign( b );
4115     av = float32_val(a);
4116     bv = float32_val(b);
4117     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4118     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4119 
4120 }
4121 
4122 /*----------------------------------------------------------------------------
4123 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4124 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4125 | comparison is performed according to the IEC/IEEE Standard for Binary
4126 | Floating-Point Arithmetic.
4127 *----------------------------------------------------------------------------*/
4128 
4129 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
4130 {
4131     a = float32_squash_input_denormal(a, status);
4132     b = float32_squash_input_denormal(b, status);
4133 
4134     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4135          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4136        ) {
4137         if (float32_is_signaling_nan(a, status)
4138          || float32_is_signaling_nan(b, status)) {
4139             float_raise(float_flag_invalid, status);
4140         }
4141         return 1;
4142     }
4143     return 0;
4144 }
4145 
4146 /*----------------------------------------------------------------------------
4147 | If `a' is denormal and we are in flush-to-zero mode then set the
4148 | input-denormal exception and return zero. Otherwise just return the value.
4149 *----------------------------------------------------------------------------*/
4150 float16 float16_squash_input_denormal(float16 a, float_status *status)
4151 {
4152     if (status->flush_inputs_to_zero) {
4153         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
4154             float_raise(float_flag_input_denormal, status);
4155             return make_float16(float16_val(a) & 0x8000);
4156         }
4157     }
4158     return a;
4159 }
4160 
4161 /*----------------------------------------------------------------------------
4162 | Returns the result of converting the double-precision floating-point value
4163 | `a' to the extended double-precision floating-point format.  The conversion
4164 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4165 | Arithmetic.
4166 *----------------------------------------------------------------------------*/
4167 
4168 floatx80 float64_to_floatx80(float64 a, float_status *status)
4169 {
4170     flag aSign;
4171     int aExp;
4172     uint64_t aSig;
4173 
4174     a = float64_squash_input_denormal(a, status);
4175     aSig = extractFloat64Frac( a );
4176     aExp = extractFloat64Exp( a );
4177     aSign = extractFloat64Sign( a );
4178     if ( aExp == 0x7FF ) {
4179         if (aSig) {
4180             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4181         }
4182         return packFloatx80(aSign,
4183                             floatx80_infinity_high,
4184                             floatx80_infinity_low);
4185     }
4186     if ( aExp == 0 ) {
4187         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4188         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4189     }
4190     return
4191         packFloatx80(
4192             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4193 
4194 }
4195 
4196 /*----------------------------------------------------------------------------
4197 | Returns the result of converting the double-precision floating-point value
4198 | `a' to the quadruple-precision floating-point format.  The conversion is
4199 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4200 | Arithmetic.
4201 *----------------------------------------------------------------------------*/
4202 
4203 float128 float64_to_float128(float64 a, float_status *status)
4204 {
4205     flag aSign;
4206     int aExp;
4207     uint64_t aSig, zSig0, zSig1;
4208 
4209     a = float64_squash_input_denormal(a, status);
4210     aSig = extractFloat64Frac( a );
4211     aExp = extractFloat64Exp( a );
4212     aSign = extractFloat64Sign( a );
4213     if ( aExp == 0x7FF ) {
4214         if (aSig) {
4215             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4216         }
4217         return packFloat128( aSign, 0x7FFF, 0, 0 );
4218     }
4219     if ( aExp == 0 ) {
4220         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4221         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4222         --aExp;
4223     }
4224     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4225     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4226 
4227 }
4228 
4229 
4230 /*----------------------------------------------------------------------------
4231 | Returns the remainder of the double-precision floating-point value `a'
4232 | with respect to the corresponding value `b'.  The operation is performed
4233 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4234 *----------------------------------------------------------------------------*/
4235 
4236 float64 float64_rem(float64 a, float64 b, float_status *status)
4237 {
4238     flag aSign, zSign;
4239     int aExp, bExp, expDiff;
4240     uint64_t aSig, bSig;
4241     uint64_t q, alternateASig;
4242     int64_t sigMean;
4243 
4244     a = float64_squash_input_denormal(a, status);
4245     b = float64_squash_input_denormal(b, status);
4246     aSig = extractFloat64Frac( a );
4247     aExp = extractFloat64Exp( a );
4248     aSign = extractFloat64Sign( a );
4249     bSig = extractFloat64Frac( b );
4250     bExp = extractFloat64Exp( b );
4251     if ( aExp == 0x7FF ) {
4252         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4253             return propagateFloat64NaN(a, b, status);
4254         }
4255         float_raise(float_flag_invalid, status);
4256         return float64_default_nan(status);
4257     }
4258     if ( bExp == 0x7FF ) {
4259         if (bSig) {
4260             return propagateFloat64NaN(a, b, status);
4261         }
4262         return a;
4263     }
4264     if ( bExp == 0 ) {
4265         if ( bSig == 0 ) {
4266             float_raise(float_flag_invalid, status);
4267             return float64_default_nan(status);
4268         }
4269         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4270     }
4271     if ( aExp == 0 ) {
4272         if ( aSig == 0 ) return a;
4273         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4274     }
4275     expDiff = aExp - bExp;
4276     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4277     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4278     if ( expDiff < 0 ) {
4279         if ( expDiff < -1 ) return a;
4280         aSig >>= 1;
4281     }
4282     q = ( bSig <= aSig );
4283     if ( q ) aSig -= bSig;
4284     expDiff -= 64;
4285     while ( 0 < expDiff ) {
4286         q = estimateDiv128To64( aSig, 0, bSig );
4287         q = ( 2 < q ) ? q - 2 : 0;
4288         aSig = - ( ( bSig>>2 ) * q );
4289         expDiff -= 62;
4290     }
4291     expDiff += 64;
4292     if ( 0 < expDiff ) {
4293         q = estimateDiv128To64( aSig, 0, bSig );
4294         q = ( 2 < q ) ? q - 2 : 0;
4295         q >>= 64 - expDiff;
4296         bSig >>= 2;
4297         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4298     }
4299     else {
4300         aSig >>= 2;
4301         bSig >>= 2;
4302     }
4303     do {
4304         alternateASig = aSig;
4305         ++q;
4306         aSig -= bSig;
4307     } while ( 0 <= (int64_t) aSig );
4308     sigMean = aSig + alternateASig;
4309     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4310         aSig = alternateASig;
4311     }
4312     zSign = ( (int64_t) aSig < 0 );
4313     if ( zSign ) aSig = - aSig;
4314     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4315 
4316 }
4317 
4318 /*----------------------------------------------------------------------------
4319 | Returns the binary log of the double-precision floating-point value `a'.
4320 | The operation is performed according to the IEC/IEEE Standard for Binary
4321 | Floating-Point Arithmetic.
4322 *----------------------------------------------------------------------------*/
4323 float64 float64_log2(float64 a, float_status *status)
4324 {
4325     flag aSign, zSign;
4326     int aExp;
4327     uint64_t aSig, aSig0, aSig1, zSig, i;
4328     a = float64_squash_input_denormal(a, status);
4329 
4330     aSig = extractFloat64Frac( a );
4331     aExp = extractFloat64Exp( a );
4332     aSign = extractFloat64Sign( a );
4333 
4334     if ( aExp == 0 ) {
4335         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4336         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4337     }
4338     if ( aSign ) {
4339         float_raise(float_flag_invalid, status);
4340         return float64_default_nan(status);
4341     }
4342     if ( aExp == 0x7FF ) {
4343         if (aSig) {
4344             return propagateFloat64NaN(a, float64_zero, status);
4345         }
4346         return a;
4347     }
4348 
4349     aExp -= 0x3FF;
4350     aSig |= LIT64( 0x0010000000000000 );
4351     zSign = aExp < 0;
4352     zSig = (uint64_t)aExp << 52;
4353     for (i = 1LL << 51; i > 0; i >>= 1) {
4354         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4355         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4356         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4357             aSig >>= 1;
4358             zSig |= i;
4359         }
4360     }
4361 
4362     if ( zSign )
4363         zSig = -zSig;
4364     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4365 }
4366 
4367 /*----------------------------------------------------------------------------
4368 | Returns 1 if the double-precision floating-point value `a' is equal to the
4369 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4370 | if either operand is a NaN.  Otherwise, the comparison is performed
4371 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4372 *----------------------------------------------------------------------------*/
4373 
4374 int float64_eq(float64 a, float64 b, float_status *status)
4375 {
4376     uint64_t av, bv;
4377     a = float64_squash_input_denormal(a, status);
4378     b = float64_squash_input_denormal(b, status);
4379 
4380     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4381          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4382        ) {
4383         float_raise(float_flag_invalid, status);
4384         return 0;
4385     }
4386     av = float64_val(a);
4387     bv = float64_val(b);
4388     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4389 
4390 }
4391 
4392 /*----------------------------------------------------------------------------
4393 | Returns 1 if the double-precision floating-point value `a' is less than or
4394 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4395 | exception is raised if either operand is a NaN.  The comparison is performed
4396 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4397 *----------------------------------------------------------------------------*/
4398 
4399 int float64_le(float64 a, float64 b, float_status *status)
4400 {
4401     flag aSign, bSign;
4402     uint64_t av, bv;
4403     a = float64_squash_input_denormal(a, status);
4404     b = float64_squash_input_denormal(b, status);
4405 
4406     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4407          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4408        ) {
4409         float_raise(float_flag_invalid, status);
4410         return 0;
4411     }
4412     aSign = extractFloat64Sign( a );
4413     bSign = extractFloat64Sign( b );
4414     av = float64_val(a);
4415     bv = float64_val(b);
4416     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4417     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4418 
4419 }
4420 
4421 /*----------------------------------------------------------------------------
4422 | Returns 1 if the double-precision floating-point value `a' is less than
4423 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4424 | raised if either operand is a NaN.  The comparison is performed according
4425 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4426 *----------------------------------------------------------------------------*/
4427 
4428 int float64_lt(float64 a, float64 b, float_status *status)
4429 {
4430     flag aSign, bSign;
4431     uint64_t av, bv;
4432 
4433     a = float64_squash_input_denormal(a, status);
4434     b = float64_squash_input_denormal(b, status);
4435     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4436          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4437        ) {
4438         float_raise(float_flag_invalid, status);
4439         return 0;
4440     }
4441     aSign = extractFloat64Sign( a );
4442     bSign = extractFloat64Sign( b );
4443     av = float64_val(a);
4444     bv = float64_val(b);
4445     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4446     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4447 
4448 }
4449 
4450 /*----------------------------------------------------------------------------
4451 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4452 | be compared, and 0 otherwise.  The invalid exception is raised if either
4453 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4454 | Standard for Binary Floating-Point Arithmetic.
4455 *----------------------------------------------------------------------------*/
4456 
4457 int float64_unordered(float64 a, float64 b, float_status *status)
4458 {
4459     a = float64_squash_input_denormal(a, status);
4460     b = float64_squash_input_denormal(b, status);
4461 
4462     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4463          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4464        ) {
4465         float_raise(float_flag_invalid, status);
4466         return 1;
4467     }
4468     return 0;
4469 }
4470 
4471 /*----------------------------------------------------------------------------
4472 | Returns 1 if the double-precision floating-point value `a' is equal to the
4473 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4474 | exception.The comparison is performed according to the IEC/IEEE Standard
4475 | for Binary Floating-Point Arithmetic.
4476 *----------------------------------------------------------------------------*/
4477 
4478 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4479 {
4480     uint64_t av, bv;
4481     a = float64_squash_input_denormal(a, status);
4482     b = float64_squash_input_denormal(b, status);
4483 
4484     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4485          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4486        ) {
4487         if (float64_is_signaling_nan(a, status)
4488          || float64_is_signaling_nan(b, status)) {
4489             float_raise(float_flag_invalid, status);
4490         }
4491         return 0;
4492     }
4493     av = float64_val(a);
4494     bv = float64_val(b);
4495     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4496 
4497 }
4498 
4499 /*----------------------------------------------------------------------------
4500 | Returns 1 if the double-precision floating-point value `a' is less than or
4501 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4502 | cause an exception.  Otherwise, the comparison is performed according to the
4503 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4504 *----------------------------------------------------------------------------*/
4505 
4506 int float64_le_quiet(float64 a, float64 b, float_status *status)
4507 {
4508     flag aSign, bSign;
4509     uint64_t av, bv;
4510     a = float64_squash_input_denormal(a, status);
4511     b = float64_squash_input_denormal(b, status);
4512 
4513     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4514          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4515        ) {
4516         if (float64_is_signaling_nan(a, status)
4517          || float64_is_signaling_nan(b, status)) {
4518             float_raise(float_flag_invalid, status);
4519         }
4520         return 0;
4521     }
4522     aSign = extractFloat64Sign( a );
4523     bSign = extractFloat64Sign( b );
4524     av = float64_val(a);
4525     bv = float64_val(b);
4526     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4527     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4528 
4529 }
4530 
4531 /*----------------------------------------------------------------------------
4532 | Returns 1 if the double-precision floating-point value `a' is less than
4533 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4534 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4535 | Standard for Binary Floating-Point Arithmetic.
4536 *----------------------------------------------------------------------------*/
4537 
4538 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4539 {
4540     flag aSign, bSign;
4541     uint64_t av, bv;
4542     a = float64_squash_input_denormal(a, status);
4543     b = float64_squash_input_denormal(b, status);
4544 
4545     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4546          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4547        ) {
4548         if (float64_is_signaling_nan(a, status)
4549          || float64_is_signaling_nan(b, status)) {
4550             float_raise(float_flag_invalid, status);
4551         }
4552         return 0;
4553     }
4554     aSign = extractFloat64Sign( a );
4555     bSign = extractFloat64Sign( b );
4556     av = float64_val(a);
4557     bv = float64_val(b);
4558     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4559     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4560 
4561 }
4562 
4563 /*----------------------------------------------------------------------------
4564 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4565 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4566 | comparison is performed according to the IEC/IEEE Standard for Binary
4567 | Floating-Point Arithmetic.
4568 *----------------------------------------------------------------------------*/
4569 
4570 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4571 {
4572     a = float64_squash_input_denormal(a, status);
4573     b = float64_squash_input_denormal(b, status);
4574 
4575     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4576          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4577        ) {
4578         if (float64_is_signaling_nan(a, status)
4579          || float64_is_signaling_nan(b, status)) {
4580             float_raise(float_flag_invalid, status);
4581         }
4582         return 1;
4583     }
4584     return 0;
4585 }
4586 
4587 /*----------------------------------------------------------------------------
4588 | Returns the result of converting the extended double-precision floating-
4589 | point value `a' to the 32-bit two's complement integer format.  The
4590 | conversion is performed according to the IEC/IEEE Standard for Binary
4591 | Floating-Point Arithmetic---which means in particular that the conversion
4592 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4593 | largest positive integer is returned.  Otherwise, if the conversion
4594 | overflows, the largest integer with the same sign as `a' is returned.
4595 *----------------------------------------------------------------------------*/
4596 
4597 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4598 {
4599     flag aSign;
4600     int32_t aExp, shiftCount;
4601     uint64_t aSig;
4602 
4603     if (floatx80_invalid_encoding(a)) {
4604         float_raise(float_flag_invalid, status);
4605         return 1 << 31;
4606     }
4607     aSig = extractFloatx80Frac( a );
4608     aExp = extractFloatx80Exp( a );
4609     aSign = extractFloatx80Sign( a );
4610     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4611     shiftCount = 0x4037 - aExp;
4612     if ( shiftCount <= 0 ) shiftCount = 1;
4613     shift64RightJamming( aSig, shiftCount, &aSig );
4614     return roundAndPackInt32(aSign, aSig, status);
4615 
4616 }
4617 
4618 /*----------------------------------------------------------------------------
4619 | Returns the result of converting the extended double-precision floating-
4620 | point value `a' to the 32-bit two's complement integer format.  The
4621 | conversion is performed according to the IEC/IEEE Standard for Binary
4622 | Floating-Point Arithmetic, except that the conversion is always rounded
4623 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4624 | Otherwise, if the conversion overflows, the largest integer with the same
4625 | sign as `a' is returned.
4626 *----------------------------------------------------------------------------*/
4627 
4628 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4629 {
4630     flag aSign;
4631     int32_t aExp, shiftCount;
4632     uint64_t aSig, savedASig;
4633     int32_t z;
4634 
4635     if (floatx80_invalid_encoding(a)) {
4636         float_raise(float_flag_invalid, status);
4637         return 1 << 31;
4638     }
4639     aSig = extractFloatx80Frac( a );
4640     aExp = extractFloatx80Exp( a );
4641     aSign = extractFloatx80Sign( a );
4642     if ( 0x401E < aExp ) {
4643         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4644         goto invalid;
4645     }
4646     else if ( aExp < 0x3FFF ) {
4647         if (aExp || aSig) {
4648             status->float_exception_flags |= float_flag_inexact;
4649         }
4650         return 0;
4651     }
4652     shiftCount = 0x403E - aExp;
4653     savedASig = aSig;
4654     aSig >>= shiftCount;
4655     z = aSig;
4656     if ( aSign ) z = - z;
4657     if ( ( z < 0 ) ^ aSign ) {
4658  invalid:
4659         float_raise(float_flag_invalid, status);
4660         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4661     }
4662     if ( ( aSig<<shiftCount ) != savedASig ) {
4663         status->float_exception_flags |= float_flag_inexact;
4664     }
4665     return z;
4666 
4667 }
4668 
4669 /*----------------------------------------------------------------------------
4670 | Returns the result of converting the extended double-precision floating-
4671 | point value `a' to the 64-bit two's complement integer format.  The
4672 | conversion is performed according to the IEC/IEEE Standard for Binary
4673 | Floating-Point Arithmetic---which means in particular that the conversion
4674 | is rounded according to the current rounding mode.  If `a' is a NaN,
4675 | the largest positive integer is returned.  Otherwise, if the conversion
4676 | overflows, the largest integer with the same sign as `a' is returned.
4677 *----------------------------------------------------------------------------*/
4678 
4679 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4680 {
4681     flag aSign;
4682     int32_t aExp, shiftCount;
4683     uint64_t aSig, aSigExtra;
4684 
4685     if (floatx80_invalid_encoding(a)) {
4686         float_raise(float_flag_invalid, status);
4687         return 1ULL << 63;
4688     }
4689     aSig = extractFloatx80Frac( a );
4690     aExp = extractFloatx80Exp( a );
4691     aSign = extractFloatx80Sign( a );
4692     shiftCount = 0x403E - aExp;
4693     if ( shiftCount <= 0 ) {
4694         if ( shiftCount ) {
4695             float_raise(float_flag_invalid, status);
4696             if (!aSign || floatx80_is_any_nan(a)) {
4697                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4698             }
4699             return (int64_t) LIT64( 0x8000000000000000 );
4700         }
4701         aSigExtra = 0;
4702     }
4703     else {
4704         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4705     }
4706     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4707 
4708 }
4709 
4710 /*----------------------------------------------------------------------------
4711 | Returns the result of converting the extended double-precision floating-
4712 | point value `a' to the 64-bit two's complement integer format.  The
4713 | conversion is performed according to the IEC/IEEE Standard for Binary
4714 | Floating-Point Arithmetic, except that the conversion is always rounded
4715 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4716 | Otherwise, if the conversion overflows, the largest integer with the same
4717 | sign as `a' is returned.
4718 *----------------------------------------------------------------------------*/
4719 
4720 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4721 {
4722     flag aSign;
4723     int32_t aExp, shiftCount;
4724     uint64_t aSig;
4725     int64_t z;
4726 
4727     if (floatx80_invalid_encoding(a)) {
4728         float_raise(float_flag_invalid, status);
4729         return 1ULL << 63;
4730     }
4731     aSig = extractFloatx80Frac( a );
4732     aExp = extractFloatx80Exp( a );
4733     aSign = extractFloatx80Sign( a );
4734     shiftCount = aExp - 0x403E;
4735     if ( 0 <= shiftCount ) {
4736         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4737         if ( ( a.high != 0xC03E ) || aSig ) {
4738             float_raise(float_flag_invalid, status);
4739             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4740                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4741             }
4742         }
4743         return (int64_t) LIT64( 0x8000000000000000 );
4744     }
4745     else if ( aExp < 0x3FFF ) {
4746         if (aExp | aSig) {
4747             status->float_exception_flags |= float_flag_inexact;
4748         }
4749         return 0;
4750     }
4751     z = aSig>>( - shiftCount );
4752     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4753         status->float_exception_flags |= float_flag_inexact;
4754     }
4755     if ( aSign ) z = - z;
4756     return z;
4757 
4758 }
4759 
4760 /*----------------------------------------------------------------------------
4761 | Returns the result of converting the extended double-precision floating-
4762 | point value `a' to the single-precision floating-point format.  The
4763 | conversion is performed according to the IEC/IEEE Standard for Binary
4764 | Floating-Point Arithmetic.
4765 *----------------------------------------------------------------------------*/
4766 
4767 float32 floatx80_to_float32(floatx80 a, float_status *status)
4768 {
4769     flag aSign;
4770     int32_t aExp;
4771     uint64_t aSig;
4772 
4773     if (floatx80_invalid_encoding(a)) {
4774         float_raise(float_flag_invalid, status);
4775         return float32_default_nan(status);
4776     }
4777     aSig = extractFloatx80Frac( a );
4778     aExp = extractFloatx80Exp( a );
4779     aSign = extractFloatx80Sign( a );
4780     if ( aExp == 0x7FFF ) {
4781         if ( (uint64_t) ( aSig<<1 ) ) {
4782             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4783         }
4784         return packFloat32( aSign, 0xFF, 0 );
4785     }
4786     shift64RightJamming( aSig, 33, &aSig );
4787     if ( aExp || aSig ) aExp -= 0x3F81;
4788     return roundAndPackFloat32(aSign, aExp, aSig, status);
4789 
4790 }
4791 
4792 /*----------------------------------------------------------------------------
4793 | Returns the result of converting the extended double-precision floating-
4794 | point value `a' to the double-precision floating-point format.  The
4795 | conversion is performed according to the IEC/IEEE Standard for Binary
4796 | Floating-Point Arithmetic.
4797 *----------------------------------------------------------------------------*/
4798 
4799 float64 floatx80_to_float64(floatx80 a, float_status *status)
4800 {
4801     flag aSign;
4802     int32_t aExp;
4803     uint64_t aSig, zSig;
4804 
4805     if (floatx80_invalid_encoding(a)) {
4806         float_raise(float_flag_invalid, status);
4807         return float64_default_nan(status);
4808     }
4809     aSig = extractFloatx80Frac( a );
4810     aExp = extractFloatx80Exp( a );
4811     aSign = extractFloatx80Sign( a );
4812     if ( aExp == 0x7FFF ) {
4813         if ( (uint64_t) ( aSig<<1 ) ) {
4814             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4815         }
4816         return packFloat64( aSign, 0x7FF, 0 );
4817     }
4818     shift64RightJamming( aSig, 1, &zSig );
4819     if ( aExp || aSig ) aExp -= 0x3C01;
4820     return roundAndPackFloat64(aSign, aExp, zSig, status);
4821 
4822 }
4823 
4824 /*----------------------------------------------------------------------------
4825 | Returns the result of converting the extended double-precision floating-
4826 | point value `a' to the quadruple-precision floating-point format.  The
4827 | conversion is performed according to the IEC/IEEE Standard for Binary
4828 | Floating-Point Arithmetic.
4829 *----------------------------------------------------------------------------*/
4830 
4831 float128 floatx80_to_float128(floatx80 a, float_status *status)
4832 {
4833     flag aSign;
4834     int aExp;
4835     uint64_t aSig, zSig0, zSig1;
4836 
4837     if (floatx80_invalid_encoding(a)) {
4838         float_raise(float_flag_invalid, status);
4839         return float128_default_nan(status);
4840     }
4841     aSig = extractFloatx80Frac( a );
4842     aExp = extractFloatx80Exp( a );
4843     aSign = extractFloatx80Sign( a );
4844     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4845         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4846     }
4847     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4848     return packFloat128( aSign, aExp, zSig0, zSig1 );
4849 
4850 }
4851 
4852 /*----------------------------------------------------------------------------
4853 | Rounds the extended double-precision floating-point value `a'
4854 | to the precision provided by floatx80_rounding_precision and returns the
4855 | result as an extended double-precision floating-point value.
4856 | The operation is performed according to the IEC/IEEE Standard for Binary
4857 | Floating-Point Arithmetic.
4858 *----------------------------------------------------------------------------*/
4859 
4860 floatx80 floatx80_round(floatx80 a, float_status *status)
4861 {
4862     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4863                                 extractFloatx80Sign(a),
4864                                 extractFloatx80Exp(a),
4865                                 extractFloatx80Frac(a), 0, status);
4866 }
4867 
/*----------------------------------------------------------------------------
| Rounds the extended double-precision floating-point value `a' to an integer,
| and returns the result as an extended double-precision floating-point
| value.  The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
4874 
4875 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4876 {
4877     flag aSign;
4878     int32_t aExp;
4879     uint64_t lastBitMask, roundBitsMask;
4880     floatx80 z;
4881 
4882     if (floatx80_invalid_encoding(a)) {
4883         float_raise(float_flag_invalid, status);
4884         return floatx80_default_nan(status);
4885     }
4886     aExp = extractFloatx80Exp( a );
4887     if ( 0x403E <= aExp ) {
4888         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4889             return propagateFloatx80NaN(a, a, status);
4890         }
4891         return a;
4892     }
4893     if ( aExp < 0x3FFF ) {
4894         if (    ( aExp == 0 )
4895              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4896             return a;
4897         }
4898         status->float_exception_flags |= float_flag_inexact;
4899         aSign = extractFloatx80Sign( a );
4900         switch (status->float_rounding_mode) {
4901          case float_round_nearest_even:
4902             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4903                ) {
4904                 return
4905                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4906             }
4907             break;
4908         case float_round_ties_away:
4909             if (aExp == 0x3FFE) {
4910                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4911             }
4912             break;
4913          case float_round_down:
4914             return
4915                   aSign ?
4916                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4917                 : packFloatx80( 0, 0, 0 );
4918          case float_round_up:
4919             return
4920                   aSign ? packFloatx80( 1, 0, 0 )
4921                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4922         }
4923         return packFloatx80( aSign, 0, 0 );
4924     }
4925     lastBitMask = 1;
4926     lastBitMask <<= 0x403E - aExp;
4927     roundBitsMask = lastBitMask - 1;
4928     z = a;
4929     switch (status->float_rounding_mode) {
4930     case float_round_nearest_even:
4931         z.low += lastBitMask>>1;
4932         if ((z.low & roundBitsMask) == 0) {
4933             z.low &= ~lastBitMask;
4934         }
4935         break;
4936     case float_round_ties_away:
4937         z.low += lastBitMask >> 1;
4938         break;
4939     case float_round_to_zero:
4940         break;
4941     case float_round_up:
4942         if (!extractFloatx80Sign(z)) {
4943             z.low += roundBitsMask;
4944         }
4945         break;
4946     case float_round_down:
4947         if (extractFloatx80Sign(z)) {
4948             z.low += roundBitsMask;
4949         }
4950         break;
4951     default:
4952         abort();
4953     }
4954     z.low &= ~ roundBitsMask;
4955     if ( z.low == 0 ) {
4956         ++z.high;
4957         z.low = LIT64( 0x8000000000000000 );
4958     }
4959     if (z.low != a.low) {
4960         status->float_exception_flags |= float_flag_inexact;
4961     }
4962     return z;
4963 
4964 }
4965 
4966 /*----------------------------------------------------------------------------
4967 | Returns the result of adding the absolute values of the extended double-
4968 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4969 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4970 | The addition is performed according to the IEC/IEEE Standard for Binary
4971 | Floating-Point Arithmetic.
4972 *----------------------------------------------------------------------------*/
4973 
4974 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4975                                 float_status *status)
4976 {
4977     int32_t aExp, bExp, zExp;
4978     uint64_t aSig, bSig, zSig0, zSig1;
4979     int32_t expDiff;
4980 
4981     aSig = extractFloatx80Frac( a );
4982     aExp = extractFloatx80Exp( a );
4983     bSig = extractFloatx80Frac( b );
4984     bExp = extractFloatx80Exp( b );
4985     expDiff = aExp - bExp;
4986     if ( 0 < expDiff ) {
4987         if ( aExp == 0x7FFF ) {
4988             if ((uint64_t)(aSig << 1)) {
4989                 return propagateFloatx80NaN(a, b, status);
4990             }
4991             return a;
4992         }
4993         if ( bExp == 0 ) --expDiff;
4994         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4995         zExp = aExp;
4996     }
4997     else if ( expDiff < 0 ) {
4998         if ( bExp == 0x7FFF ) {
4999             if ((uint64_t)(bSig << 1)) {
5000                 return propagateFloatx80NaN(a, b, status);
5001             }
5002             return packFloatx80(zSign,
5003                                 floatx80_infinity_high,
5004                                 floatx80_infinity_low);
5005         }
5006         if ( aExp == 0 ) ++expDiff;
5007         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5008         zExp = bExp;
5009     }
5010     else {
5011         if ( aExp == 0x7FFF ) {
5012             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5013                 return propagateFloatx80NaN(a, b, status);
5014             }
5015             return a;
5016         }
5017         zSig1 = 0;
5018         zSig0 = aSig + bSig;
5019         if ( aExp == 0 ) {
5020             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5021             goto roundAndPack;
5022         }
5023         zExp = aExp;
5024         goto shiftRight1;
5025     }
5026     zSig0 = aSig + bSig;
5027     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5028  shiftRight1:
5029     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5030     zSig0 |= LIT64( 0x8000000000000000 );
5031     ++zExp;
5032  roundAndPack:
5033     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5034                                 zSign, zExp, zSig0, zSig1, status);
5035 }
5036 
5037 /*----------------------------------------------------------------------------
5038 | Returns the result of subtracting the absolute values of the extended
5039 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5040 | difference is negated before being returned.  `zSign' is ignored if the
5041 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5042 | Standard for Binary Floating-Point Arithmetic.
5043 *----------------------------------------------------------------------------*/
5044 
5045 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5046                                 float_status *status)
5047 {
5048     int32_t aExp, bExp, zExp;
5049     uint64_t aSig, bSig, zSig0, zSig1;
5050     int32_t expDiff;
5051 
5052     aSig = extractFloatx80Frac( a );
5053     aExp = extractFloatx80Exp( a );
5054     bSig = extractFloatx80Frac( b );
5055     bExp = extractFloatx80Exp( b );
5056     expDiff = aExp - bExp;
5057     if ( 0 < expDiff ) goto aExpBigger;
5058     if ( expDiff < 0 ) goto bExpBigger;
5059     if ( aExp == 0x7FFF ) {
5060         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5061             return propagateFloatx80NaN(a, b, status);
5062         }
5063         float_raise(float_flag_invalid, status);
5064         return floatx80_default_nan(status);
5065     }
5066     if ( aExp == 0 ) {
5067         aExp = 1;
5068         bExp = 1;
5069     }
5070     zSig1 = 0;
5071     if ( bSig < aSig ) goto aBigger;
5072     if ( aSig < bSig ) goto bBigger;
5073     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5074  bExpBigger:
5075     if ( bExp == 0x7FFF ) {
5076         if ((uint64_t)(bSig << 1)) {
5077             return propagateFloatx80NaN(a, b, status);
5078         }
5079         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5080                             floatx80_infinity_low);
5081     }
5082     if ( aExp == 0 ) ++expDiff;
5083     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5084  bBigger:
5085     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5086     zExp = bExp;
5087     zSign ^= 1;
5088     goto normalizeRoundAndPack;
5089  aExpBigger:
5090     if ( aExp == 0x7FFF ) {
5091         if ((uint64_t)(aSig << 1)) {
5092             return propagateFloatx80NaN(a, b, status);
5093         }
5094         return a;
5095     }
5096     if ( bExp == 0 ) --expDiff;
5097     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5098  aBigger:
5099     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5100     zExp = aExp;
5101  normalizeRoundAndPack:
5102     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5103                                          zSign, zExp, zSig0, zSig1, status);
5104 }
5105 
5106 /*----------------------------------------------------------------------------
5107 | Returns the result of adding the extended double-precision floating-point
5108 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5109 | Standard for Binary Floating-Point Arithmetic.
5110 *----------------------------------------------------------------------------*/
5111 
5112 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5113 {
5114     flag aSign, bSign;
5115 
5116     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5117         float_raise(float_flag_invalid, status);
5118         return floatx80_default_nan(status);
5119     }
5120     aSign = extractFloatx80Sign( a );
5121     bSign = extractFloatx80Sign( b );
5122     if ( aSign == bSign ) {
5123         return addFloatx80Sigs(a, b, aSign, status);
5124     }
5125     else {
5126         return subFloatx80Sigs(a, b, aSign, status);
5127     }
5128 
5129 }
5130 
5131 /*----------------------------------------------------------------------------
5132 | Returns the result of subtracting the extended double-precision floating-
5133 | point values `a' and `b'.  The operation is performed according to the
5134 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5135 *----------------------------------------------------------------------------*/
5136 
5137 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5138 {
5139     flag aSign, bSign;
5140 
5141     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5142         float_raise(float_flag_invalid, status);
5143         return floatx80_default_nan(status);
5144     }
5145     aSign = extractFloatx80Sign( a );
5146     bSign = extractFloatx80Sign( b );
5147     if ( aSign == bSign ) {
5148         return subFloatx80Sigs(a, b, aSign, status);
5149     }
5150     else {
5151         return addFloatx80Sigs(a, b, aSign, status);
5152     }
5153 
5154 }
5155 
5156 /*----------------------------------------------------------------------------
5157 | Returns the result of multiplying the extended double-precision floating-
5158 | point values `a' and `b'.  The operation is performed according to the
5159 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5160 *----------------------------------------------------------------------------*/
5161 
5162 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5163 {
5164     flag aSign, bSign, zSign;
5165     int32_t aExp, bExp, zExp;
5166     uint64_t aSig, bSig, zSig0, zSig1;
5167 
5168     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5169         float_raise(float_flag_invalid, status);
5170         return floatx80_default_nan(status);
5171     }
5172     aSig = extractFloatx80Frac( a );
5173     aExp = extractFloatx80Exp( a );
5174     aSign = extractFloatx80Sign( a );
5175     bSig = extractFloatx80Frac( b );
5176     bExp = extractFloatx80Exp( b );
5177     bSign = extractFloatx80Sign( b );
5178     zSign = aSign ^ bSign;
5179     if ( aExp == 0x7FFF ) {
5180         if (    (uint64_t) ( aSig<<1 )
5181              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5182             return propagateFloatx80NaN(a, b, status);
5183         }
5184         if ( ( bExp | bSig ) == 0 ) goto invalid;
5185         return packFloatx80(zSign, floatx80_infinity_high,
5186                                    floatx80_infinity_low);
5187     }
5188     if ( bExp == 0x7FFF ) {
5189         if ((uint64_t)(bSig << 1)) {
5190             return propagateFloatx80NaN(a, b, status);
5191         }
5192         if ( ( aExp | aSig ) == 0 ) {
5193  invalid:
5194             float_raise(float_flag_invalid, status);
5195             return floatx80_default_nan(status);
5196         }
5197         return packFloatx80(zSign, floatx80_infinity_high,
5198                                    floatx80_infinity_low);
5199     }
5200     if ( aExp == 0 ) {
5201         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5202         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5203     }
5204     if ( bExp == 0 ) {
5205         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5206         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5207     }
5208     zExp = aExp + bExp - 0x3FFE;
5209     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5210     if ( 0 < (int64_t) zSig0 ) {
5211         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5212         --zExp;
5213     }
5214     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5215                                 zSign, zExp, zSig0, zSig1, status);
5216 }
5217 
5218 /*----------------------------------------------------------------------------
5219 | Returns the result of dividing the extended double-precision floating-point
5220 | value `a' by the corresponding value `b'.  The operation is performed
5221 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5222 *----------------------------------------------------------------------------*/
5223 
5224 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5225 {
5226     flag aSign, bSign, zSign;
5227     int32_t aExp, bExp, zExp;
5228     uint64_t aSig, bSig, zSig0, zSig1;
5229     uint64_t rem0, rem1, rem2, term0, term1, term2;
5230 
5231     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5232         float_raise(float_flag_invalid, status);
5233         return floatx80_default_nan(status);
5234     }
5235     aSig = extractFloatx80Frac( a );
5236     aExp = extractFloatx80Exp( a );
5237     aSign = extractFloatx80Sign( a );
5238     bSig = extractFloatx80Frac( b );
5239     bExp = extractFloatx80Exp( b );
5240     bSign = extractFloatx80Sign( b );
5241     zSign = aSign ^ bSign;
5242     if ( aExp == 0x7FFF ) {
5243         if ((uint64_t)(aSig << 1)) {
5244             return propagateFloatx80NaN(a, b, status);
5245         }
5246         if ( bExp == 0x7FFF ) {
5247             if ((uint64_t)(bSig << 1)) {
5248                 return propagateFloatx80NaN(a, b, status);
5249             }
5250             goto invalid;
5251         }
5252         return packFloatx80(zSign, floatx80_infinity_high,
5253                                    floatx80_infinity_low);
5254     }
5255     if ( bExp == 0x7FFF ) {
5256         if ((uint64_t)(bSig << 1)) {
5257             return propagateFloatx80NaN(a, b, status);
5258         }
5259         return packFloatx80( zSign, 0, 0 );
5260     }
5261     if ( bExp == 0 ) {
5262         if ( bSig == 0 ) {
5263             if ( ( aExp | aSig ) == 0 ) {
5264  invalid:
5265                 float_raise(float_flag_invalid, status);
5266                 return floatx80_default_nan(status);
5267             }
5268             float_raise(float_flag_divbyzero, status);
5269             return packFloatx80(zSign, floatx80_infinity_high,
5270                                        floatx80_infinity_low);
5271         }
5272         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5273     }
5274     if ( aExp == 0 ) {
5275         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5276         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5277     }
5278     zExp = aExp - bExp + 0x3FFE;
5279     rem1 = 0;
5280     if ( bSig <= aSig ) {
5281         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5282         ++zExp;
5283     }
5284     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5285     mul64To128( bSig, zSig0, &term0, &term1 );
5286     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5287     while ( (int64_t) rem0 < 0 ) {
5288         --zSig0;
5289         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5290     }
5291     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5292     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5293         mul64To128( bSig, zSig1, &term1, &term2 );
5294         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5295         while ( (int64_t) rem1 < 0 ) {
5296             --zSig1;
5297             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5298         }
5299         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5300     }
5301     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5302                                 zSign, zExp, zSig0, zSig1, status);
5303 }
5304 
5305 /*----------------------------------------------------------------------------
5306 | Returns the remainder of the extended double-precision floating-point value
5307 | `a' with respect to the corresponding value `b'.  The operation is performed
5308 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5309 *----------------------------------------------------------------------------*/
5310 
5311 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5312 {
5313     flag aSign, zSign;
5314     int32_t aExp, bExp, expDiff;
5315     uint64_t aSig0, aSig1, bSig;
5316     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5317 
5318     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5319         float_raise(float_flag_invalid, status);
5320         return floatx80_default_nan(status);
5321     }
5322     aSig0 = extractFloatx80Frac( a );
5323     aExp = extractFloatx80Exp( a );
5324     aSign = extractFloatx80Sign( a );
5325     bSig = extractFloatx80Frac( b );
5326     bExp = extractFloatx80Exp( b );
5327     if ( aExp == 0x7FFF ) {
5328         if (    (uint64_t) ( aSig0<<1 )
5329              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5330             return propagateFloatx80NaN(a, b, status);
5331         }
5332         goto invalid;
5333     }
5334     if ( bExp == 0x7FFF ) {
5335         if ((uint64_t)(bSig << 1)) {
5336             return propagateFloatx80NaN(a, b, status);
5337         }
5338         return a;
5339     }
5340     if ( bExp == 0 ) {
5341         if ( bSig == 0 ) {
5342  invalid:
5343             float_raise(float_flag_invalid, status);
5344             return floatx80_default_nan(status);
5345         }
5346         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5347     }
5348     if ( aExp == 0 ) {
5349         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5350         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5351     }
5352     bSig |= LIT64( 0x8000000000000000 );
5353     zSign = aSign;
5354     expDiff = aExp - bExp;
5355     aSig1 = 0;
5356     if ( expDiff < 0 ) {
5357         if ( expDiff < -1 ) return a;
5358         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5359         expDiff = 0;
5360     }
5361     q = ( bSig <= aSig0 );
5362     if ( q ) aSig0 -= bSig;
5363     expDiff -= 64;
5364     while ( 0 < expDiff ) {
5365         q = estimateDiv128To64( aSig0, aSig1, bSig );
5366         q = ( 2 < q ) ? q - 2 : 0;
5367         mul64To128( bSig, q, &term0, &term1 );
5368         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5369         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5370         expDiff -= 62;
5371     }
5372     expDiff += 64;
5373     if ( 0 < expDiff ) {
5374         q = estimateDiv128To64( aSig0, aSig1, bSig );
5375         q = ( 2 < q ) ? q - 2 : 0;
5376         q >>= 64 - expDiff;
5377         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5378         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5379         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5380         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5381             ++q;
5382             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5383         }
5384     }
5385     else {
5386         term1 = 0;
5387         term0 = bSig;
5388     }
5389     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5390     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5391          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5392               && ( q & 1 ) )
5393        ) {
5394         aSig0 = alternateASig0;
5395         aSig1 = alternateASig1;
5396         zSign = ! zSign;
5397     }
5398     return
5399         normalizeRoundAndPackFloatx80(
5400             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5401 
5402 }
5403 
5404 /*----------------------------------------------------------------------------
5405 | Returns the square root of the extended double-precision floating-point
5406 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5407 | for Binary Floating-Point Arithmetic.
5408 *----------------------------------------------------------------------------*/
5409 
5410 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5411 {
5412     flag aSign;
5413     int32_t aExp, zExp;
5414     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5415     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5416 
5417     if (floatx80_invalid_encoding(a)) {
5418         float_raise(float_flag_invalid, status);
5419         return floatx80_default_nan(status);
5420     }
5421     aSig0 = extractFloatx80Frac( a );
5422     aExp = extractFloatx80Exp( a );
5423     aSign = extractFloatx80Sign( a );
5424     if ( aExp == 0x7FFF ) {
5425         if ((uint64_t)(aSig0 << 1)) {
5426             return propagateFloatx80NaN(a, a, status);
5427         }
5428         if ( ! aSign ) return a;
5429         goto invalid;
5430     }
5431     if ( aSign ) {
5432         if ( ( aExp | aSig0 ) == 0 ) return a;
5433  invalid:
5434         float_raise(float_flag_invalid, status);
5435         return floatx80_default_nan(status);
5436     }
5437     if ( aExp == 0 ) {
5438         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5439         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5440     }
5441     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5442     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5443     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5444     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5445     doubleZSig0 = zSig0<<1;
5446     mul64To128( zSig0, zSig0, &term0, &term1 );
5447     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5448     while ( (int64_t) rem0 < 0 ) {
5449         --zSig0;
5450         doubleZSig0 -= 2;
5451         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5452     }
5453     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5454     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5455         if ( zSig1 == 0 ) zSig1 = 1;
5456         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5457         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5458         mul64To128( zSig1, zSig1, &term2, &term3 );
5459         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5460         while ( (int64_t) rem1 < 0 ) {
5461             --zSig1;
5462             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5463             term3 |= 1;
5464             term2 |= doubleZSig0;
5465             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5466         }
5467         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5468     }
5469     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5470     zSig0 |= doubleZSig0;
5471     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5472                                 0, zExp, zSig0, zSig1, status);
5473 }
5474 
5475 /*----------------------------------------------------------------------------
5476 | Returns 1 if the extended double-precision floating-point value `a' is equal
5477 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5478 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5479 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5480 *----------------------------------------------------------------------------*/
5481 
5482 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5483 {
5484 
5485     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5486         || (extractFloatx80Exp(a) == 0x7FFF
5487             && (uint64_t) (extractFloatx80Frac(a) << 1))
5488         || (extractFloatx80Exp(b) == 0x7FFF
5489             && (uint64_t) (extractFloatx80Frac(b) << 1))
5490        ) {
5491         float_raise(float_flag_invalid, status);
5492         return 0;
5493     }
5494     return
5495            ( a.low == b.low )
5496         && (    ( a.high == b.high )
5497              || (    ( a.low == 0 )
5498                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5499            );
5500 
5501 }
5502 
5503 /*----------------------------------------------------------------------------
5504 | Returns 1 if the extended double-precision floating-point value `a' is
5505 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5506 | invalid exception is raised if either operand is a NaN.  The comparison is
5507 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5508 | Arithmetic.
5509 *----------------------------------------------------------------------------*/
5510 
5511 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5512 {
5513     flag aSign, bSign;
5514 
5515     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5516         || (extractFloatx80Exp(a) == 0x7FFF
5517             && (uint64_t) (extractFloatx80Frac(a) << 1))
5518         || (extractFloatx80Exp(b) == 0x7FFF
5519             && (uint64_t) (extractFloatx80Frac(b) << 1))
5520        ) {
5521         float_raise(float_flag_invalid, status);
5522         return 0;
5523     }
5524     aSign = extractFloatx80Sign( a );
5525     bSign = extractFloatx80Sign( b );
5526     if ( aSign != bSign ) {
5527         return
5528                aSign
5529             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5530                  == 0 );
5531     }
5532     return
5533           aSign ? le128( b.high, b.low, a.high, a.low )
5534         : le128( a.high, a.low, b.high, b.low );
5535 
5536 }
5537 
5538 /*----------------------------------------------------------------------------
5539 | Returns 1 if the extended double-precision floating-point value `a' is
5540 | less than the corresponding value `b', and 0 otherwise.  The invalid
5541 | exception is raised if either operand is a NaN.  The comparison is performed
5542 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5543 *----------------------------------------------------------------------------*/
5544 
5545 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5546 {
5547     flag aSign, bSign;
5548 
5549     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5550         || (extractFloatx80Exp(a) == 0x7FFF
5551             && (uint64_t) (extractFloatx80Frac(a) << 1))
5552         || (extractFloatx80Exp(b) == 0x7FFF
5553             && (uint64_t) (extractFloatx80Frac(b) << 1))
5554        ) {
5555         float_raise(float_flag_invalid, status);
5556         return 0;
5557     }
5558     aSign = extractFloatx80Sign( a );
5559     bSign = extractFloatx80Sign( b );
5560     if ( aSign != bSign ) {
5561         return
5562                aSign
5563             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5564                  != 0 );
5565     }
5566     return
5567           aSign ? lt128( b.high, b.low, a.high, a.low )
5568         : lt128( a.high, a.low, b.high, b.low );
5569 
5570 }
5571 
5572 /*----------------------------------------------------------------------------
5573 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5574 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5575 | either operand is a NaN.   The comparison is performed according to the
5576 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5577 *----------------------------------------------------------------------------*/
5578 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5579 {
5580     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5581         || (extractFloatx80Exp(a) == 0x7FFF
5582             && (uint64_t) (extractFloatx80Frac(a) << 1))
5583         || (extractFloatx80Exp(b) == 0x7FFF
5584             && (uint64_t) (extractFloatx80Frac(b) << 1))
5585        ) {
5586         float_raise(float_flag_invalid, status);
5587         return 1;
5588     }
5589     return 0;
5590 }
5591 
5592 /*----------------------------------------------------------------------------
5593 | Returns 1 if the extended double-precision floating-point value `a' is
5594 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5595 | cause an exception.  The comparison is performed according to the IEC/IEEE
5596 | Standard for Binary Floating-Point Arithmetic.
5597 *----------------------------------------------------------------------------*/
5598 
5599 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5600 {
5601 
5602     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5603         float_raise(float_flag_invalid, status);
5604         return 0;
5605     }
5606     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5607               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5608          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5609               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5610        ) {
5611         if (floatx80_is_signaling_nan(a, status)
5612          || floatx80_is_signaling_nan(b, status)) {
5613             float_raise(float_flag_invalid, status);
5614         }
5615         return 0;
5616     }
5617     return
5618            ( a.low == b.low )
5619         && (    ( a.high == b.high )
5620              || (    ( a.low == 0 )
5621                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5622            );
5623 
5624 }
5625 
5626 /*----------------------------------------------------------------------------
5627 | Returns 1 if the extended double-precision floating-point value `a' is less
5628 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5629 | do not cause an exception.  Otherwise, the comparison is performed according
5630 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5631 *----------------------------------------------------------------------------*/
5632 
5633 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5634 {
5635     flag aSign, bSign;
5636 
5637     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5638         float_raise(float_flag_invalid, status);
5639         return 0;
5640     }
5641     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5642               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5643          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5644               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5645        ) {
5646         if (floatx80_is_signaling_nan(a, status)
5647          || floatx80_is_signaling_nan(b, status)) {
5648             float_raise(float_flag_invalid, status);
5649         }
5650         return 0;
5651     }
5652     aSign = extractFloatx80Sign( a );
5653     bSign = extractFloatx80Sign( b );
5654     if ( aSign != bSign ) {
5655         return
5656                aSign
5657             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5658                  == 0 );
5659     }
5660     return
5661           aSign ? le128( b.high, b.low, a.high, a.low )
5662         : le128( a.high, a.low, b.high, b.low );
5663 
5664 }
5665 
5666 /*----------------------------------------------------------------------------
5667 | Returns 1 if the extended double-precision floating-point value `a' is less
5668 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5669 | an exception.  Otherwise, the comparison is performed according to the
5670 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5671 *----------------------------------------------------------------------------*/
5672 
5673 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5674 {
5675     flag aSign, bSign;
5676 
5677     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5678         float_raise(float_flag_invalid, status);
5679         return 0;
5680     }
5681     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5682               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5683          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5684               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5685        ) {
5686         if (floatx80_is_signaling_nan(a, status)
5687          || floatx80_is_signaling_nan(b, status)) {
5688             float_raise(float_flag_invalid, status);
5689         }
5690         return 0;
5691     }
5692     aSign = extractFloatx80Sign( a );
5693     bSign = extractFloatx80Sign( b );
5694     if ( aSign != bSign ) {
5695         return
5696                aSign
5697             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5698                  != 0 );
5699     }
5700     return
5701           aSign ? lt128( b.high, b.low, a.high, a.low )
5702         : lt128( a.high, a.low, b.high, b.low );
5703 
5704 }
5705 
5706 /*----------------------------------------------------------------------------
5707 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5708 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5709 | The comparison is performed according to the IEC/IEEE Standard for Binary
5710 | Floating-Point Arithmetic.
5711 *----------------------------------------------------------------------------*/
5712 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5713 {
5714     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5715         float_raise(float_flag_invalid, status);
5716         return 1;
5717     }
5718     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5719               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5720          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5721               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5722        ) {
5723         if (floatx80_is_signaling_nan(a, status)
5724          || floatx80_is_signaling_nan(b, status)) {
5725             float_raise(float_flag_invalid, status);
5726         }
5727         return 1;
5728     }
5729     return 0;
5730 }
5731 
5732 /*----------------------------------------------------------------------------
5733 | Returns the result of converting the quadruple-precision floating-point
5734 | value `a' to the 32-bit two's complement integer format.  The conversion
5735 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5736 | Arithmetic---which means in particular that the conversion is rounded
5737 | according to the current rounding mode.  If `a' is a NaN, the largest
5738 | positive integer is returned.  Otherwise, if the conversion overflows, the
5739 | largest integer with the same sign as `a' is returned.
5740 *----------------------------------------------------------------------------*/
5741 
5742 int32_t float128_to_int32(float128 a, float_status *status)
5743 {
5744     flag aSign;
5745     int32_t aExp, shiftCount;
5746     uint64_t aSig0, aSig1;
5747 
5748     aSig1 = extractFloat128Frac1( a );
5749     aSig0 = extractFloat128Frac0( a );
5750     aExp = extractFloat128Exp( a );
5751     aSign = extractFloat128Sign( a );
5752     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5753     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5754     aSig0 |= ( aSig1 != 0 );
5755     shiftCount = 0x4028 - aExp;
5756     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5757     return roundAndPackInt32(aSign, aSig0, status);
5758 
5759 }
5760 
5761 /*----------------------------------------------------------------------------
5762 | Returns the result of converting the quadruple-precision floating-point
5763 | value `a' to the 32-bit two's complement integer format.  The conversion
5764 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5765 | Arithmetic, except that the conversion is always rounded toward zero.  If
5766 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5767 | conversion overflows, the largest integer with the same sign as `a' is
5768 | returned.
5769 *----------------------------------------------------------------------------*/
5770 
5771 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5772 {
5773     flag aSign;
5774     int32_t aExp, shiftCount;
5775     uint64_t aSig0, aSig1, savedASig;
5776     int32_t z;
5777 
5778     aSig1 = extractFloat128Frac1( a );
5779     aSig0 = extractFloat128Frac0( a );
5780     aExp = extractFloat128Exp( a );
5781     aSign = extractFloat128Sign( a );
5782     aSig0 |= ( aSig1 != 0 );
5783     if ( 0x401E < aExp ) {
5784         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5785         goto invalid;
5786     }
5787     else if ( aExp < 0x3FFF ) {
5788         if (aExp || aSig0) {
5789             status->float_exception_flags |= float_flag_inexact;
5790         }
5791         return 0;
5792     }
5793     aSig0 |= LIT64( 0x0001000000000000 );
5794     shiftCount = 0x402F - aExp;
5795     savedASig = aSig0;
5796     aSig0 >>= shiftCount;
5797     z = aSig0;
5798     if ( aSign ) z = - z;
5799     if ( ( z < 0 ) ^ aSign ) {
5800  invalid:
5801         float_raise(float_flag_invalid, status);
5802         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5803     }
5804     if ( ( aSig0<<shiftCount ) != savedASig ) {
5805         status->float_exception_flags |= float_flag_inexact;
5806     }
5807     return z;
5808 
5809 }
5810 
5811 /*----------------------------------------------------------------------------
5812 | Returns the result of converting the quadruple-precision floating-point
5813 | value `a' to the 64-bit two's complement integer format.  The conversion
5814 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5815 | Arithmetic---which means in particular that the conversion is rounded
5816 | according to the current rounding mode.  If `a' is a NaN, the largest
5817 | positive integer is returned.  Otherwise, if the conversion overflows, the
5818 | largest integer with the same sign as `a' is returned.
5819 *----------------------------------------------------------------------------*/
5820 
5821 int64_t float128_to_int64(float128 a, float_status *status)
5822 {
5823     flag aSign;
5824     int32_t aExp, shiftCount;
5825     uint64_t aSig0, aSig1;
5826 
5827     aSig1 = extractFloat128Frac1( a );
5828     aSig0 = extractFloat128Frac0( a );
5829     aExp = extractFloat128Exp( a );
5830     aSign = extractFloat128Sign( a );
5831     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5832     shiftCount = 0x402F - aExp;
5833     if ( shiftCount <= 0 ) {
5834         if ( 0x403E < aExp ) {
5835             float_raise(float_flag_invalid, status);
5836             if (    ! aSign
5837                  || (    ( aExp == 0x7FFF )
5838                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5839                     )
5840                ) {
5841                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5842             }
5843             return (int64_t) LIT64( 0x8000000000000000 );
5844         }
5845         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5846     }
5847     else {
5848         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5849     }
5850     return roundAndPackInt64(aSign, aSig0, aSig1, status);
5851 
5852 }
5853 
5854 /*----------------------------------------------------------------------------
5855 | Returns the result of converting the quadruple-precision floating-point
5856 | value `a' to the 64-bit two's complement integer format.  The conversion
5857 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5858 | Arithmetic, except that the conversion is always rounded toward zero.
5859 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5860 | the conversion overflows, the largest integer with the same sign as `a' is
5861 | returned.
5862 *----------------------------------------------------------------------------*/
5863 
5864 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5865 {
5866     flag aSign;
5867     int32_t aExp, shiftCount;
5868     uint64_t aSig0, aSig1;
5869     int64_t z;
5870 
5871     aSig1 = extractFloat128Frac1( a );
5872     aSig0 = extractFloat128Frac0( a );
5873     aExp = extractFloat128Exp( a );
5874     aSign = extractFloat128Sign( a );
5875     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5876     shiftCount = aExp - 0x402F;
5877     if ( 0 < shiftCount ) {
5878         if ( 0x403E <= aExp ) {
5879             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5880             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5881                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5882                 if (aSig1) {
5883                     status->float_exception_flags |= float_flag_inexact;
5884                 }
5885             }
5886             else {
5887                 float_raise(float_flag_invalid, status);
5888                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5889                     return LIT64( 0x7FFFFFFFFFFFFFFF );
5890                 }
5891             }
5892             return (int64_t) LIT64( 0x8000000000000000 );
5893         }
5894         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5895         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5896             status->float_exception_flags |= float_flag_inexact;
5897         }
5898     }
5899     else {
5900         if ( aExp < 0x3FFF ) {
5901             if ( aExp | aSig0 | aSig1 ) {
5902                 status->float_exception_flags |= float_flag_inexact;
5903             }
5904             return 0;
5905         }
5906         z = aSig0>>( - shiftCount );
5907         if (    aSig1
5908              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5909             status->float_exception_flags |= float_flag_inexact;
5910         }
5911     }
5912     if ( aSign ) z = - z;
5913     return z;
5914 
5915 }
5916 
5917 /*----------------------------------------------------------------------------
5918 | Returns the result of converting the quadruple-precision floating-point value
5919 | `a' to the 64-bit unsigned integer format.  The conversion is
5920 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5921 | Arithmetic---which means in particular that the conversion is rounded
5922 | according to the current rounding mode.  If `a' is a NaN, the largest
5923 | positive integer is returned.  If the conversion overflows, the
5924 | largest unsigned integer is returned.  If 'a' is negative, the value is
5925 | rounded and zero is returned; negative values that do not round to zero
5926 | will raise the inexact exception.
5927 *----------------------------------------------------------------------------*/
5928 
5929 uint64_t float128_to_uint64(float128 a, float_status *status)
5930 {
5931     flag aSign;
5932     int aExp;
5933     int shiftCount;
5934     uint64_t aSig0, aSig1;
5935 
5936     aSig0 = extractFloat128Frac0(a);
5937     aSig1 = extractFloat128Frac1(a);
5938     aExp = extractFloat128Exp(a);
5939     aSign = extractFloat128Sign(a);
5940     if (aSign && (aExp > 0x3FFE)) {
5941         float_raise(float_flag_invalid, status);
5942         if (float128_is_any_nan(a)) {
5943             return LIT64(0xFFFFFFFFFFFFFFFF);
5944         } else {
5945             return 0;
5946         }
5947     }
5948     if (aExp) {
5949         aSig0 |= LIT64(0x0001000000000000);
5950     }
5951     shiftCount = 0x402F - aExp;
5952     if (shiftCount <= 0) {
5953         if (0x403E < aExp) {
5954             float_raise(float_flag_invalid, status);
5955             return LIT64(0xFFFFFFFFFFFFFFFF);
5956         }
5957         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5958     } else {
5959         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5960     }
5961     return roundAndPackUint64(aSign, aSig0, aSig1, status);
5962 }
5963 
5964 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5965 {
5966     uint64_t v;
5967     signed char current_rounding_mode = status->float_rounding_mode;
5968 
5969     set_float_rounding_mode(float_round_to_zero, status);
5970     v = float128_to_uint64(a, status);
5971     set_float_rounding_mode(current_rounding_mode, status);
5972 
5973     return v;
5974 }
5975 
5976 /*----------------------------------------------------------------------------
5977 | Returns the result of converting the quadruple-precision floating-point
5978 | value `a' to the 32-bit unsigned integer format.  The conversion
5979 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5980 | Arithmetic except that the conversion is always rounded toward zero.
5981 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
5982 | if the conversion overflows, the largest unsigned integer is returned.
5983 | If 'a' is negative, the value is rounded and zero is returned; negative
5984 | values that do not round to zero will raise the inexact exception.
5985 *----------------------------------------------------------------------------*/
5986 
5987 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5988 {
5989     uint64_t v;
5990     uint32_t res;
5991     int old_exc_flags = get_float_exception_flags(status);
5992 
5993     v = float128_to_uint64_round_to_zero(a, status);
5994     if (v > 0xffffffff) {
5995         res = 0xffffffff;
5996     } else {
5997         return v;
5998     }
5999     set_float_exception_flags(old_exc_flags, status);
6000     float_raise(float_flag_invalid, status);
6001     return res;
6002 }
6003 
6004 /*----------------------------------------------------------------------------
6005 | Returns the result of converting the quadruple-precision floating-point
6006 | value `a' to the single-precision floating-point format.  The conversion
6007 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6008 | Arithmetic.
6009 *----------------------------------------------------------------------------*/
6010 
6011 float32 float128_to_float32(float128 a, float_status *status)
6012 {
6013     flag aSign;
6014     int32_t aExp;
6015     uint64_t aSig0, aSig1;
6016     uint32_t zSig;
6017 
6018     aSig1 = extractFloat128Frac1( a );
6019     aSig0 = extractFloat128Frac0( a );
6020     aExp = extractFloat128Exp( a );
6021     aSign = extractFloat128Sign( a );
6022     if ( aExp == 0x7FFF ) {
6023         if ( aSig0 | aSig1 ) {
6024             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6025         }
6026         return packFloat32( aSign, 0xFF, 0 );
6027     }
6028     aSig0 |= ( aSig1 != 0 );
6029     shift64RightJamming( aSig0, 18, &aSig0 );
6030     zSig = aSig0;
6031     if ( aExp || zSig ) {
6032         zSig |= 0x40000000;
6033         aExp -= 0x3F81;
6034     }
6035     return roundAndPackFloat32(aSign, aExp, zSig, status);
6036 
6037 }
6038 
6039 /*----------------------------------------------------------------------------
6040 | Returns the result of converting the quadruple-precision floating-point
6041 | value `a' to the double-precision floating-point format.  The conversion
6042 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6043 | Arithmetic.
6044 *----------------------------------------------------------------------------*/
6045 
6046 float64 float128_to_float64(float128 a, float_status *status)
6047 {
6048     flag aSign;
6049     int32_t aExp;
6050     uint64_t aSig0, aSig1;
6051 
6052     aSig1 = extractFloat128Frac1( a );
6053     aSig0 = extractFloat128Frac0( a );
6054     aExp = extractFloat128Exp( a );
6055     aSign = extractFloat128Sign( a );
6056     if ( aExp == 0x7FFF ) {
6057         if ( aSig0 | aSig1 ) {
6058             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6059         }
6060         return packFloat64( aSign, 0x7FF, 0 );
6061     }
6062     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6063     aSig0 |= ( aSig1 != 0 );
6064     if ( aExp || aSig0 ) {
6065         aSig0 |= LIT64( 0x4000000000000000 );
6066         aExp -= 0x3C01;
6067     }
6068     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6069 
6070 }
6071 
6072 /*----------------------------------------------------------------------------
6073 | Returns the result of converting the quadruple-precision floating-point
6074 | value `a' to the extended double-precision floating-point format.  The
6075 | conversion is performed according to the IEC/IEEE Standard for Binary
6076 | Floating-Point Arithmetic.
6077 *----------------------------------------------------------------------------*/
6078 
6079 floatx80 float128_to_floatx80(float128 a, float_status *status)
6080 {
6081     flag aSign;
6082     int32_t aExp;
6083     uint64_t aSig0, aSig1;
6084 
6085     aSig1 = extractFloat128Frac1( a );
6086     aSig0 = extractFloat128Frac0( a );
6087     aExp = extractFloat128Exp( a );
6088     aSign = extractFloat128Sign( a );
6089     if ( aExp == 0x7FFF ) {
6090         if ( aSig0 | aSig1 ) {
6091             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6092         }
6093         return packFloatx80(aSign, floatx80_infinity_high,
6094                                    floatx80_infinity_low);
6095     }
6096     if ( aExp == 0 ) {
6097         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6098         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6099     }
6100     else {
6101         aSig0 |= LIT64( 0x0001000000000000 );
6102     }
6103     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6104     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6105 
6106 }
6107 
6108 /*----------------------------------------------------------------------------
6109 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6110 | returns the result as a quadruple-precision floating-point value.  The
6111 | operation is performed according to the IEC/IEEE Standard for Binary
6112 | Floating-Point Arithmetic.
6113 *----------------------------------------------------------------------------*/
6114 
6115 float128 float128_round_to_int(float128 a, float_status *status)
6116 {
6117     flag aSign;
6118     int32_t aExp;
6119     uint64_t lastBitMask, roundBitsMask;
6120     float128 z;
6121 
6122     aExp = extractFloat128Exp( a );
6123     if ( 0x402F <= aExp ) {
6124         if ( 0x406F <= aExp ) {
6125             if (    ( aExp == 0x7FFF )
6126                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6127                ) {
6128                 return propagateFloat128NaN(a, a, status);
6129             }
6130             return a;
6131         }
6132         lastBitMask = 1;
6133         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6134         roundBitsMask = lastBitMask - 1;
6135         z = a;
6136         switch (status->float_rounding_mode) {
6137         case float_round_nearest_even:
6138             if ( lastBitMask ) {
6139                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6140                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6141             }
6142             else {
6143                 if ( (int64_t) z.low < 0 ) {
6144                     ++z.high;
6145                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6146                 }
6147             }
6148             break;
6149         case float_round_ties_away:
6150             if (lastBitMask) {
6151                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6152             } else {
6153                 if ((int64_t) z.low < 0) {
6154                     ++z.high;
6155                 }
6156             }
6157             break;
6158         case float_round_to_zero:
6159             break;
6160         case float_round_up:
6161             if (!extractFloat128Sign(z)) {
6162                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6163             }
6164             break;
6165         case float_round_down:
6166             if (extractFloat128Sign(z)) {
6167                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6168             }
6169             break;
6170         default:
6171             abort();
6172         }
6173         z.low &= ~ roundBitsMask;
6174     }
6175     else {
6176         if ( aExp < 0x3FFF ) {
6177             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6178             status->float_exception_flags |= float_flag_inexact;
6179             aSign = extractFloat128Sign( a );
6180             switch (status->float_rounding_mode) {
6181              case float_round_nearest_even:
6182                 if (    ( aExp == 0x3FFE )
6183                      && (   extractFloat128Frac0( a )
6184                           | extractFloat128Frac1( a ) )
6185                    ) {
6186                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6187                 }
6188                 break;
6189             case float_round_ties_away:
6190                 if (aExp == 0x3FFE) {
6191                     return packFloat128(aSign, 0x3FFF, 0, 0);
6192                 }
6193                 break;
6194              case float_round_down:
6195                 return
6196                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6197                     : packFloat128( 0, 0, 0, 0 );
6198              case float_round_up:
6199                 return
6200                       aSign ? packFloat128( 1, 0, 0, 0 )
6201                     : packFloat128( 0, 0x3FFF, 0, 0 );
6202             }
6203             return packFloat128( aSign, 0, 0, 0 );
6204         }
6205         lastBitMask = 1;
6206         lastBitMask <<= 0x402F - aExp;
6207         roundBitsMask = lastBitMask - 1;
6208         z.low = 0;
6209         z.high = a.high;
6210         switch (status->float_rounding_mode) {
6211         case float_round_nearest_even:
6212             z.high += lastBitMask>>1;
6213             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6214                 z.high &= ~ lastBitMask;
6215             }
6216             break;
6217         case float_round_ties_away:
6218             z.high += lastBitMask>>1;
6219             break;
6220         case float_round_to_zero:
6221             break;
6222         case float_round_up:
6223             if (!extractFloat128Sign(z)) {
6224                 z.high |= ( a.low != 0 );
6225                 z.high += roundBitsMask;
6226             }
6227             break;
6228         case float_round_down:
6229             if (extractFloat128Sign(z)) {
6230                 z.high |= (a.low != 0);
6231                 z.high += roundBitsMask;
6232             }
6233             break;
6234         default:
6235             abort();
6236         }
6237         z.high &= ~ roundBitsMask;
6238     }
6239     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6240         status->float_exception_flags |= float_flag_inexact;
6241     }
6242     return z;
6243 
6244 }
6245 
6246 /*----------------------------------------------------------------------------
6247 | Returns the result of adding the absolute values of the quadruple-precision
6248 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6249 | before being returned.  `zSign' is ignored if the result is a NaN.
6250 | The addition is performed according to the IEC/IEEE Standard for Binary
6251 | Floating-Point Arithmetic.
6252 *----------------------------------------------------------------------------*/
6253 
6254 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6255                                 float_status *status)
6256 {
6257     int32_t aExp, bExp, zExp;
6258     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6259     int32_t expDiff;
6260 
6261     aSig1 = extractFloat128Frac1( a );
6262     aSig0 = extractFloat128Frac0( a );
6263     aExp = extractFloat128Exp( a );
6264     bSig1 = extractFloat128Frac1( b );
6265     bSig0 = extractFloat128Frac0( b );
6266     bExp = extractFloat128Exp( b );
6267     expDiff = aExp - bExp;
6268     if ( 0 < expDiff ) {
6269         if ( aExp == 0x7FFF ) {
6270             if (aSig0 | aSig1) {
6271                 return propagateFloat128NaN(a, b, status);
6272             }
6273             return a;
6274         }
6275         if ( bExp == 0 ) {
6276             --expDiff;
6277         }
6278         else {
6279             bSig0 |= LIT64( 0x0001000000000000 );
6280         }
6281         shift128ExtraRightJamming(
6282             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6283         zExp = aExp;
6284     }
6285     else if ( expDiff < 0 ) {
6286         if ( bExp == 0x7FFF ) {
6287             if (bSig0 | bSig1) {
6288                 return propagateFloat128NaN(a, b, status);
6289             }
6290             return packFloat128( zSign, 0x7FFF, 0, 0 );
6291         }
6292         if ( aExp == 0 ) {
6293             ++expDiff;
6294         }
6295         else {
6296             aSig0 |= LIT64( 0x0001000000000000 );
6297         }
6298         shift128ExtraRightJamming(
6299             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6300         zExp = bExp;
6301     }
6302     else {
6303         if ( aExp == 0x7FFF ) {
6304             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6305                 return propagateFloat128NaN(a, b, status);
6306             }
6307             return a;
6308         }
6309         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6310         if ( aExp == 0 ) {
6311             if (status->flush_to_zero) {
6312                 if (zSig0 | zSig1) {
6313                     float_raise(float_flag_output_denormal, status);
6314                 }
6315                 return packFloat128(zSign, 0, 0, 0);
6316             }
6317             return packFloat128( zSign, 0, zSig0, zSig1 );
6318         }
6319         zSig2 = 0;
6320         zSig0 |= LIT64( 0x0002000000000000 );
6321         zExp = aExp;
6322         goto shiftRight1;
6323     }
6324     aSig0 |= LIT64( 0x0001000000000000 );
6325     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6326     --zExp;
6327     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6328     ++zExp;
6329  shiftRight1:
6330     shift128ExtraRightJamming(
6331         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6332  roundAndPack:
6333     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6334 
6335 }
6336 
6337 /*----------------------------------------------------------------------------
6338 | Returns the result of subtracting the absolute values of the quadruple-
6339 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6340 | difference is negated before being returned.  `zSign' is ignored if the
6341 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6342 | Standard for Binary Floating-Point Arithmetic.
6343 *----------------------------------------------------------------------------*/
6344 
6345 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6346                                 float_status *status)
6347 {
6348     int32_t aExp, bExp, zExp;
6349     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6350     int32_t expDiff;
6351 
6352     aSig1 = extractFloat128Frac1( a );
6353     aSig0 = extractFloat128Frac0( a );
6354     aExp = extractFloat128Exp( a );
6355     bSig1 = extractFloat128Frac1( b );
6356     bSig0 = extractFloat128Frac0( b );
6357     bExp = extractFloat128Exp( b );
6358     expDiff = aExp - bExp;
6359     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6360     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6361     if ( 0 < expDiff ) goto aExpBigger;
6362     if ( expDiff < 0 ) goto bExpBigger;
6363     if ( aExp == 0x7FFF ) {
6364         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6365             return propagateFloat128NaN(a, b, status);
6366         }
6367         float_raise(float_flag_invalid, status);
6368         return float128_default_nan(status);
6369     }
6370     if ( aExp == 0 ) {
6371         aExp = 1;
6372         bExp = 1;
6373     }
6374     if ( bSig0 < aSig0 ) goto aBigger;
6375     if ( aSig0 < bSig0 ) goto bBigger;
6376     if ( bSig1 < aSig1 ) goto aBigger;
6377     if ( aSig1 < bSig1 ) goto bBigger;
6378     return packFloat128(status->float_rounding_mode == float_round_down,
6379                         0, 0, 0);
6380  bExpBigger:
6381     if ( bExp == 0x7FFF ) {
6382         if (bSig0 | bSig1) {
6383             return propagateFloat128NaN(a, b, status);
6384         }
6385         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6386     }
6387     if ( aExp == 0 ) {
6388         ++expDiff;
6389     }
6390     else {
6391         aSig0 |= LIT64( 0x4000000000000000 );
6392     }
6393     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6394     bSig0 |= LIT64( 0x4000000000000000 );
6395  bBigger:
6396     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6397     zExp = bExp;
6398     zSign ^= 1;
6399     goto normalizeRoundAndPack;
6400  aExpBigger:
6401     if ( aExp == 0x7FFF ) {
6402         if (aSig0 | aSig1) {
6403             return propagateFloat128NaN(a, b, status);
6404         }
6405         return a;
6406     }
6407     if ( bExp == 0 ) {
6408         --expDiff;
6409     }
6410     else {
6411         bSig0 |= LIT64( 0x4000000000000000 );
6412     }
6413     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6414     aSig0 |= LIT64( 0x4000000000000000 );
6415  aBigger:
6416     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6417     zExp = aExp;
6418  normalizeRoundAndPack:
6419     --zExp;
6420     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6421                                          status);
6422 
6423 }
6424 
6425 /*----------------------------------------------------------------------------
6426 | Returns the result of adding the quadruple-precision floating-point values
6427 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6428 | for Binary Floating-Point Arithmetic.
6429 *----------------------------------------------------------------------------*/
6430 
6431 float128 float128_add(float128 a, float128 b, float_status *status)
6432 {
6433     flag aSign, bSign;
6434 
6435     aSign = extractFloat128Sign( a );
6436     bSign = extractFloat128Sign( b );
6437     if ( aSign == bSign ) {
6438         return addFloat128Sigs(a, b, aSign, status);
6439     }
6440     else {
6441         return subFloat128Sigs(a, b, aSign, status);
6442     }
6443 
6444 }
6445 
6446 /*----------------------------------------------------------------------------
6447 | Returns the result of subtracting the quadruple-precision floating-point
6448 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6449 | Standard for Binary Floating-Point Arithmetic.
6450 *----------------------------------------------------------------------------*/
6451 
6452 float128 float128_sub(float128 a, float128 b, float_status *status)
6453 {
6454     flag aSign, bSign;
6455 
6456     aSign = extractFloat128Sign( a );
6457     bSign = extractFloat128Sign( b );
6458     if ( aSign == bSign ) {
6459         return subFloat128Sigs(a, b, aSign, status);
6460     }
6461     else {
6462         return addFloat128Sigs(a, b, aSign, status);
6463     }
6464 
6465 }
6466 
6467 /*----------------------------------------------------------------------------
6468 | Returns the result of multiplying the quadruple-precision floating-point
6469 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6470 | Standard for Binary Floating-Point Arithmetic.
6471 *----------------------------------------------------------------------------*/
6472 
6473 float128 float128_mul(float128 a, float128 b, float_status *status)
6474 {
6475     flag aSign, bSign, zSign;
6476     int32_t aExp, bExp, zExp;
6477     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6478 
6479     aSig1 = extractFloat128Frac1( a );
6480     aSig0 = extractFloat128Frac0( a );
6481     aExp = extractFloat128Exp( a );
6482     aSign = extractFloat128Sign( a );
6483     bSig1 = extractFloat128Frac1( b );
6484     bSig0 = extractFloat128Frac0( b );
6485     bExp = extractFloat128Exp( b );
6486     bSign = extractFloat128Sign( b );
6487     zSign = aSign ^ bSign;
6488     if ( aExp == 0x7FFF ) {
6489         if (    ( aSig0 | aSig1 )
6490              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6491             return propagateFloat128NaN(a, b, status);
6492         }
6493         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6494         return packFloat128( zSign, 0x7FFF, 0, 0 );
6495     }
6496     if ( bExp == 0x7FFF ) {
6497         if (bSig0 | bSig1) {
6498             return propagateFloat128NaN(a, b, status);
6499         }
6500         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6501  invalid:
6502             float_raise(float_flag_invalid, status);
6503             return float128_default_nan(status);
6504         }
6505         return packFloat128( zSign, 0x7FFF, 0, 0 );
6506     }
6507     if ( aExp == 0 ) {
6508         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6509         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6510     }
6511     if ( bExp == 0 ) {
6512         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6513         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6514     }
6515     zExp = aExp + bExp - 0x4000;
6516     aSig0 |= LIT64( 0x0001000000000000 );
6517     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6518     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6519     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6520     zSig2 |= ( zSig3 != 0 );
6521     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6522         shift128ExtraRightJamming(
6523             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6524         ++zExp;
6525     }
6526     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6527 
6528 }
6529 
6530 /*----------------------------------------------------------------------------
6531 | Returns the result of dividing the quadruple-precision floating-point value
6532 | `a' by the corresponding value `b'.  The operation is performed according to
6533 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6534 *----------------------------------------------------------------------------*/
6535 
6536 float128 float128_div(float128 a, float128 b, float_status *status)
6537 {
6538     flag aSign, bSign, zSign;
6539     int32_t aExp, bExp, zExp;
6540     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6541     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6542 
6543     aSig1 = extractFloat128Frac1( a );
6544     aSig0 = extractFloat128Frac0( a );
6545     aExp = extractFloat128Exp( a );
6546     aSign = extractFloat128Sign( a );
6547     bSig1 = extractFloat128Frac1( b );
6548     bSig0 = extractFloat128Frac0( b );
6549     bExp = extractFloat128Exp( b );
6550     bSign = extractFloat128Sign( b );
6551     zSign = aSign ^ bSign;
6552     if ( aExp == 0x7FFF ) {
6553         if (aSig0 | aSig1) {
6554             return propagateFloat128NaN(a, b, status);
6555         }
6556         if ( bExp == 0x7FFF ) {
6557             if (bSig0 | bSig1) {
6558                 return propagateFloat128NaN(a, b, status);
6559             }
6560             goto invalid;
6561         }
6562         return packFloat128( zSign, 0x7FFF, 0, 0 );
6563     }
6564     if ( bExp == 0x7FFF ) {
6565         if (bSig0 | bSig1) {
6566             return propagateFloat128NaN(a, b, status);
6567         }
6568         return packFloat128( zSign, 0, 0, 0 );
6569     }
6570     if ( bExp == 0 ) {
6571         if ( ( bSig0 | bSig1 ) == 0 ) {
6572             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6573  invalid:
6574                 float_raise(float_flag_invalid, status);
6575                 return float128_default_nan(status);
6576             }
6577             float_raise(float_flag_divbyzero, status);
6578             return packFloat128( zSign, 0x7FFF, 0, 0 );
6579         }
6580         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6581     }
6582     if ( aExp == 0 ) {
6583         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6584         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6585     }
6586     zExp = aExp - bExp + 0x3FFD;
6587     shortShift128Left(
6588         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6589     shortShift128Left(
6590         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6591     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6592         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6593         ++zExp;
6594     }
6595     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6596     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6597     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6598     while ( (int64_t) rem0 < 0 ) {
6599         --zSig0;
6600         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6601     }
6602     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6603     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6604         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6605         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6606         while ( (int64_t) rem1 < 0 ) {
6607             --zSig1;
6608             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6609         }
6610         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6611     }
6612     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6613     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6614 
6615 }
6616 
6617 /*----------------------------------------------------------------------------
6618 | Returns the remainder of the quadruple-precision floating-point value `a'
6619 | with respect to the corresponding value `b'.  The operation is performed
6620 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6621 *----------------------------------------------------------------------------*/
6622 
6623 float128 float128_rem(float128 a, float128 b, float_status *status)
6624 {
6625     flag aSign, zSign;
6626     int32_t aExp, bExp, expDiff;
6627     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6628     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6629     int64_t sigMean0;
6630 
6631     aSig1 = extractFloat128Frac1( a );
6632     aSig0 = extractFloat128Frac0( a );
6633     aExp = extractFloat128Exp( a );
6634     aSign = extractFloat128Sign( a );
6635     bSig1 = extractFloat128Frac1( b );
6636     bSig0 = extractFloat128Frac0( b );
6637     bExp = extractFloat128Exp( b );
6638     if ( aExp == 0x7FFF ) {
6639         if (    ( aSig0 | aSig1 )
6640              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6641             return propagateFloat128NaN(a, b, status);
6642         }
6643         goto invalid;
6644     }
6645     if ( bExp == 0x7FFF ) {
6646         if (bSig0 | bSig1) {
6647             return propagateFloat128NaN(a, b, status);
6648         }
6649         return a;
6650     }
6651     if ( bExp == 0 ) {
6652         if ( ( bSig0 | bSig1 ) == 0 ) {
6653  invalid:
6654             float_raise(float_flag_invalid, status);
6655             return float128_default_nan(status);
6656         }
6657         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6658     }
6659     if ( aExp == 0 ) {
6660         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6661         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6662     }
6663     expDiff = aExp - bExp;
6664     if ( expDiff < -1 ) return a;
6665     shortShift128Left(
6666         aSig0 | LIT64( 0x0001000000000000 ),
6667         aSig1,
6668         15 - ( expDiff < 0 ),
6669         &aSig0,
6670         &aSig1
6671     );
6672     shortShift128Left(
6673         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6674     q = le128( bSig0, bSig1, aSig0, aSig1 );
6675     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6676     expDiff -= 64;
6677     while ( 0 < expDiff ) {
6678         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6679         q = ( 4 < q ) ? q - 4 : 0;
6680         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6681         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6682         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6683         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6684         expDiff -= 61;
6685     }
6686     if ( -64 < expDiff ) {
6687         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6688         q = ( 4 < q ) ? q - 4 : 0;
6689         q >>= - expDiff;
6690         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6691         expDiff += 52;
6692         if ( expDiff < 0 ) {
6693             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6694         }
6695         else {
6696             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6697         }
6698         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6699         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6700     }
6701     else {
6702         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6703         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6704     }
6705     do {
6706         alternateASig0 = aSig0;
6707         alternateASig1 = aSig1;
6708         ++q;
6709         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6710     } while ( 0 <= (int64_t) aSig0 );
6711     add128(
6712         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6713     if (    ( sigMean0 < 0 )
6714          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6715         aSig0 = alternateASig0;
6716         aSig1 = alternateASig1;
6717     }
6718     zSign = ( (int64_t) aSig0 < 0 );
6719     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6720     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6721                                          status);
6722 }
6723 
6724 /*----------------------------------------------------------------------------
6725 | Returns the square root of the quadruple-precision floating-point value `a'.
6726 | The operation is performed according to the IEC/IEEE Standard for Binary
6727 | Floating-Point Arithmetic.
6728 *----------------------------------------------------------------------------*/
6729 
6730 float128 float128_sqrt(float128 a, float_status *status)
6731 {
6732     flag aSign;
6733     int32_t aExp, zExp;
6734     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6735     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6736 
6737     aSig1 = extractFloat128Frac1( a );
6738     aSig0 = extractFloat128Frac0( a );
6739     aExp = extractFloat128Exp( a );
6740     aSign = extractFloat128Sign( a );
6741     if ( aExp == 0x7FFF ) {
6742         if (aSig0 | aSig1) {
6743             return propagateFloat128NaN(a, a, status);
6744         }
6745         if ( ! aSign ) return a;
6746         goto invalid;
6747     }
6748     if ( aSign ) {
6749         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6750  invalid:
6751         float_raise(float_flag_invalid, status);
6752         return float128_default_nan(status);
6753     }
6754     if ( aExp == 0 ) {
6755         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6756         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6757     }
6758     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6759     aSig0 |= LIT64( 0x0001000000000000 );
6760     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6761     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6762     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6763     doubleZSig0 = zSig0<<1;
6764     mul64To128( zSig0, zSig0, &term0, &term1 );
6765     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6766     while ( (int64_t) rem0 < 0 ) {
6767         --zSig0;
6768         doubleZSig0 -= 2;
6769         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6770     }
6771     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6772     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6773         if ( zSig1 == 0 ) zSig1 = 1;
6774         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6775         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6776         mul64To128( zSig1, zSig1, &term2, &term3 );
6777         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6778         while ( (int64_t) rem1 < 0 ) {
6779             --zSig1;
6780             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6781             term3 |= 1;
6782             term2 |= doubleZSig0;
6783             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6784         }
6785         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6786     }
6787     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6788     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6789 
6790 }
6791 
6792 /*----------------------------------------------------------------------------
6793 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6794 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6795 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6796 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6797 *----------------------------------------------------------------------------*/
6798 
6799 int float128_eq(float128 a, float128 b, float_status *status)
6800 {
6801 
6802     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6803               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6804          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6805               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6806        ) {
6807         float_raise(float_flag_invalid, status);
6808         return 0;
6809     }
6810     return
6811            ( a.low == b.low )
6812         && (    ( a.high == b.high )
6813              || (    ( a.low == 0 )
6814                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6815            );
6816 
6817 }
6818 
6819 /*----------------------------------------------------------------------------
6820 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6821 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6822 | exception is raised if either operand is a NaN.  The comparison is performed
6823 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6824 *----------------------------------------------------------------------------*/
6825 
6826 int float128_le(float128 a, float128 b, float_status *status)
6827 {
6828     flag aSign, bSign;
6829 
6830     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6831               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6832          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6833               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6834        ) {
6835         float_raise(float_flag_invalid, status);
6836         return 0;
6837     }
6838     aSign = extractFloat128Sign( a );
6839     bSign = extractFloat128Sign( b );
6840     if ( aSign != bSign ) {
6841         return
6842                aSign
6843             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6844                  == 0 );
6845     }
6846     return
6847           aSign ? le128( b.high, b.low, a.high, a.low )
6848         : le128( a.high, a.low, b.high, b.low );
6849 
6850 }
6851 
6852 /*----------------------------------------------------------------------------
6853 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6854 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6855 | raised if either operand is a NaN.  The comparison is performed according
6856 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6857 *----------------------------------------------------------------------------*/
6858 
6859 int float128_lt(float128 a, float128 b, float_status *status)
6860 {
6861     flag aSign, bSign;
6862 
6863     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6864               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6865          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6866               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6867        ) {
6868         float_raise(float_flag_invalid, status);
6869         return 0;
6870     }
6871     aSign = extractFloat128Sign( a );
6872     bSign = extractFloat128Sign( b );
6873     if ( aSign != bSign ) {
6874         return
6875                aSign
6876             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6877                  != 0 );
6878     }
6879     return
6880           aSign ? lt128( b.high, b.low, a.high, a.low )
6881         : lt128( a.high, a.low, b.high, b.low );
6882 
6883 }
6884 
6885 /*----------------------------------------------------------------------------
6886 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6887 | be compared, and 0 otherwise.  The invalid exception is raised if either
6888 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6889 | Standard for Binary Floating-Point Arithmetic.
6890 *----------------------------------------------------------------------------*/
6891 
6892 int float128_unordered(float128 a, float128 b, float_status *status)
6893 {
6894     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6895               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6896          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6897               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6898        ) {
6899         float_raise(float_flag_invalid, status);
6900         return 1;
6901     }
6902     return 0;
6903 }
6904 
6905 /*----------------------------------------------------------------------------
6906 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6907 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6908 | exception.  The comparison is performed according to the IEC/IEEE Standard
6909 | for Binary Floating-Point Arithmetic.
6910 *----------------------------------------------------------------------------*/
6911 
6912 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6913 {
6914 
6915     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6916               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6917          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6918               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6919        ) {
6920         if (float128_is_signaling_nan(a, status)
6921          || float128_is_signaling_nan(b, status)) {
6922             float_raise(float_flag_invalid, status);
6923         }
6924         return 0;
6925     }
6926     return
6927            ( a.low == b.low )
6928         && (    ( a.high == b.high )
6929              || (    ( a.low == 0 )
6930                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6931            );
6932 
6933 }
6934 
6935 /*----------------------------------------------------------------------------
6936 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6937 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6938 | cause an exception.  Otherwise, the comparison is performed according to the
6939 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6940 *----------------------------------------------------------------------------*/
6941 
6942 int float128_le_quiet(float128 a, float128 b, float_status *status)
6943 {
6944     flag aSign, bSign;
6945 
6946     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6947               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6948          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6949               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6950        ) {
6951         if (float128_is_signaling_nan(a, status)
6952          || float128_is_signaling_nan(b, status)) {
6953             float_raise(float_flag_invalid, status);
6954         }
6955         return 0;
6956     }
6957     aSign = extractFloat128Sign( a );
6958     bSign = extractFloat128Sign( b );
6959     if ( aSign != bSign ) {
6960         return
6961                aSign
6962             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6963                  == 0 );
6964     }
6965     return
6966           aSign ? le128( b.high, b.low, a.high, a.low )
6967         : le128( a.high, a.low, b.high, b.low );
6968 
6969 }
6970 
6971 /*----------------------------------------------------------------------------
6972 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6973 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6974 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6975 | Standard for Binary Floating-Point Arithmetic.
6976 *----------------------------------------------------------------------------*/
6977 
6978 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6979 {
6980     flag aSign, bSign;
6981 
6982     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6983               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6984          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6985               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6986        ) {
6987         if (float128_is_signaling_nan(a, status)
6988          || float128_is_signaling_nan(b, status)) {
6989             float_raise(float_flag_invalid, status);
6990         }
6991         return 0;
6992     }
6993     aSign = extractFloat128Sign( a );
6994     bSign = extractFloat128Sign( b );
6995     if ( aSign != bSign ) {
6996         return
6997                aSign
6998             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6999                  != 0 );
7000     }
7001     return
7002           aSign ? lt128( b.high, b.low, a.high, a.low )
7003         : lt128( a.high, a.low, b.high, b.low );
7004 
7005 }
7006 
7007 /*----------------------------------------------------------------------------
7008 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7009 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7010 | comparison is performed according to the IEC/IEEE Standard for Binary
7011 | Floating-Point Arithmetic.
7012 *----------------------------------------------------------------------------*/
7013 
7014 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7015 {
7016     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7017               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7018          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7019               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7020        ) {
7021         if (float128_is_signaling_nan(a, status)
7022          || float128_is_signaling_nan(b, status)) {
7023             float_raise(float_flag_invalid, status);
7024         }
7025         return 1;
7026     }
7027     return 0;
7028 }
7029 
7030 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7031                                             int is_quiet, float_status *status)
7032 {
7033     flag aSign, bSign;
7034 
7035     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7036         float_raise(float_flag_invalid, status);
7037         return float_relation_unordered;
7038     }
7039     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7040           ( extractFloatx80Frac( a )<<1 ) ) ||
7041         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7042           ( extractFloatx80Frac( b )<<1 ) )) {
7043         if (!is_quiet ||
7044             floatx80_is_signaling_nan(a, status) ||
7045             floatx80_is_signaling_nan(b, status)) {
7046             float_raise(float_flag_invalid, status);
7047         }
7048         return float_relation_unordered;
7049     }
7050     aSign = extractFloatx80Sign( a );
7051     bSign = extractFloatx80Sign( b );
7052     if ( aSign != bSign ) {
7053 
7054         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7055              ( ( a.low | b.low ) == 0 ) ) {
7056             /* zero case */
7057             return float_relation_equal;
7058         } else {
7059             return 1 - (2 * aSign);
7060         }
7061     } else {
7062         if (a.low == b.low && a.high == b.high) {
7063             return float_relation_equal;
7064         } else {
7065             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7066         }
7067     }
7068 }
7069 
7070 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7071 {
7072     return floatx80_compare_internal(a, b, 0, status);
7073 }
7074 
7075 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7076 {
7077     return floatx80_compare_internal(a, b, 1, status);
7078 }
7079 
7080 static inline int float128_compare_internal(float128 a, float128 b,
7081                                             int is_quiet, float_status *status)
7082 {
7083     flag aSign, bSign;
7084 
7085     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7086           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7087         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7088           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7089         if (!is_quiet ||
7090             float128_is_signaling_nan(a, status) ||
7091             float128_is_signaling_nan(b, status)) {
7092             float_raise(float_flag_invalid, status);
7093         }
7094         return float_relation_unordered;
7095     }
7096     aSign = extractFloat128Sign( a );
7097     bSign = extractFloat128Sign( b );
7098     if ( aSign != bSign ) {
7099         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7100             /* zero case */
7101             return float_relation_equal;
7102         } else {
7103             return 1 - (2 * aSign);
7104         }
7105     } else {
7106         if (a.low == b.low && a.high == b.high) {
7107             return float_relation_equal;
7108         } else {
7109             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7110         }
7111     }
7112 }
7113 
7114 int float128_compare(float128 a, float128 b, float_status *status)
7115 {
7116     return float128_compare_internal(a, b, 0, status);
7117 }
7118 
7119 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7120 {
7121     return float128_compare_internal(a, b, 1, status);
7122 }
7123 
7124 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7125 {
7126     flag aSign;
7127     int32_t aExp;
7128     uint64_t aSig;
7129 
7130     if (floatx80_invalid_encoding(a)) {
7131         float_raise(float_flag_invalid, status);
7132         return floatx80_default_nan(status);
7133     }
7134     aSig = extractFloatx80Frac( a );
7135     aExp = extractFloatx80Exp( a );
7136     aSign = extractFloatx80Sign( a );
7137 
7138     if ( aExp == 0x7FFF ) {
7139         if ( aSig<<1 ) {
7140             return propagateFloatx80NaN(a, a, status);
7141         }
7142         return a;
7143     }
7144 
7145     if (aExp == 0) {
7146         if (aSig == 0) {
7147             return a;
7148         }
7149         aExp++;
7150     }
7151 
7152     if (n > 0x10000) {
7153         n = 0x10000;
7154     } else if (n < -0x10000) {
7155         n = -0x10000;
7156     }
7157 
7158     aExp += n;
7159     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7160                                          aSign, aExp, aSig, 0, status);
7161 }
7162 
7163 float128 float128_scalbn(float128 a, int n, float_status *status)
7164 {
7165     flag aSign;
7166     int32_t aExp;
7167     uint64_t aSig0, aSig1;
7168 
7169     aSig1 = extractFloat128Frac1( a );
7170     aSig0 = extractFloat128Frac0( a );
7171     aExp = extractFloat128Exp( a );
7172     aSign = extractFloat128Sign( a );
7173     if ( aExp == 0x7FFF ) {
7174         if ( aSig0 | aSig1 ) {
7175             return propagateFloat128NaN(a, a, status);
7176         }
7177         return a;
7178     }
7179     if (aExp != 0) {
7180         aSig0 |= LIT64( 0x0001000000000000 );
7181     } else if (aSig0 == 0 && aSig1 == 0) {
7182         return a;
7183     } else {
7184         aExp++;
7185     }
7186 
7187     if (n > 0x10000) {
7188         n = 0x10000;
7189     } else if (n < -0x10000) {
7190         n = -0x10000;
7191     }
7192 
7193     aExp += n - 1;
7194     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7195                                          , status);
7196 
7197 }
7198