xref: /qemu/fpu/softfloat.c (revision ccf770ba7396c240ca8a1564740083742dd04c08)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89 
90 /* We only need stdlib for abort() */
91 
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations.  (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98 
99 /*
100  * Hardfloat
101  *
102  * Fast emulation of guest FP instructions is challenging for two reasons.
103  * First, FP instruction semantics are similar but not identical, particularly
104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
105  * exception flags is not trivial: reading the host's flags register with a
106  * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107  * and trapping on every FP exception is not fast nor pleasant to work with.
108  *
109  * We address these challenges by leveraging the host FPU for a subset of the
110  * operations. To do this we expand on the idea presented in this paper:
111  *
112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114  *
115  * The idea is thus to leverage the host FPU to (1) compute FP operations
116  * and (2) identify whether FP exceptions occurred while avoiding
117  * expensive exception flag register accesses.
118  *
119  * An important optimization shown in the paper is that given that exception
120  * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121  * This is particularly useful for the inexact flag, which is very frequently
122  * raised in floating-point workloads.
123  *
124  * We optimize the code further by deferring to soft-fp whenever FP exception
125  * detection might get hairy. Two examples: (1) when at least one operand is
126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127  * and the result is < the minimum normal.
128  */
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
130     static inline void name(soft_t *a, float_status *s)                 \
131     {                                                                   \
132         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
133             *a = soft_t ## _set_sign(soft_t ## _zero,                   \
134                                      soft_t ## _is_neg(*a));            \
135             s->float_exception_flags |= float_flag_input_denormal;      \
136         }                                                               \
137     }
138 
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
141 #undef GEN_INPUT_FLUSH__NOCHECK
142 
143 #define GEN_INPUT_FLUSH1(name, soft_t)                  \
144     static inline void name(soft_t *a, float_status *s) \
145     {                                                   \
146         if (likely(!s->flush_inputs_to_zero)) {         \
147             return;                                     \
148         }                                               \
149         soft_t ## _input_flush__nocheck(a, s);          \
150     }
151 
152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
154 #undef GEN_INPUT_FLUSH1
155 
156 #define GEN_INPUT_FLUSH2(name, soft_t)                                  \
157     static inline void name(soft_t *a, soft_t *b, float_status *s)      \
158     {                                                                   \
159         if (likely(!s->flush_inputs_to_zero)) {                         \
160             return;                                                     \
161         }                                                               \
162         soft_t ## _input_flush__nocheck(a, s);                          \
163         soft_t ## _input_flush__nocheck(b, s);                          \
164     }
165 
166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
168 #undef GEN_INPUT_FLUSH2
169 
170 #define GEN_INPUT_FLUSH3(name, soft_t)                                  \
171     static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
172     {                                                                   \
173         if (likely(!s->flush_inputs_to_zero)) {                         \
174             return;                                                     \
175         }                                                               \
176         soft_t ## _input_flush__nocheck(a, s);                          \
177         soft_t ## _input_flush__nocheck(b, s);                          \
178         soft_t ## _input_flush__nocheck(c, s);                          \
179     }
180 
181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
183 #undef GEN_INPUT_FLUSH3
184 
185 /*
186  * Choose whether to use fpclassify or float32/64_* primitives in the generated
187  * hardfloat functions. Each combination of number of inputs and float size
188  * gets its own value.
189  */
/* The single-precision helpers never use fpclassify, whatever the host. */
#define QEMU_HARDFLOAT_1F32_USE_FP 0
#define QEMU_HARDFLOAT_2F32_USE_FP 0
#define QEMU_HARDFLOAT_3F32_USE_FP 0

/* The double-precision helpers use fpclassify on x86_64 hosts only. */
#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif
205 
206 /*
207  * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208  * float{32,64}_is_infinity when !USE_FP.
209  * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210  * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
211  */
/* Only x86_64 and aarch64 hosts take the isinf() path. */
#if !defined(__x86_64__) && !defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF   0
#else
# define QEMU_HARDFLOAT_USE_ISINF   1
#endif
217 
218 /*
219  * Some targets clear the FP flags before most FP operations. This prevents
220  * the use of hardfloat, since hardfloat relies on the inexact flag being
221  * already set.
222  */
/* Tell the builder why hardfloat is being switched off under -ffast-math. */
#if defined(__FAST_MATH__)
# warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
#endif

/*
 * QEMU_NO_HARDFLOAT is a 0/1 constant tested by can_use_fpu().
 * QEMU_SOFTFLOAT_ATTR additionally carries noinline only when hardfloat
 * is enabled.
 */
#if !defined(TARGET_PPC) && !defined(__FAST_MATH__)
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#else
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#endif
234 
235 static inline bool can_use_fpu(const float_status *s)
236 {
237     if (QEMU_NO_HARDFLOAT) {
238         return false;
239     }
240     return likely(s->float_exception_flags & float_flag_inexact &&
241                   s->float_rounding_mode == float_round_nearest_even);
242 }
243 
244 /*
245  * Hardfloat generation functions. Each operation can have two flavors:
246  * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247  * most condition checks, or native ones (e.g. fpclassify).
248  *
249  * The flavor is chosen by the callers. Instead of using macros, we rely on the
250  * compiler to propagate constants and inline everything into the callers.
251  *
252  * We only generate functions for operations with two inputs, since only
253  * these are common enough to justify consolidating them into common code.
254  */
255 
/*
 * Overlay of the softfloat and host representations of a single-precision
 * value, so the same bits can be handed to either implementation.
 */
typedef union {
    float32 s;  /* softfloat view */
    float h;    /* host view */
} union_float32;
260 
/*
 * Overlay of the softfloat and host representations of a double-precision
 * value, so the same bits can be handed to either implementation.
 */
typedef union {
    float64 s;  /* softfloat view */
    double h;   /* host view */
} union_float64;
265 
/* Predicates over a pair of operands, used as the pre/post/fast-path tests. */
typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

/* Two-operand operations: softfloat flavour (takes the status)... */
typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
/* ...and host flavour (plain host arithmetic, no status). */
typedef float   (*hard_f32_op2_fn)(float a, float b);
typedef double  (*hard_f64_op2_fn)(double a, double b);
273 
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
278         /*
279          * Not using a temp variable for consecutive fpclassify calls ends up
280          * generating faster code.
281          */
282         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284     }
285     return float32_is_zero_or_normal(a.s) &&
286            float32_is_zero_or_normal(b.s);
287 }
288 
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291     if (QEMU_HARDFLOAT_2F64_USE_FP) {
292         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294     }
295     return float64_is_zero_or_normal(a.s) &&
296            float64_is_zero_or_normal(b.s);
297 }
298 
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303     if (QEMU_HARDFLOAT_3F32_USE_FP) {
304         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307     }
308     return float32_is_zero_or_normal(a.s) &&
309            float32_is_zero_or_normal(b.s) &&
310            float32_is_zero_or_normal(c.s);
311 }
312 
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316     if (QEMU_HARDFLOAT_3F64_USE_FP) {
317         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320     }
321     return float64_is_zero_or_normal(a.s) &&
322            float64_is_zero_or_normal(b.s) &&
323            float64_is_zero_or_normal(c.s);
324 }
325 
326 static inline bool f32_is_inf(union_float32 a)
327 {
328     if (QEMU_HARDFLOAT_USE_ISINF) {
329         return isinf(a.h);
330     }
331     return float32_is_infinity(a.s);
332 }
333 
334 static inline bool f64_is_inf(union_float64 a)
335 {
336     if (QEMU_HARDFLOAT_USE_ISINF) {
337         return isinf(a.h);
338     }
339     return float64_is_infinity(a.s);
340 }
341 
342 /* Note: @fast_test and @post can be NULL */
343 static inline float32
344 float32_gen2(float32 xa, float32 xb, float_status *s,
345              hard_f32_op2_fn hard, soft_f32_op2_fn soft,
346              f32_check_fn pre, f32_check_fn post,
347              f32_check_fn fast_test, soft_f32_op2_fn fast_op)
348 {
349     union_float32 ua, ub, ur;
350 
351     ua.s = xa;
352     ub.s = xb;
353 
354     if (unlikely(!can_use_fpu(s))) {
355         goto soft;
356     }
357 
358     float32_input_flush2(&ua.s, &ub.s, s);
359     if (unlikely(!pre(ua, ub))) {
360         goto soft;
361     }
362     if (fast_test && fast_test(ua, ub)) {
363         return fast_op(ua.s, ub.s, s);
364     }
365 
366     ur.h = hard(ua.h, ub.h);
367     if (unlikely(f32_is_inf(ur))) {
368         s->float_exception_flags |= float_flag_overflow;
369     } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
370         if (post == NULL || post(ua, ub)) {
371             goto soft;
372         }
373     }
374     return ur.s;
375 
376  soft:
377     return soft(ua.s, ub.s, s);
378 }
379 
380 static inline float64
381 float64_gen2(float64 xa, float64 xb, float_status *s,
382              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
383              f64_check_fn pre, f64_check_fn post,
384              f64_check_fn fast_test, soft_f64_op2_fn fast_op)
385 {
386     union_float64 ua, ub, ur;
387 
388     ua.s = xa;
389     ub.s = xb;
390 
391     if (unlikely(!can_use_fpu(s))) {
392         goto soft;
393     }
394 
395     float64_input_flush2(&ua.s, &ub.s, s);
396     if (unlikely(!pre(ua, ub))) {
397         goto soft;
398     }
399     if (fast_test && fast_test(ua, ub)) {
400         return fast_op(ua.s, ub.s, s);
401     }
402 
403     ur.h = hard(ua.h, ub.h);
404     if (unlikely(f64_is_inf(ur))) {
405         s->float_exception_flags |= float_flag_overflow;
406     } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
407         if (post == NULL || post(ua, ub)) {
408             goto soft;
409         }
410     }
411     return ur.s;
412 
413  soft:
414     return soft(ua.s, ub.s, s);
415 }
416 
417 /*----------------------------------------------------------------------------
418 | Returns the fraction bits of the half-precision floating-point value `a'.
419 *----------------------------------------------------------------------------*/
420 
421 static inline uint32_t extractFloat16Frac(float16 a)
422 {
423     return float16_val(a) & 0x3ff;
424 }
425 
426 /*----------------------------------------------------------------------------
427 | Returns the exponent bits of the half-precision floating-point value `a'.
428 *----------------------------------------------------------------------------*/
429 
430 static inline int extractFloat16Exp(float16 a)
431 {
432     return (float16_val(a) >> 10) & 0x1f;
433 }
434 
435 /*----------------------------------------------------------------------------
436 | Returns the fraction bits of the single-precision floating-point value `a'.
437 *----------------------------------------------------------------------------*/
438 
439 static inline uint32_t extractFloat32Frac(float32 a)
440 {
441     return float32_val(a) & 0x007FFFFF;
442 }
443 
444 /*----------------------------------------------------------------------------
445 | Returns the exponent bits of the single-precision floating-point value `a'.
446 *----------------------------------------------------------------------------*/
447 
448 static inline int extractFloat32Exp(float32 a)
449 {
450     return (float32_val(a) >> 23) & 0xFF;
451 }
452 
453 /*----------------------------------------------------------------------------
454 | Returns the sign bit of the single-precision floating-point value `a'.
455 *----------------------------------------------------------------------------*/
456 
457 static inline flag extractFloat32Sign(float32 a)
458 {
459     return float32_val(a) >> 31;
460 }
461 
462 /*----------------------------------------------------------------------------
463 | Returns the fraction bits of the double-precision floating-point value `a'.
464 *----------------------------------------------------------------------------*/
465 
466 static inline uint64_t extractFloat64Frac(float64 a)
467 {
468     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
469 }
470 
471 /*----------------------------------------------------------------------------
472 | Returns the exponent bits of the double-precision floating-point value `a'.
473 *----------------------------------------------------------------------------*/
474 
475 static inline int extractFloat64Exp(float64 a)
476 {
477     return (float64_val(a) >> 52) & 0x7FF;
478 }
479 
480 /*----------------------------------------------------------------------------
481 | Returns the sign bit of the double-precision floating-point value `a'.
482 *----------------------------------------------------------------------------*/
483 
484 static inline flag extractFloat64Sign(float64 a)
485 {
486     return float64_val(a) >> 63;
487 }
488 
489 /*
490  * Classify a floating point number. Everything above float_class_qnan
491  * is a NaN so cls >= float_class_qnan is any NaN.
492  */
493 
/*
 * The numeric order matters: both NaN classes sit at the top so that a
 * single ">= float_class_qnan" comparison recognises any NaN.
 */
typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified = 0,
    float_class_zero         = 1,
    float_class_normal       = 2,
    float_class_inf          = 3,
    float_class_qnan         = 4,  /* all NaNs from here */
    float_class_snan         = 5,
} FloatClass;
502 
503 /* Simple helpers for checking if, or what kind of, NaN we have */
/*
 * True for either NaN class. Relies on the NaN classes being the highest
 * FloatClass values; the result is hinted as unlikely.
 */
static inline __attribute__((unused)) bool is_nan(FloatClass c)
{
    return unlikely(c >= float_class_qnan);
}
508 
/* True only for the signaling NaN class. */
static inline __attribute__((unused)) bool is_snan(FloatClass c)
{
    return c == float_class_snan;
}
513 
/* True only for the quiet NaN class. */
static inline __attribute__((unused)) bool is_qnan(FloatClass c)
{
    return c == float_class_qnan;
}
518 
519 /*
520  * Structure holding all of the decomposed parts of a float. The
521  * exponent is unbiased and the fraction is normalized. All
522  * calculations are done with a 64 bit fraction and then rounded as
523  * appropriate for the final format.
524  *
525  * Thanks to the packed FloatClass a decent compiler should be able to
526  * fit the whole structure into registers and avoid using the stack
527  * for parameter passing.
528  */
529 
typedef struct {
    /*
     * Fraction: the raw field right after unpack_raw(); once canonicalized,
     * a normal number carries its implicit bit at DECOMPOSED_BINARY_POINT.
     */
    uint64_t frac;
    /* Exponent: the raw biased field after unpack_raw(), unbiased once
     * canonicalized. */
    int32_t  exp;
    /* Classification; float_class_unclassified until canonicalized. */
    FloatClass cls;
    /* Sign bit: true for negative. */
    bool sign;
} FloatParts;
536 
/*
 * Layout of a canonical 64-bit fraction: the implicit bit sits at the binary
 * point, and the single bit above it catches a carry out of rounding.
 */
#define DECOMPOSED_BINARY_POINT    62
#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
#define DECOMPOSED_OVERFLOW_BIT    (1ull << (DECOMPOSED_BINARY_POINT + 1))
540 
541 /* Structure holding all of the relevant parameters for a format.
542  *   exp_size: the size of the exponent field
543  *   exp_bias: the offset applied to the exponent field
544  *   exp_max: the maximum normalised exponent
545  *   frac_size: the size of the fraction field
546  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction
548  *   frac_lsb: least significant bit of fraction
549  *   frac_lsbm1: the bit below the least significant bit (for rounding)
550  *   round_mask/roundeven_mask: masks used for rounding
551  * The following optional modifiers are available:
552  *   arm_althp: handle ARM Alternative Half Precision
553  */
typedef struct {
    /* Field widths and derived exponent/shift parameters. */
    int exp_size, exp_bias, exp_max, frac_size, frac_shift;
    /* Bit masks derived from the fraction size, used for rounding. */
    uint64_t frac_lsb, frac_lsbm1, round_mask, roundeven_mask;
    /* Optional modifier: ARM Alternative Half Precision handling. */
    bool arm_althp;
} FloatFmt;
566 
/*
 * Expand the FloatFmt fields that follow from the exponent width E and the
 * fraction width F (both in bits). The expansion is a list of designated
 * initializers meant to be placed inside a FloatFmt initializer.
 *
 * E and F are parenthesized at every use so that expression arguments
 * (e.g. "4 + 6") expand with the intended precedence.
 */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size       = (E),                                           \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                         \
    .exp_max        = (1 << (E)) - 1,                                \
    .frac_size      = (F),                                           \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                 \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),       \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1), \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1, \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1
578 
/* Half precision: 5 exponent bits, 10 fraction bits. */
static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

/* Same field widths as float16_params, with the ARM Alt HP modifier set. */
static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

/* Single precision: 8 exponent bits, 23 fraction bits. */
static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

/* Double precision: 11 exponent bits, 52 fraction bits. */
static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};
595 
596 /* Unpack a float to parts, but do not canonicalize.  */
597 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
598 {
599     const int sign_pos = fmt.frac_size + fmt.exp_size;
600 
601     return (FloatParts) {
602         .cls = float_class_unclassified,
603         .sign = extract64(raw, sign_pos, 1),
604         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
605         .frac = extract64(raw, 0, fmt.frac_size),
606     };
607 }
608 
/* Split a half-precision value into raw parts using float16_params. */
static inline FloatParts float16_unpack_raw(float16 f)
{
    return unpack_raw(float16_params, f);
}
613 
/* Split a single-precision value into raw parts using float32_params. */
static inline FloatParts float32_unpack_raw(float32 f)
{
    return unpack_raw(float32_params, f);
}
618 
/* Split a double-precision value into raw parts using float64_params. */
static inline FloatParts float64_unpack_raw(float64 f)
{
    return unpack_raw(float64_params, f);
}
623 
624 /* Pack a float from parts, but do not canonicalize.  */
625 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
626 {
627     const int sign_pos = fmt.frac_size + fmt.exp_size;
628     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
629     return deposit64(ret, sign_pos, 1, p.sign);
630 }
631 
/* Reassemble raw parts into a half-precision value using float16_params. */
static inline float16 float16_pack_raw(FloatParts p)
{
    return make_float16(pack_raw(float16_params, p));
}
636 
/* Reassemble raw parts into a single-precision value using float32_params. */
static inline float32 float32_pack_raw(FloatParts p)
{
    return make_float32(pack_raw(float32_params, p));
}
641 
/* Reassemble raw parts into a double-precision value using float64_params. */
static inline float64 float64_pack_raw(FloatParts p)
{
    return make_float64(pack_raw(float64_params, p));
}
646 
647 /*----------------------------------------------------------------------------
648 | Functions and definitions to determine:  (1) whether tininess for underflow
649 | is detected before or after rounding by default, (2) what (if anything)
650 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
651 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
652 | are propagated from function inputs to output.  These details are target-
653 | specific.
654 *----------------------------------------------------------------------------*/
655 #include "softfloat-specialize.h"
656 
657 /* Canonicalize EXP and FRAC, setting CLS.  */
658 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
659                                   float_status *status)
660 {
661     if (part.exp == parm->exp_max && !parm->arm_althp) {
662         if (part.frac == 0) {
663             part.cls = float_class_inf;
664         } else {
665             part.frac <<= parm->frac_shift;
666             part.cls = (parts_is_snan_frac(part.frac, status)
667                         ? float_class_snan : float_class_qnan);
668         }
669     } else if (part.exp == 0) {
670         if (likely(part.frac == 0)) {
671             part.cls = float_class_zero;
672         } else if (status->flush_inputs_to_zero) {
673             float_raise(float_flag_input_denormal, status);
674             part.cls = float_class_zero;
675             part.frac = 0;
676         } else {
677             int shift = clz64(part.frac) - 1;
678             part.cls = float_class_normal;
679             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
680             part.frac <<= shift;
681         }
682     } else {
683         part.cls = float_class_normal;
684         part.exp -= parm->exp_bias;
685         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
686     }
687     return part;
688 }
689 
/*
 * Round and uncanonicalize a floating-point number by parts. There
 * are FRAC_SHIFT bits that may require rounding at the bottom of the
 * fraction; these bits will be removed. The exponent will be biased
 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 *
 * @p:    canonical parts to round; p.cls selects the handling below.
 * @s:    supplies the rounding mode, flush_to_zero and the tininess
 *        detection mode. Exception flags accumulated here are reported
 *        through float_raise() just before returning.
 * @parm: description of the destination format.
 *
 * Returns @p with exp and frac rewritten in the destination format's
 * raw form (biased exponent, fraction shifted down by frac_shift).
 * p.cls is updated when rounding turns a normal number into zero or
 * infinity.
 */

static FloatParts round_canonical(FloatParts p, float_status *s,
                                  const FloatFmt *parm)
{
    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
    const uint64_t round_mask = parm->round_mask;
    const uint64_t roundeven_mask = parm->roundeven_mask;
    const int exp_max = parm->exp_max;
    const int frac_shift = parm->frac_shift;
    /* inc is the rounding increment added to frac before truncation. */
    uint64_t frac, inc;
    int exp, flags = 0;
    /* On overflow: true selects the largest normal, false selects infinity. */
    bool overflow_norm;

    frac = p.frac;
    exp = p.exp;

    switch (p.cls) {
    case float_class_normal:
        switch (s->float_rounding_mode) {
        case float_round_nearest_even:
            overflow_norm = false;
            /*
             * Add half an lsb, except on an exact tie whose lsb is already
             * even (discarded bits == half, lsb clear): that rounds down.
             */
            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
            break;
        case float_round_ties_away:
            overflow_norm = false;
            inc = frac_lsbm1;
            break;
        case float_round_to_zero:
            overflow_norm = true;
            inc = 0;
            break;
        case float_round_up:
            /* Towards +inf: only positive values are bumped up. */
            inc = p.sign ? 0 : round_mask;
            overflow_norm = p.sign;
            break;
        case float_round_down:
            /* Towards -inf: only negative values are bumped in magnitude. */
            inc = p.sign ? round_mask : 0;
            overflow_norm = !p.sign;
            break;
        default:
            g_assert_not_reached();
        }

        exp += parm->exp_bias;
        if (likely(exp > 0)) {
            /* Biased exponent is positive: the result is not denormal. */
            if (frac & round_mask) {
                flags |= float_flag_inexact;
                frac += inc;
                /* Rounding carried past the implicit bit: renormalize. */
                if (frac & DECOMPOSED_OVERFLOW_BIT) {
                    frac >>= 1;
                    exp++;
                }
            }
            frac >>= frac_shift;

            if (parm->arm_althp) {
                /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
                if (unlikely(exp > exp_max)) {
                    /* Overflow.  Return the maximum normal.  */
                    /* Note: this replaces, rather than adds to, the flags. */
                    flags = float_flag_invalid;
                    exp = exp_max;
                    frac = -1;
                }
            } else if (unlikely(exp >= exp_max)) {
                flags |= float_flag_overflow | float_flag_inexact;
                if (overflow_norm) {
                    /* Largest finite value: top exponent, all-ones frac. */
                    exp = exp_max - 1;
                    frac = -1;
                } else {
                    p.cls = float_class_inf;
                    goto do_inf;
                }
            }
        } else if (s->flush_to_zero) {
            /* Denormal result with flush-to-zero: return zero instead. */
            flags |= float_flag_output_denormal;
            p.cls = float_class_zero;
            goto do_zero;
        } else {
            /*
             * Denormal result. It counts as tiny if tininess is detected
             * before rounding, if the biased exponent is negative, or if
             * rounding at full precision does not carry into the overflow
             * bit.
             */
            bool is_tiny = (s->float_detect_tininess
                            == float_tininess_before_rounding)
                        || (exp < 0)
                        || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);

            /* Denormalize: shift right by 1 - exp (helper defined elsewhere). */
            shift64RightJamming(frac, 1 - exp, &frac);
            if (frac & round_mask) {
                /* Need to recompute round-to-even.  */
                if (s->float_rounding_mode == float_round_nearest_even) {
                    inc = ((frac & roundeven_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                }
                flags |= float_flag_inexact;
                frac += inc;
            }

            /* A carry into the implicit bit gives exponent field 1, else 0. */
            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
            frac >>= frac_shift;

            /* Underflow is raised only when the result is tiny and inexact. */
            if (is_tiny && (flags & float_flag_inexact)) {
                flags |= float_flag_underflow;
            }
            if (exp == 0 && frac == 0) {
                p.cls = float_class_zero;
            }
        }
        break;

    case float_class_zero:
    do_zero:
        exp = 0;
        frac = 0;
        break;

    case float_class_inf:
    do_inf:
        /* Alt HP has no infinity encoding, so it must never get here. */
        assert(!parm->arm_althp);
        exp = exp_max;
        frac = 0;
        break;

    case float_class_qnan:
    case float_class_snan:
        /* Alt HP has no NaN encoding either. */
        assert(!parm->arm_althp);
        exp = exp_max;
        /* Shift the payload back down; no rounding is applied to NaNs. */
        frac >>= parm->frac_shift;
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    p.exp = exp;
    p.frac = frac;
    return p;
}
830 
831 /* Explicit FloatFmt version */
832 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
833                                             const FloatFmt *params)
834 {
835     return sf_canonicalize(float16_unpack_raw(f), params, s);
836 }
837 
838 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
839 {
840     return float16a_unpack_canonical(f, s, &float16_params);
841 }
842 
843 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
844                                              const FloatFmt *params)
845 {
846     return float16_pack_raw(round_canonical(p, s, params));
847 }
848 
849 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
850 {
851     return float16a_round_pack_canonical(p, s, &float16_params);
852 }
853 
854 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
855 {
856     return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
857 }
858 
859 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
860 {
861     return float32_pack_raw(round_canonical(p, s, &float32_params));
862 }
863 
864 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
865 {
866     return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
867 }
868 
869 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
870 {
871     return float64_pack_raw(round_canonical(p, s, &float64_params));
872 }
873 
874 static FloatParts return_nan(FloatParts a, float_status *s)
875 {
876     switch (a.cls) {
877     case float_class_snan:
878         s->float_exception_flags |= float_flag_invalid;
879         a = parts_silence_nan(a, s);
880         /* fall through */
881     case float_class_qnan:
882         if (s->default_nan_mode) {
883             return parts_default_nan(s);
884         }
885         break;
886 
887     default:
888         g_assert_not_reached();
889     }
890     return a;
891 }
892 
893 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
894 {
895     if (is_snan(a.cls) || is_snan(b.cls)) {
896         s->float_exception_flags |= float_flag_invalid;
897     }
898 
899     if (s->default_nan_mode) {
900         return parts_default_nan(s);
901     } else {
902         if (pickNaN(a.cls, b.cls,
903                     a.frac > b.frac ||
904                     (a.frac == b.frac && a.sign < b.sign))) {
905             a = b;
906         }
907         if (is_snan(a.cls)) {
908             return parts_silence_nan(a, s);
909         }
910     }
911     return a;
912 }
913 
914 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
915                                   bool inf_zero, float_status *s)
916 {
917     int which;
918 
919     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
920         s->float_exception_flags |= float_flag_invalid;
921     }
922 
923     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
924 
925     if (s->default_nan_mode) {
926         /* Note that this check is after pickNaNMulAdd so that function
927          * has an opportunity to set the Invalid flag.
928          */
929         which = 3;
930     }
931 
932     switch (which) {
933     case 0:
934         break;
935     case 1:
936         a = b;
937         break;
938     case 2:
939         a = c;
940         break;
941     case 3:
942         return parts_default_nan(s);
943     default:
944         g_assert_not_reached();
945     }
946 
947     if (is_snan(a.cls)) {
948         return parts_silence_nan(a, s);
949     }
950     return a;
951 }
952 
/*
 * Returns the result of adding or subtracting the values of the
 * floating-point values `a' and `b'. The operation is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic.
 *
 * `a' and `b' are canonical (unpacked) operands.  When `subtract' is
 * true the sign of `b' is inverted before the operands are combined.
 * The result is returned unrounded; callers pass it to a
 * *_round_pack_canonical() routine.  Exception flags are recorded in `s'.
 */

static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
                                float_status *s)
{
    bool a_sign = a.sign;
    /* Effective sign of b once the requested operation is folded in. */
    bool b_sign = b.sign ^ subtract;

    if (a_sign != b_sign) {
        /* Subtraction */

        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /*
             * Subtract the smaller magnitude from the larger one so the
             * fraction difference never goes negative; the smaller
             * operand is first aligned to the larger exponent.
             */
            if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
                a.frac = a.frac - b.frac;
            } else {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.frac = b.frac - a.frac;
                a.exp = b.exp;
                /* |b| > |a|: the result takes the opposite sign of a. */
                a_sign ^= 1;
            }

            if (a.frac == 0) {
                /* Exact cancellation: zero is negative only when
                 * rounding down. */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
            } else {
                /* Renormalize, leaving exactly one zero bit above the
                 * leading one. */
                int shift = clz64(a.frac) - 1;
                a.frac = a.frac << shift;
                a.exp = a.exp - shift;
                a.sign = a_sign;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf) {
            if (b.cls == float_class_inf) {
                /* Inf - Inf is an invalid operation. */
                float_raise(float_flag_invalid, s);
                return parts_default_nan(s);
            }
            return a;
        }
        if (a.cls == float_class_zero && b.cls == float_class_zero) {
            /* Zeros of opposite effective sign: the sign depends only on
             * the rounding mode. */
            a.sign = s->float_rounding_mode == float_round_down;
            return a;
        }
        if (a.cls == float_class_zero || b.cls == float_class_inf) {
            /* Result is b with its effective sign; in this branch
             * a_sign ^ 1 equals b_sign. */
            b.sign = a_sign ^ 1;
            return b;
        }
        if (b.cls == float_class_zero) {
            return a;
        }
    } else {
        /* Addition */
        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Align the operand with the smaller exponent to the larger. */
            if (a.exp > b.exp) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
            } else if (a.exp < b.exp) {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.exp = b.exp;
            }
            a.frac += b.frac;
            if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
                /* Carry out of the fraction: shift back down by one and
                 * bump the exponent. */
                shift64RightJamming(a.frac, 1, &a.frac);
                a.exp += 1;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf || b.cls == float_class_zero) {
            return a;
        }
        if (b.cls == float_class_inf || a.cls == float_class_zero) {
            b.sign = b_sign;
            return b;
        }
    }
    /* Every combination of operand classes is handled above. */
    g_assert_not_reached();
}
1041 
1042 /*
1043  * Returns the result of adding or subtracting the floating-point
1044  * values `a' and `b'. The operation is performed according to the
1045  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1046  */
1047 
1048 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1049 {
1050     FloatParts pa = float16_unpack_canonical(a, status);
1051     FloatParts pb = float16_unpack_canonical(b, status);
1052     FloatParts pr = addsub_floats(pa, pb, false, status);
1053 
1054     return float16_round_pack_canonical(pr, status);
1055 }
1056 
1057 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1058 {
1059     FloatParts pa = float16_unpack_canonical(a, status);
1060     FloatParts pb = float16_unpack_canonical(b, status);
1061     FloatParts pr = addsub_floats(pa, pb, true, status);
1062 
1063     return float16_round_pack_canonical(pr, status);
1064 }
1065 
1066 static float32 QEMU_SOFTFLOAT_ATTR
1067 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1068 {
1069     FloatParts pa = float32_unpack_canonical(a, status);
1070     FloatParts pb = float32_unpack_canonical(b, status);
1071     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1072 
1073     return float32_round_pack_canonical(pr, status);
1074 }
1075 
1076 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1077 {
1078     return soft_f32_addsub(a, b, false, status);
1079 }
1080 
1081 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1082 {
1083     return soft_f32_addsub(a, b, true, status);
1084 }
1085 
1086 static float64 QEMU_SOFTFLOAT_ATTR
1087 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1088 {
1089     FloatParts pa = float64_unpack_canonical(a, status);
1090     FloatParts pb = float64_unpack_canonical(b, status);
1091     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1092 
1093     return float64_round_pack_canonical(pr, status);
1094 }
1095 
1096 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1097 {
1098     return soft_f64_addsub(a, b, false, status);
1099 }
1100 
1101 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1102 {
1103     return soft_f64_addsub(a, b, true, status);
1104 }
1105 
/* Host single-precision addition: returns a + b. */
static float hard_f32_add(float a, float b)
{
    float sum = a + b;

    return sum;
}
1110 
/* Host single-precision subtraction: returns a - b. */
static float hard_f32_sub(float a, float b)
{
    float diff = a - b;

    return diff;
}
1115 
/* Host double-precision addition: returns a + b. */
static double hard_f64_add(double a, double b)
{
    double sum = a + b;

    return sum;
}
1120 
/* Host double-precision subtraction: returns a - b. */
static double hard_f64_sub(double a, double b)
{
    double diff = a - b;

    return diff;
}
1125 
1126 static bool f32_addsub_post(union_float32 a, union_float32 b)
1127 {
1128     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1129         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1130     }
1131     return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1132 }
1133 
1134 static bool f64_addsub_post(union_float64 a, union_float64 b)
1135 {
1136     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1137         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1138     } else {
1139         return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1140     }
1141 }
1142 
1143 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1144                               hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1145 {
1146     return float32_gen2(a, b, s, hard, soft,
1147                         f32_is_zon2, f32_addsub_post, NULL, NULL);
1148 }
1149 
1150 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1151                               hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1152 {
1153     return float64_gen2(a, b, s, hard, soft,
1154                         f64_is_zon2, f64_addsub_post, NULL, NULL);
1155 }
1156 
1157 float32 QEMU_FLATTEN
1158 float32_add(float32 a, float32 b, float_status *s)
1159 {
1160     return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1161 }
1162 
1163 float32 QEMU_FLATTEN
1164 float32_sub(float32 a, float32 b, float_status *s)
1165 {
1166     return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1167 }
1168 
1169 float64 QEMU_FLATTEN
1170 float64_add(float64 a, float64 b, float_status *s)
1171 {
1172     return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1173 }
1174 
1175 float64 QEMU_FLATTEN
1176 float64_sub(float64 a, float64 b, float_status *s)
1177 {
1178     return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1179 }
1180 
1181 /*
1182  * Returns the result of multiplying the floating-point values `a' and
1183  * `b'. The operation is performed according to the IEC/IEEE Standard
1184  * for Binary Floating-Point Arithmetic.
1185  */
1186 
1187 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
1188 {
1189     bool sign = a.sign ^ b.sign;
1190 
1191     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1192         uint64_t hi, lo;
1193         int exp = a.exp + b.exp;
1194 
1195         mul64To128(a.frac, b.frac, &hi, &lo);
1196         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1197         if (lo & DECOMPOSED_OVERFLOW_BIT) {
1198             shift64RightJamming(lo, 1, &lo);
1199             exp += 1;
1200         }
1201 
1202         /* Re-use a */
1203         a.exp = exp;
1204         a.sign = sign;
1205         a.frac = lo;
1206         return a;
1207     }
1208     /* handle all the NaN cases */
1209     if (is_nan(a.cls) || is_nan(b.cls)) {
1210         return pick_nan(a, b, s);
1211     }
1212     /* Inf * Zero == NaN */
1213     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1214         (a.cls == float_class_zero && b.cls == float_class_inf)) {
1215         s->float_exception_flags |= float_flag_invalid;
1216         return parts_default_nan(s);
1217     }
1218     /* Multiply by 0 or Inf */
1219     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1220         a.sign = sign;
1221         return a;
1222     }
1223     if (b.cls == float_class_inf || b.cls == float_class_zero) {
1224         b.sign = sign;
1225         return b;
1226     }
1227     g_assert_not_reached();
1228 }
1229 
1230 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1231 {
1232     FloatParts pa = float16_unpack_canonical(a, status);
1233     FloatParts pb = float16_unpack_canonical(b, status);
1234     FloatParts pr = mul_floats(pa, pb, status);
1235 
1236     return float16_round_pack_canonical(pr, status);
1237 }
1238 
1239 static float32 QEMU_SOFTFLOAT_ATTR
1240 soft_f32_mul(float32 a, float32 b, float_status *status)
1241 {
1242     FloatParts pa = float32_unpack_canonical(a, status);
1243     FloatParts pb = float32_unpack_canonical(b, status);
1244     FloatParts pr = mul_floats(pa, pb, status);
1245 
1246     return float32_round_pack_canonical(pr, status);
1247 }
1248 
1249 static float64 QEMU_SOFTFLOAT_ATTR
1250 soft_f64_mul(float64 a, float64 b, float_status *status)
1251 {
1252     FloatParts pa = float64_unpack_canonical(a, status);
1253     FloatParts pb = float64_unpack_canonical(b, status);
1254     FloatParts pr = mul_floats(pa, pb, status);
1255 
1256     return float64_round_pack_canonical(pr, status);
1257 }
1258 
/* Host single-precision multiplication: returns a * b. */
static float hard_f32_mul(float a, float b)
{
    float prod = a * b;

    return prod;
}
1263 
/* Host double-precision multiplication: returns a * b. */
static double hard_f64_mul(double a, double b)
{
    double prod = a * b;

    return prod;
}
1268 
1269 static bool f32_mul_fast_test(union_float32 a, union_float32 b)
1270 {
1271     return float32_is_zero(a.s) || float32_is_zero(b.s);
1272 }
1273 
1274 static bool f64_mul_fast_test(union_float64 a, union_float64 b)
1275 {
1276     return float64_is_zero(a.s) || float64_is_zero(b.s);
1277 }
1278 
1279 static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s)
1280 {
1281     bool signbit = float32_is_neg(a) ^ float32_is_neg(b);
1282 
1283     return float32_set_sign(float32_zero, signbit);
1284 }
1285 
1286 static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s)
1287 {
1288     bool signbit = float64_is_neg(a) ^ float64_is_neg(b);
1289 
1290     return float64_set_sign(float64_zero, signbit);
1291 }
1292 
1293 float32 QEMU_FLATTEN
1294 float32_mul(float32 a, float32 b, float_status *s)
1295 {
1296     return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1297                         f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op);
1298 }
1299 
1300 float64 QEMU_FLATTEN
1301 float64_mul(float64 a, float64 b, float_status *s)
1302 {
1303     return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1304                         f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op);
1305 }
1306 
/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b' then adding 'c', with no intermediate rounding step after the
 * multiplication. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 * The flags argument allows the caller to select negation of the
 * addend, the intermediate product, or the final result. (The
 * difference between this and having the caller do a separate
 * negation is that negating externally will flip the sign bit on
 * NaNs.)
 *
 * Flag bits examined here: float_muladd_negate_c,
 * float_muladd_negate_product, float_muladd_negate_result and
 * float_muladd_halve_result (which decrements the result exponent by
 * one).  The returned parts are unrounded; exception flags are recorded
 * in `s'.
 */

static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
                                int flags, float_status *s)
{
    /* True when one of a, b is an infinity and the other a zero. */
    bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
                    ((1 << float_class_inf) | (1 << float_class_zero));
    bool p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
        return pick_nan_muladd(a, b, c, inf_zero, s);
    }

    /* Inf * 0 with a non-NaN addend is always invalid. */
    if (inf_zero) {
        s->float_exception_flags |= float_flag_invalid;
        return parts_default_nan(s);
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Classify the product a * b (NaNs and Inf * 0 are already handled). */
    if (a.cls == float_class_inf || b.cls == float_class_inf) {
        p_class = float_class_inf;
    } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    if (c.cls == float_class_inf) {
        if (p_class == float_class_inf && p_sign != c.sign) {
            /* Infinities of opposite sign: invalid operation. */
            s->float_exception_flags |= float_flag_invalid;
            return parts_default_nan(s);
        } else {
            a.cls = float_class_inf;
            a.sign = c.sign ^ sign_flip;
            return a;
        }
    }

    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    if (p_class == float_class_zero) {
        /* Zero product: the result is the addend, with zero-sign rules. */
        if (c.cls == float_class_zero) {
            if (p_sign != c.sign) {
                /* Zeros of opposite sign: negative only when rounding
                 * down. */
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
     * result.
     */
    mul64To128(a.frac, b.frac, &hi, &lo);
    /* binary point now at bit 124 */

    /* check for overflow */
    if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
        shift128RightJamming(hi, lo, 1, &hi, &lo);
        p_exp += 1;
    }

    /* + add/sub */
    if (c.cls == float_class_zero) {
        /* move binary point back to 62 */
        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
    } else {
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                /* The addend has the larger (or equal) exponent: move the
                 * product down to binary point 62 and to c's exponent in
                 * a single shift, then add in 64 bits. */
                shift128RightJamming(hi, lo,
                                     DECOMPOSED_BINARY_POINT - exp_diff,
                                     &hi, &lo);
                lo += c.frac;
                p_exp = c.exp;
            } else {
                uint64_t c_hi, c_lo;
                /* shift c to the same binary point as the product (124) */
                c_hi = c.frac >> 2;
                c_lo = 0;
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                add128(hi, lo, c_hi, c_lo, &hi, &lo);
                /* move binary point back to 62 */
                shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
            }

            /* A carry out of the sum needs one more right shift. */
            if (lo & DECOMPOSED_OVERFLOW_BIT) {
                shift64RightJamming(lo, 1, &lo);
                p_exp += 1;
            }

        } else {
            /* Subtraction */
            uint64_t c_hi, c_lo;
            /* make C binary point match product at bit 124 */
            c_hi = c.frac >> 2;
            c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    /* Equal exponents and product >= addend. */
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    /* Otherwise compute c - product; the result takes
                     * c's sign and exponent. */
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            if (hi == 0 && lo == 0) {
                /* Exact cancellation: the zero's sign depends on the
                 * rounding mode, then on the result-negation flag. */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                int shift;
                /* Count leading zeros across the 128-bit difference. */
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Normalizing to a binary point of 124 is the
                   correct adjust for the exponent.  However since we're
                   shifting, we might as well put the binary point back
                   at 62 where we really want it.  Therefore shift as
                   if we're leaving 1 bit at the top of the word, but
                   adjust the exponent as if we're leaving 3 bits.  */
                shift -= 1;
                if (shift >= 64) {
                    lo = lo << (shift - 64);
                } else {
                    hi = (hi << shift) | (lo >> (64 - shift));
                    /* Fold any bits shifted out of view into a sticky
                     * lsb. */
                    lo = hi | ((lo << shift) != 0);
                }
                p_exp -= shift - 2;
            }
        }
    }

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = lo;

    return a;
}
1509 
1510 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1511                                                 int flags, float_status *status)
1512 {
1513     FloatParts pa = float16_unpack_canonical(a, status);
1514     FloatParts pb = float16_unpack_canonical(b, status);
1515     FloatParts pc = float16_unpack_canonical(c, status);
1516     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1517 
1518     return float16_round_pack_canonical(pr, status);
1519 }
1520 
1521 static float32 QEMU_SOFTFLOAT_ATTR
1522 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1523                 float_status *status)
1524 {
1525     FloatParts pa = float32_unpack_canonical(a, status);
1526     FloatParts pb = float32_unpack_canonical(b, status);
1527     FloatParts pc = float32_unpack_canonical(c, status);
1528     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1529 
1530     return float32_round_pack_canonical(pr, status);
1531 }
1532 
1533 static float64 QEMU_SOFTFLOAT_ATTR
1534 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1535                 float_status *status)
1536 {
1537     FloatParts pa = float64_unpack_canonical(a, status);
1538     FloatParts pb = float64_unpack_canonical(b, status);
1539     FloatParts pc = float64_unpack_canonical(c, status);
1540     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1541 
1542     return float64_round_pack_canonical(pr, status);
1543 }
1544 
1545 float32 QEMU_FLATTEN
1546 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
1547 {
1548     union_float32 ua, ub, uc, ur;
1549 
1550     ua.s = xa;
1551     ub.s = xb;
1552     uc.s = xc;
1553 
1554     if (unlikely(!can_use_fpu(s))) {
1555         goto soft;
1556     }
1557     if (unlikely(flags & float_muladd_halve_result)) {
1558         goto soft;
1559     }
1560 
1561     float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
1562     if (unlikely(!f32_is_zon3(ua, ub, uc))) {
1563         goto soft;
1564     }
1565     /*
1566      * When (a || b) == 0, there's no need to check for under/over flow,
1567      * since we know the addend is (normal || 0) and the product is 0.
1568      */
1569     if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
1570         union_float32 up;
1571         bool prod_sign;
1572 
1573         prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
1574         prod_sign ^= !!(flags & float_muladd_negate_product);
1575         up.s = float32_set_sign(float32_zero, prod_sign);
1576 
1577         if (flags & float_muladd_negate_c) {
1578             uc.h = -uc.h;
1579         }
1580         ur.h = up.h + uc.h;
1581     } else {
1582         if (flags & float_muladd_negate_product) {
1583             ua.h = -ua.h;
1584         }
1585         if (flags & float_muladd_negate_c) {
1586             uc.h = -uc.h;
1587         }
1588 
1589         ur.h = fmaf(ua.h, ub.h, uc.h);
1590 
1591         if (unlikely(f32_is_inf(ur))) {
1592             s->float_exception_flags |= float_flag_overflow;
1593         } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
1594             goto soft;
1595         }
1596     }
1597     if (flags & float_muladd_negate_result) {
1598         return float32_chs(ur.s);
1599     }
1600     return ur.s;
1601 
1602  soft:
1603     return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
1604 }
1605 
1606 float64 QEMU_FLATTEN
1607 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1608 {
1609     union_float64 ua, ub, uc, ur;
1610 
1611     ua.s = xa;
1612     ub.s = xb;
1613     uc.s = xc;
1614 
1615     if (unlikely(!can_use_fpu(s))) {
1616         goto soft;
1617     }
1618     if (unlikely(flags & float_muladd_halve_result)) {
1619         goto soft;
1620     }
1621 
1622     float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1623     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1624         goto soft;
1625     }
1626     /*
1627      * When (a || b) == 0, there's no need to check for under/over flow,
1628      * since we know the addend is (normal || 0) and the product is 0.
1629      */
1630     if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1631         union_float64 up;
1632         bool prod_sign;
1633 
1634         prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1635         prod_sign ^= !!(flags & float_muladd_negate_product);
1636         up.s = float64_set_sign(float64_zero, prod_sign);
1637 
1638         if (flags & float_muladd_negate_c) {
1639             uc.h = -uc.h;
1640         }
1641         ur.h = up.h + uc.h;
1642     } else {
1643         if (flags & float_muladd_negate_product) {
1644             ua.h = -ua.h;
1645         }
1646         if (flags & float_muladd_negate_c) {
1647             uc.h = -uc.h;
1648         }
1649 
1650         ur.h = fma(ua.h, ub.h, uc.h);
1651 
1652         if (unlikely(f64_is_inf(ur))) {
1653             s->float_exception_flags |= float_flag_overflow;
1654         } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1655             goto soft;
1656         }
1657     }
1658     if (flags & float_muladd_negate_result) {
1659         return float64_chs(ur.s);
1660     }
1661     return ur.s;
1662 
1663  soft:
1664     return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1665 }
1666 
/*
 * Returns the result of dividing the floating-point value `a' by the
 * corresponding value `b'. The operation is performed according to
 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 *
 * `a' and `b' are canonical (unpacked) operands and the quotient is
 * returned unrounded.  Exception flags (invalid, divide-by-zero) are
 * recorded in `s'.
 */

static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t n0, n1, q, r;
        int exp = a.exp - b.exp;

        /*
         * We want a 2*N / N-bit division to produce exactly an N-bit
         * result, so that we do not lose any precision and so that we
         * do not have to renormalize afterward.  If A.frac < B.frac,
         * then division would produce an (N-1)-bit result; shift A left
         * by one to produce an N-bit result, and decrement the
         * exponent to match.
         *
         * The udiv_qrnnd algorithm that we're using requires normalization,
         * i.e. the msb of the denominator must be set.  Since we know that
         * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
         * by one (more), and the remainder must be shifted right by one.
         */
        if (a.frac < b.frac) {
            exp -= 1;
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
        } else {
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
        }
        q = udiv_qrnnd(&r, n1, n0, b.frac << 1);

        /*
         * Set lsb if there is a remainder, to set inexact.
         * As mentioned above, to find the actual value of the remainder we
         * would need to shift right, but (1) we are only concerned about
         * non-zero-ness, and (2) the remainder will always be even because
         * both inputs to the division primitive are even.
         */
        a.frac = q | (r != 0);
        a.sign = sign;
        a.exp = exp;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return pick_nan(a, b, s);
    }
    /* 0/0 or Inf/Inf */
    if (a.cls == b.cls
        &&
        (a.cls == float_class_inf || a.cls == float_class_zero)) {
        s->float_exception_flags |= float_flag_invalid;
        return parts_default_nan(s);
    }
    /* Inf / x or 0 / x */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    /* Div 0 => Inf */
    if (b.cls == float_class_zero) {
        s->float_exception_flags |= float_flag_divbyzero;
        a.cls = float_class_inf;
        a.sign = sign;
        return a;
    }
    /* Div by Inf */
    if (b.cls == float_class_inf) {
        a.cls = float_class_zero;
        a.sign = sign;
        return a;
    }
    /* Every combination of operand classes is handled above. */
    g_assert_not_reached();
}
1745 
1746 float16 float16_div(float16 a, float16 b, float_status *status)
1747 {
1748     FloatParts pa = float16_unpack_canonical(a, status);
1749     FloatParts pb = float16_unpack_canonical(b, status);
1750     FloatParts pr = div_floats(pa, pb, status);
1751 
1752     return float16_round_pack_canonical(pr, status);
1753 }
1754 
1755 static float32 QEMU_SOFTFLOAT_ATTR
1756 soft_f32_div(float32 a, float32 b, float_status *status)
1757 {
1758     FloatParts pa = float32_unpack_canonical(a, status);
1759     FloatParts pb = float32_unpack_canonical(b, status);
1760     FloatParts pr = div_floats(pa, pb, status);
1761 
1762     return float32_round_pack_canonical(pr, status);
1763 }
1764 
1765 static float64 QEMU_SOFTFLOAT_ATTR
1766 soft_f64_div(float64 a, float64 b, float_status *status)
1767 {
1768     FloatParts pa = float64_unpack_canonical(a, status);
1769     FloatParts pb = float64_unpack_canonical(b, status);
1770     FloatParts pr = div_floats(pa, pb, status);
1771 
1772     return float64_round_pack_canonical(pr, status);
1773 }
1774 
/* Host single-precision division; passed to float32_gen2() by float32_div(). */
static float hard_f32_div(float a, float b)
{
    const float quotient = a / b;

    return quotient;
}
1779 
/* Host double-precision division; passed to float64_gen2() by float64_div(). */
static double hard_f64_div(double a, double b)
{
    const double quotient = a / b;

    return quotient;
}
1784 
1785 static bool f32_div_pre(union_float32 a, union_float32 b)
1786 {
1787     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1788         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1789                fpclassify(b.h) == FP_NORMAL;
1790     }
1791     return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1792 }
1793 
1794 static bool f64_div_pre(union_float64 a, union_float64 b)
1795 {
1796     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1797         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1798                fpclassify(b.h) == FP_NORMAL;
1799     }
1800     return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1801 }
1802 
1803 static bool f32_div_post(union_float32 a, union_float32 b)
1804 {
1805     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1806         return fpclassify(a.h) != FP_ZERO;
1807     }
1808     return !float32_is_zero(a.s);
1809 }
1810 
1811 static bool f64_div_post(union_float64 a, union_float64 b)
1812 {
1813     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1814         return fpclassify(a.h) != FP_ZERO;
1815     }
1816     return !float64_is_zero(a.s);
1817 }
1818 
1819 float32 QEMU_FLATTEN
1820 float32_div(float32 a, float32 b, float_status *s)
1821 {
1822     return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1823                         f32_div_pre, f32_div_post, NULL, NULL);
1824 }
1825 
1826 float64 QEMU_FLATTEN
1827 float64_div(float64 a, float64 b, float_status *s)
1828 {
1829     return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1830                         f64_div_pre, f64_div_post, NULL, NULL);
1831 }
1832 
1833 /*
1834  * Float to Float conversions
1835  *
1836  * Returns the result of converting one float format to another. The
1837  * conversion is performed according to the IEC/IEEE Standard for
1838  * Binary Floating-Point Arithmetic.
1839  *
1840  * The float_to_float helper only needs to take care of raising
1841  * invalid exceptions and handling the conversion on NaNs.
1842  */
1843 
1844 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1845                                  float_status *s)
1846 {
1847     if (dstf->arm_althp) {
1848         switch (a.cls) {
1849         case float_class_qnan:
1850         case float_class_snan:
1851             /* There is no NaN in the destination format.  Raise Invalid
1852              * and return a zero with the sign of the input NaN.
1853              */
1854             s->float_exception_flags |= float_flag_invalid;
1855             a.cls = float_class_zero;
1856             a.frac = 0;
1857             a.exp = 0;
1858             break;
1859 
1860         case float_class_inf:
1861             /* There is no Inf in the destination format.  Raise Invalid
1862              * and return the maximum normal with the correct sign.
1863              */
1864             s->float_exception_flags |= float_flag_invalid;
1865             a.cls = float_class_normal;
1866             a.exp = dstf->exp_max;
1867             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1868             break;
1869 
1870         default:
1871             break;
1872         }
1873     } else if (is_nan(a.cls)) {
1874         if (is_snan(a.cls)) {
1875             s->float_exception_flags |= float_flag_invalid;
1876             a = parts_silence_nan(a, s);
1877         }
1878         if (s->default_nan_mode) {
1879             return parts_default_nan(s);
1880         }
1881     }
1882     return a;
1883 }
1884 
1885 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1886 {
1887     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1888     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1889     FloatParts pr = float_to_float(p, &float32_params, s);
1890     return float32_round_pack_canonical(pr, s);
1891 }
1892 
1893 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1894 {
1895     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1896     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1897     FloatParts pr = float_to_float(p, &float64_params, s);
1898     return float64_round_pack_canonical(pr, s);
1899 }
1900 
1901 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1902 {
1903     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1904     FloatParts p = float32_unpack_canonical(a, s);
1905     FloatParts pr = float_to_float(p, fmt16, s);
1906     return float16a_round_pack_canonical(pr, s, fmt16);
1907 }
1908 
1909 float64 float32_to_float64(float32 a, float_status *s)
1910 {
1911     FloatParts p = float32_unpack_canonical(a, s);
1912     FloatParts pr = float_to_float(p, &float64_params, s);
1913     return float64_round_pack_canonical(pr, s);
1914 }
1915 
1916 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1917 {
1918     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1919     FloatParts p = float64_unpack_canonical(a, s);
1920     FloatParts pr = float_to_float(p, fmt16, s);
1921     return float16a_round_pack_canonical(pr, s, fmt16);
1922 }
1923 
1924 float32 float64_to_float32(float64 a, float_status *s)
1925 {
1926     FloatParts p = float64_unpack_canonical(a, s);
1927     FloatParts pr = float_to_float(p, &float32_params, s);
1928     return float32_round_pack_canonical(pr, s);
1929 }
1930 
1931 /*
1932  * Rounds the floating-point value `a' to an integer, and returns the
1933  * result as a floating-point value. The operation is performed
1934  * according to the IEC/IEEE Standard for Binary Floating-Point
1935  * Arithmetic.
1936  */
1937 
1938 static FloatParts round_to_int(FloatParts a, int rmode,
1939                                int scale, float_status *s)
1940 {
1941     switch (a.cls) {
1942     case float_class_qnan:
1943     case float_class_snan:
1944         return return_nan(a, s);
1945 
1946     case float_class_zero:
1947     case float_class_inf:
1948         /* already "integral" */
1949         break;
1950 
1951     case float_class_normal:
1952         scale = MIN(MAX(scale, -0x10000), 0x10000);
1953         a.exp += scale;
1954 
1955         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1956             /* already integral */
1957             break;
1958         }
1959         if (a.exp < 0) {
1960             bool one;
1961             /* all fractional */
1962             s->float_exception_flags |= float_flag_inexact;
1963             switch (rmode) {
1964             case float_round_nearest_even:
1965                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1966                 break;
1967             case float_round_ties_away:
1968                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1969                 break;
1970             case float_round_to_zero:
1971                 one = false;
1972                 break;
1973             case float_round_up:
1974                 one = !a.sign;
1975                 break;
1976             case float_round_down:
1977                 one = a.sign;
1978                 break;
1979             default:
1980                 g_assert_not_reached();
1981             }
1982 
1983             if (one) {
1984                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1985                 a.exp = 0;
1986             } else {
1987                 a.cls = float_class_zero;
1988             }
1989         } else {
1990             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1991             uint64_t frac_lsbm1 = frac_lsb >> 1;
1992             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1993             uint64_t rnd_mask = rnd_even_mask >> 1;
1994             uint64_t inc;
1995 
1996             switch (rmode) {
1997             case float_round_nearest_even:
1998                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1999                 break;
2000             case float_round_ties_away:
2001                 inc = frac_lsbm1;
2002                 break;
2003             case float_round_to_zero:
2004                 inc = 0;
2005                 break;
2006             case float_round_up:
2007                 inc = a.sign ? 0 : rnd_mask;
2008                 break;
2009             case float_round_down:
2010                 inc = a.sign ? rnd_mask : 0;
2011                 break;
2012             default:
2013                 g_assert_not_reached();
2014             }
2015 
2016             if (a.frac & rnd_mask) {
2017                 s->float_exception_flags |= float_flag_inexact;
2018                 a.frac += inc;
2019                 a.frac &= ~rnd_mask;
2020                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
2021                     a.frac >>= 1;
2022                     a.exp++;
2023                 }
2024             }
2025         }
2026         break;
2027     default:
2028         g_assert_not_reached();
2029     }
2030     return a;
2031 }
2032 
2033 float16 float16_round_to_int(float16 a, float_status *s)
2034 {
2035     FloatParts pa = float16_unpack_canonical(a, s);
2036     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2037     return float16_round_pack_canonical(pr, s);
2038 }
2039 
2040 float32 float32_round_to_int(float32 a, float_status *s)
2041 {
2042     FloatParts pa = float32_unpack_canonical(a, s);
2043     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2044     return float32_round_pack_canonical(pr, s);
2045 }
2046 
2047 float64 float64_round_to_int(float64 a, float_status *s)
2048 {
2049     FloatParts pa = float64_unpack_canonical(a, s);
2050     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2051     return float64_round_pack_canonical(pr, s);
2052 }
2053 
2054 /*
2055  * Returns the result of converting the floating-point value `a' to
2056  * the two's complement integer format. The conversion is performed
2057  * according to the IEC/IEEE Standard for Binary Floating-Point
2058  * Arithmetic---which means in particular that the conversion is
2059  * rounded according to the current rounding mode. If `a' is a NaN,
2060  * the largest positive integer is returned. Otherwise, if the
2061  * conversion overflows, the largest integer with the same sign as `a'
2062  * is returned.
2063 */
2064 
2065 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
2066                                      int64_t min, int64_t max,
2067                                      float_status *s)
2068 {
2069     uint64_t r;
2070     int orig_flags = get_float_exception_flags(s);
2071     FloatParts p = round_to_int(in, rmode, scale, s);
2072 
2073     switch (p.cls) {
2074     case float_class_snan:
2075     case float_class_qnan:
2076         s->float_exception_flags = orig_flags | float_flag_invalid;
2077         return max;
2078     case float_class_inf:
2079         s->float_exception_flags = orig_flags | float_flag_invalid;
2080         return p.sign ? min : max;
2081     case float_class_zero:
2082         return 0;
2083     case float_class_normal:
2084         if (p.exp < DECOMPOSED_BINARY_POINT) {
2085             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2086         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2087             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2088         } else {
2089             r = UINT64_MAX;
2090         }
2091         if (p.sign) {
2092             if (r <= -(uint64_t) min) {
2093                 return -r;
2094             } else {
2095                 s->float_exception_flags = orig_flags | float_flag_invalid;
2096                 return min;
2097             }
2098         } else {
2099             if (r <= max) {
2100                 return r;
2101             } else {
2102                 s->float_exception_flags = orig_flags | float_flag_invalid;
2103                 return max;
2104             }
2105         }
2106     default:
2107         g_assert_not_reached();
2108     }
2109 }
2110 
2111 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
2112                                 float_status *s)
2113 {
2114     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2115                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2116 }
2117 
2118 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
2119                                 float_status *s)
2120 {
2121     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2122                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2123 }
2124 
2125 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
2126                                 float_status *s)
2127 {
2128     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2129                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2130 }
2131 
2132 int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale,
2133                                 float_status *s)
2134 {
2135     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2136                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2137 }
2138 
2139 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale,
2140                                 float_status *s)
2141 {
2142     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2143                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2144 }
2145 
2146 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale,
2147                                 float_status *s)
2148 {
2149     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2150                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2151 }
2152 
2153 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale,
2154                                 float_status *s)
2155 {
2156     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2157                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2158 }
2159 
2160 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale,
2161                                 float_status *s)
2162 {
2163     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2164                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2165 }
2166 
2167 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale,
2168                                 float_status *s)
2169 {
2170     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2171                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2172 }
2173 
2174 int16_t float16_to_int16(float16 a, float_status *s)
2175 {
2176     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2177 }
2178 
2179 int32_t float16_to_int32(float16 a, float_status *s)
2180 {
2181     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2182 }
2183 
2184 int64_t float16_to_int64(float16 a, float_status *s)
2185 {
2186     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2187 }
2188 
2189 int16_t float32_to_int16(float32 a, float_status *s)
2190 {
2191     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2192 }
2193 
2194 int32_t float32_to_int32(float32 a, float_status *s)
2195 {
2196     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2197 }
2198 
2199 int64_t float32_to_int64(float32 a, float_status *s)
2200 {
2201     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2202 }
2203 
2204 int16_t float64_to_int16(float64 a, float_status *s)
2205 {
2206     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2207 }
2208 
2209 int32_t float64_to_int32(float64 a, float_status *s)
2210 {
2211     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2212 }
2213 
2214 int64_t float64_to_int64(float64 a, float_status *s)
2215 {
2216     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2217 }
2218 
2219 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2220 {
2221     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2222 }
2223 
2224 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2225 {
2226     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2227 }
2228 
2229 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2230 {
2231     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2232 }
2233 
2234 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2235 {
2236     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2237 }
2238 
2239 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2240 {
2241     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2242 }
2243 
2244 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2245 {
2246     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2247 }
2248 
2249 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2250 {
2251     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2252 }
2253 
2254 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2255 {
2256     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2257 }
2258 
2259 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2260 {
2261     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2262 }
2263 
2264 /*
2265  *  Returns the result of converting the floating-point value `a' to
2266  *  the unsigned integer format. The conversion is performed according
2267  *  to the IEC/IEEE Standard for Binary Floating-Point
2268  *  Arithmetic---which means in particular that the conversion is
2269  *  rounded according to the current rounding mode. If `a' is a NaN,
2270  *  the largest unsigned integer is returned. Otherwise, if the
2271  *  conversion overflows, the largest unsigned integer is returned. If
 *  `a' is negative, the result is rounded and zero is returned;
 *  values that do not round to zero will raise the invalid exception
 *  flag.
2275  */
2276 
2277 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale,
2278                                        uint64_t max, float_status *s)
2279 {
2280     int orig_flags = get_float_exception_flags(s);
2281     FloatParts p = round_to_int(in, rmode, scale, s);
2282     uint64_t r;
2283 
2284     switch (p.cls) {
2285     case float_class_snan:
2286     case float_class_qnan:
2287         s->float_exception_flags = orig_flags | float_flag_invalid;
2288         return max;
2289     case float_class_inf:
2290         s->float_exception_flags = orig_flags | float_flag_invalid;
2291         return p.sign ? 0 : max;
2292     case float_class_zero:
2293         return 0;
2294     case float_class_normal:
2295         if (p.sign) {
2296             s->float_exception_flags = orig_flags | float_flag_invalid;
2297             return 0;
2298         }
2299 
2300         if (p.exp < DECOMPOSED_BINARY_POINT) {
2301             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2302         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2303             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2304         } else {
2305             s->float_exception_flags = orig_flags | float_flag_invalid;
2306             return max;
2307         }
2308 
2309         /* For uint64 this will never trip, but if p.exp is too large
2310          * to shift a decomposed fraction we shall have exited via the
2311          * 3rd leg above.
2312          */
2313         if (r > max) {
2314             s->float_exception_flags = orig_flags | float_flag_invalid;
2315             return max;
2316         }
2317         return r;
2318     default:
2319         g_assert_not_reached();
2320     }
2321 }
2322 
2323 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale,
2324                                   float_status *s)
2325 {
2326     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2327                                   rmode, scale, UINT16_MAX, s);
2328 }
2329 
2330 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale,
2331                                   float_status *s)
2332 {
2333     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2334                                   rmode, scale, UINT32_MAX, s);
2335 }
2336 
2337 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale,
2338                                   float_status *s)
2339 {
2340     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2341                                   rmode, scale, UINT64_MAX, s);
2342 }
2343 
2344 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale,
2345                                   float_status *s)
2346 {
2347     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2348                                   rmode, scale, UINT16_MAX, s);
2349 }
2350 
2351 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale,
2352                                   float_status *s)
2353 {
2354     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2355                                   rmode, scale, UINT32_MAX, s);
2356 }
2357 
2358 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale,
2359                                   float_status *s)
2360 {
2361     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2362                                   rmode, scale, UINT64_MAX, s);
2363 }
2364 
2365 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale,
2366                                   float_status *s)
2367 {
2368     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2369                                   rmode, scale, UINT16_MAX, s);
2370 }
2371 
2372 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale,
2373                                   float_status *s)
2374 {
2375     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2376                                   rmode, scale, UINT32_MAX, s);
2377 }
2378 
2379 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale,
2380                                   float_status *s)
2381 {
2382     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2383                                   rmode, scale, UINT64_MAX, s);
2384 }
2385 
2386 uint16_t float16_to_uint16(float16 a, float_status *s)
2387 {
2388     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2389 }
2390 
2391 uint32_t float16_to_uint32(float16 a, float_status *s)
2392 {
2393     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2394 }
2395 
2396 uint64_t float16_to_uint64(float16 a, float_status *s)
2397 {
2398     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2399 }
2400 
2401 uint16_t float32_to_uint16(float32 a, float_status *s)
2402 {
2403     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2404 }
2405 
2406 uint32_t float32_to_uint32(float32 a, float_status *s)
2407 {
2408     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2409 }
2410 
2411 uint64_t float32_to_uint64(float32 a, float_status *s)
2412 {
2413     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2414 }
2415 
2416 uint16_t float64_to_uint16(float64 a, float_status *s)
2417 {
2418     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2419 }
2420 
2421 uint32_t float64_to_uint32(float64 a, float_status *s)
2422 {
2423     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2424 }
2425 
2426 uint64_t float64_to_uint64(float64 a, float_status *s)
2427 {
2428     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2429 }
2430 
2431 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2432 {
2433     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2434 }
2435 
2436 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2437 {
2438     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2439 }
2440 
2441 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2442 {
2443     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2444 }
2445 
2446 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2447 {
2448     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2449 }
2450 
2451 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2452 {
2453     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2454 }
2455 
2456 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2457 {
2458     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2459 }
2460 
2461 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2462 {
2463     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2464 }
2465 
2466 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2467 {
2468     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2469 }
2470 
2471 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2472 {
2473     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2474 }
2475 
2476 /*
2477  * Integer to float conversions
2478  *
2479  * Returns the result of converting the two's complement integer `a'
2480  * to the floating-point format. The conversion is performed according
2481  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2482  */
2483 
2484 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
2485 {
2486     FloatParts r = { .sign = false };
2487 
2488     if (a == 0) {
2489         r.cls = float_class_zero;
2490     } else {
2491         uint64_t f = a;
2492         int shift;
2493 
2494         r.cls = float_class_normal;
2495         if (a < 0) {
2496             f = -f;
2497             r.sign = true;
2498         }
2499         shift = clz64(f) - 1;
2500         scale = MIN(MAX(scale, -0x10000), 0x10000);
2501 
2502         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2503         r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
2504     }
2505 
2506     return r;
2507 }
2508 
2509 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2510 {
2511     FloatParts pa = int_to_float(a, scale, status);
2512     return float16_round_pack_canonical(pa, status);
2513 }
2514 
2515 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2516 {
2517     return int64_to_float16_scalbn(a, scale, status);
2518 }
2519 
2520 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2521 {
2522     return int64_to_float16_scalbn(a, scale, status);
2523 }
2524 
2525 float16 int64_to_float16(int64_t a, float_status *status)
2526 {
2527     return int64_to_float16_scalbn(a, 0, status);
2528 }
2529 
2530 float16 int32_to_float16(int32_t a, float_status *status)
2531 {
2532     return int64_to_float16_scalbn(a, 0, status);
2533 }
2534 
2535 float16 int16_to_float16(int16_t a, float_status *status)
2536 {
2537     return int64_to_float16_scalbn(a, 0, status);
2538 }
2539 
2540 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2541 {
2542     FloatParts pa = int_to_float(a, scale, status);
2543     return float32_round_pack_canonical(pa, status);
2544 }
2545 
2546 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2547 {
2548     return int64_to_float32_scalbn(a, scale, status);
2549 }
2550 
2551 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2552 {
2553     return int64_to_float32_scalbn(a, scale, status);
2554 }
2555 
2556 float32 int64_to_float32(int64_t a, float_status *status)
2557 {
2558     return int64_to_float32_scalbn(a, 0, status);
2559 }
2560 
2561 float32 int32_to_float32(int32_t a, float_status *status)
2562 {
2563     return int64_to_float32_scalbn(a, 0, status);
2564 }
2565 
2566 float32 int16_to_float32(int16_t a, float_status *status)
2567 {
2568     return int64_to_float32_scalbn(a, 0, status);
2569 }
2570 
2571 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2572 {
2573     FloatParts pa = int_to_float(a, scale, status);
2574     return float64_round_pack_canonical(pa, status);
2575 }
2576 
2577 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2578 {
2579     return int64_to_float64_scalbn(a, scale, status);
2580 }
2581 
2582 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2583 {
2584     return int64_to_float64_scalbn(a, scale, status);
2585 }
2586 
2587 float64 int64_to_float64(int64_t a, float_status *status)
2588 {
2589     return int64_to_float64_scalbn(a, 0, status);
2590 }
2591 
2592 float64 int32_to_float64(int32_t a, float_status *status)
2593 {
2594     return int64_to_float64_scalbn(a, 0, status);
2595 }
2596 
2597 float64 int16_to_float64(int16_t a, float_status *status)
2598 {
2599     return int64_to_float64_scalbn(a, 0, status);
2600 }
2601 
2602 
2603 /*
2604  * Unsigned Integer to float conversions
2605  *
2606  * Returns the result of converting the unsigned integer `a' to the
2607  * floating-point format. The conversion is performed according to the
2608  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2609  */
2610 
2611 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
2612 {
2613     FloatParts r = { .sign = false };
2614 
2615     if (a == 0) {
2616         r.cls = float_class_zero;
2617     } else {
2618         scale = MIN(MAX(scale, -0x10000), 0x10000);
2619         r.cls = float_class_normal;
2620         if ((int64_t)a < 0) {
2621             r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
2622             shift64RightJamming(a, 1, &a);
2623             r.frac = a;
2624         } else {
2625             int shift = clz64(a) - 1;
2626             r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2627             r.frac = a << shift;
2628         }
2629     }
2630 
2631     return r;
2632 }
2633 
2634 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2635 {
2636     FloatParts pa = uint_to_float(a, scale, status);
2637     return float16_round_pack_canonical(pa, status);
2638 }
2639 
2640 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2641 {
2642     return uint64_to_float16_scalbn(a, scale, status);
2643 }
2644 
2645 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2646 {
2647     return uint64_to_float16_scalbn(a, scale, status);
2648 }
2649 
2650 float16 uint64_to_float16(uint64_t a, float_status *status)
2651 {
2652     return uint64_to_float16_scalbn(a, 0, status);
2653 }
2654 
2655 float16 uint32_to_float16(uint32_t a, float_status *status)
2656 {
2657     return uint64_to_float16_scalbn(a, 0, status);
2658 }
2659 
2660 float16 uint16_to_float16(uint16_t a, float_status *status)
2661 {
2662     return uint64_to_float16_scalbn(a, 0, status);
2663 }
2664 
2665 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2666 {
2667     FloatParts pa = uint_to_float(a, scale, status);
2668     return float32_round_pack_canonical(pa, status);
2669 }
2670 
2671 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2672 {
2673     return uint64_to_float32_scalbn(a, scale, status);
2674 }
2675 
2676 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2677 {
2678     return uint64_to_float32_scalbn(a, scale, status);
2679 }
2680 
2681 float32 uint64_to_float32(uint64_t a, float_status *status)
2682 {
2683     return uint64_to_float32_scalbn(a, 0, status);
2684 }
2685 
2686 float32 uint32_to_float32(uint32_t a, float_status *status)
2687 {
2688     return uint64_to_float32_scalbn(a, 0, status);
2689 }
2690 
2691 float32 uint16_to_float32(uint16_t a, float_status *status)
2692 {
2693     return uint64_to_float32_scalbn(a, 0, status);
2694 }
2695 
2696 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2697 {
2698     FloatParts pa = uint_to_float(a, scale, status);
2699     return float64_round_pack_canonical(pa, status);
2700 }
2701 
2702 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2703 {
2704     return uint64_to_float64_scalbn(a, scale, status);
2705 }
2706 
2707 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2708 {
2709     return uint64_to_float64_scalbn(a, scale, status);
2710 }
2711 
2712 float64 uint64_to_float64(uint64_t a, float_status *status)
2713 {
2714     return uint64_to_float64_scalbn(a, 0, status);
2715 }
2716 
2717 float64 uint32_to_float64(uint32_t a, float_status *status)
2718 {
2719     return uint64_to_float64_scalbn(a, 0, status);
2720 }
2721 
2722 float64 uint16_to_float64(uint16_t a, float_status *status)
2723 {
2724     return uint64_to_float64_scalbn(a, 0, status);
2725 }
2726 
2727 /* Float Min/Max */
2728 /* min() and max() functions. These can't be implemented as
2729  * 'compare and pick one input' because that would mishandle
2730  * NaNs and +0 vs -0.
2731  *
2732  * minnum() and maxnum() functions. These are similar to the min()
2733  * and max() functions but if one of the arguments is a QNaN and
2734  * the other is numerical then the numerical argument is returned.
2735  * SNaNs will get quietened before being returned.
2736  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2737  * and maxNum() operations. min() and max() are the typical min/max
2738  * semantics provided by many CPUs which predate that specification.
2739  *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from IEEE 754-2008.
2742  */
2743 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2744                                 bool ieee, bool ismag, float_status *s)
2745 {
2746     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2747         if (ieee) {
2748             /* Takes two floating-point values `a' and `b', one of
2749              * which is a NaN, and returns the appropriate NaN
2750              * result. If either `a' or `b' is a signaling NaN,
2751              * the invalid exception is raised.
2752              */
2753             if (is_snan(a.cls) || is_snan(b.cls)) {
2754                 return pick_nan(a, b, s);
2755             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2756                 return b;
2757             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2758                 return a;
2759             }
2760         }
2761         return pick_nan(a, b, s);
2762     } else {
2763         int a_exp, b_exp;
2764 
2765         switch (a.cls) {
2766         case float_class_normal:
2767             a_exp = a.exp;
2768             break;
2769         case float_class_inf:
2770             a_exp = INT_MAX;
2771             break;
2772         case float_class_zero:
2773             a_exp = INT_MIN;
2774             break;
2775         default:
2776             g_assert_not_reached();
2777             break;
2778         }
2779         switch (b.cls) {
2780         case float_class_normal:
2781             b_exp = b.exp;
2782             break;
2783         case float_class_inf:
2784             b_exp = INT_MAX;
2785             break;
2786         case float_class_zero:
2787             b_exp = INT_MIN;
2788             break;
2789         default:
2790             g_assert_not_reached();
2791             break;
2792         }
2793 
2794         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2795             bool a_less = a_exp < b_exp;
2796             if (a_exp == b_exp) {
2797                 a_less = a.frac < b.frac;
2798             }
2799             return a_less ^ ismin ? b : a;
2800         }
2801 
2802         if (a.sign == b.sign) {
2803             bool a_less = a_exp < b_exp;
2804             if (a_exp == b_exp) {
2805                 a_less = a.frac < b.frac;
2806             }
2807             return a.sign ^ a_less ^ ismin ? b : a;
2808         } else {
2809             return a.sign ^ ismin ? b : a;
2810         }
2811     }
2812 }
2813 
2814 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
2815 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
2816                                      float_status *s)                   \
2817 {                                                                       \
2818     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2819     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2820     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
2821                                                                         \
2822     return float ## sz ## _round_pack_canonical(pr, s);                 \
2823 }
2824 
2825 MINMAX(16, min, true, false, false)
2826 MINMAX(16, minnum, true, true, false)
2827 MINMAX(16, minnummag, true, true, true)
2828 MINMAX(16, max, false, false, false)
2829 MINMAX(16, maxnum, false, true, false)
2830 MINMAX(16, maxnummag, false, true, true)
2831 
2832 MINMAX(32, min, true, false, false)
2833 MINMAX(32, minnum, true, true, false)
2834 MINMAX(32, minnummag, true, true, true)
2835 MINMAX(32, max, false, false, false)
2836 MINMAX(32, maxnum, false, true, false)
2837 MINMAX(32, maxnummag, false, true, true)
2838 
2839 MINMAX(64, min, true, false, false)
2840 MINMAX(64, minnum, true, true, false)
2841 MINMAX(64, minnummag, true, true, true)
2842 MINMAX(64, max, false, false, false)
2843 MINMAX(64, maxnum, false, true, false)
2844 MINMAX(64, maxnummag, false, true, true)
2845 
2846 #undef MINMAX
2847 
2848 /* Floating point compare */
2849 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2850                           float_status *s)
2851 {
2852     if (is_nan(a.cls) || is_nan(b.cls)) {
2853         if (!is_quiet ||
2854             a.cls == float_class_snan ||
2855             b.cls == float_class_snan) {
2856             s->float_exception_flags |= float_flag_invalid;
2857         }
2858         return float_relation_unordered;
2859     }
2860 
2861     if (a.cls == float_class_zero) {
2862         if (b.cls == float_class_zero) {
2863             return float_relation_equal;
2864         }
2865         return b.sign ? float_relation_greater : float_relation_less;
2866     } else if (b.cls == float_class_zero) {
2867         return a.sign ? float_relation_less : float_relation_greater;
2868     }
2869 
2870     /* The only really important thing about infinity is its sign. If
2871      * both are infinities the sign marks the smallest of the two.
2872      */
2873     if (a.cls == float_class_inf) {
2874         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2875             return float_relation_equal;
2876         }
2877         return a.sign ? float_relation_less : float_relation_greater;
2878     } else if (b.cls == float_class_inf) {
2879         return b.sign ? float_relation_greater : float_relation_less;
2880     }
2881 
2882     if (a.sign != b.sign) {
2883         return a.sign ? float_relation_less : float_relation_greater;
2884     }
2885 
2886     if (a.exp == b.exp) {
2887         if (a.frac == b.frac) {
2888             return float_relation_equal;
2889         }
2890         if (a.sign) {
2891             return a.frac > b.frac ?
2892                 float_relation_less : float_relation_greater;
2893         } else {
2894             return a.frac > b.frac ?
2895                 float_relation_greater : float_relation_less;
2896         }
2897     } else {
2898         if (a.sign) {
2899             return a.exp > b.exp ? float_relation_less : float_relation_greater;
2900         } else {
2901             return a.exp > b.exp ? float_relation_greater : float_relation_less;
2902         }
2903     }
2904 }
2905 
2906 #define COMPARE(sz)                                                     \
2907 int float ## sz ## _compare(float ## sz a, float ## sz b,               \
2908                             float_status *s)                            \
2909 {                                                                       \
2910     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2911     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2912     return compare_floats(pa, pb, false, s);                            \
2913 }                                                                       \
2914 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b,         \
2915                                   float_status *s)                      \
2916 {                                                                       \
2917     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2918     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2919     return compare_floats(pa, pb, true, s);                             \
2920 }
2921 
2922 COMPARE(16)
2923 COMPARE(32)
2924 COMPARE(64)
2925 
2926 #undef COMPARE
2927 
2928 /* Multiply A by 2 raised to the power N.  */
2929 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
2930 {
2931     if (unlikely(is_nan(a.cls))) {
2932         return return_nan(a, s);
2933     }
2934     if (a.cls == float_class_normal) {
2935         /* The largest float type (even though not supported by FloatParts)
2936          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
2937          * still allows rounding to infinity, without allowing overflow
2938          * within the int32_t that backs FloatParts.exp.
2939          */
2940         n = MIN(MAX(n, -0x10000), 0x10000);
2941         a.exp += n;
2942     }
2943     return a;
2944 }
2945 
2946 float16 float16_scalbn(float16 a, int n, float_status *status)
2947 {
2948     FloatParts pa = float16_unpack_canonical(a, status);
2949     FloatParts pr = scalbn_decomposed(pa, n, status);
2950     return float16_round_pack_canonical(pr, status);
2951 }
2952 
2953 float32 float32_scalbn(float32 a, int n, float_status *status)
2954 {
2955     FloatParts pa = float32_unpack_canonical(a, status);
2956     FloatParts pr = scalbn_decomposed(pa, n, status);
2957     return float32_round_pack_canonical(pr, status);
2958 }
2959 
2960 float64 float64_scalbn(float64 a, int n, float_status *status)
2961 {
2962     FloatParts pa = float64_unpack_canonical(a, status);
2963     FloatParts pr = scalbn_decomposed(pa, n, status);
2964     return float64_round_pack_canonical(pr, status);
2965 }
2966 
2967 /*
2968  * Square Root
2969  *
2970  * The old softfloat code did an approximation step before zeroing in
 * on the final result. However, for simplicity we just compute the
2972  * square root by iterating down from the implicit bit to enough extra
2973  * bits to ensure we get a correctly rounded result.
2974  *
2975  * This does mean however the calculation is slower than before,
2976  * especially for 64 bit floats.
2977  */
2978 
2979 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
2980 {
2981     uint64_t a_frac, r_frac, s_frac;
2982     int bit, last_bit;
2983 
2984     if (is_nan(a.cls)) {
2985         return return_nan(a, s);
2986     }
2987     if (a.cls == float_class_zero) {
2988         return a;  /* sqrt(+-0) = +-0 */
2989     }
2990     if (a.sign) {
2991         s->float_exception_flags |= float_flag_invalid;
2992         return parts_default_nan(s);
2993     }
2994     if (a.cls == float_class_inf) {
2995         return a;  /* sqrt(+inf) = +inf */
2996     }
2997 
2998     assert(a.cls == float_class_normal);
2999 
3000     /* We need two overflow bits at the top. Adding room for that is a
3001      * right shift. If the exponent is odd, we can discard the low bit
3002      * by multiplying the fraction by 2; that's a left shift. Combine
3003      * those and we shift right if the exponent is even.
3004      */
3005     a_frac = a.frac;
3006     if (!(a.exp & 1)) {
3007         a_frac >>= 1;
3008     }
3009     a.exp >>= 1;
3010 
3011     /* Bit-by-bit computation of sqrt.  */
3012     r_frac = 0;
3013     s_frac = 0;
3014 
3015     /* Iterate from implicit bit down to the 3 extra bits to compute a
3016      * properly rounded result. Remember we've inserted one more bit
3017      * at the top, so these positions are one less.
3018      */
3019     bit = DECOMPOSED_BINARY_POINT - 1;
3020     last_bit = MAX(p->frac_shift - 4, 0);
3021     do {
3022         uint64_t q = 1ULL << bit;
3023         uint64_t t_frac = s_frac + q;
3024         if (t_frac <= a_frac) {
3025             s_frac = t_frac + q;
3026             a_frac -= t_frac;
3027             r_frac += q;
3028         }
3029         a_frac <<= 1;
3030     } while (--bit >= last_bit);
3031 
3032     /* Undo the right shift done above. If there is any remaining
3033      * fraction, the result is inexact. Set the sticky bit.
3034      */
3035     a.frac = (r_frac << 1) + (a_frac != 0);
3036 
3037     return a;
3038 }
3039 
3040 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3041 {
3042     FloatParts pa = float16_unpack_canonical(a, status);
3043     FloatParts pr = sqrt_float(pa, status, &float16_params);
3044     return float16_round_pack_canonical(pr, status);
3045 }
3046 
3047 float32 QEMU_FLATTEN float32_sqrt(float32 a, float_status *status)
3048 {
3049     FloatParts pa = float32_unpack_canonical(a, status);
3050     FloatParts pr = sqrt_float(pa, status, &float32_params);
3051     return float32_round_pack_canonical(pr, status);
3052 }
3053 
3054 float64 QEMU_FLATTEN float64_sqrt(float64 a, float_status *status)
3055 {
3056     FloatParts pa = float64_unpack_canonical(a, status);
3057     FloatParts pr = sqrt_float(pa, status, &float64_params);
3058     return float64_round_pack_canonical(pr, status);
3059 }
3060 
3061 /*----------------------------------------------------------------------------
3062 | The pattern for a default generated NaN.
3063 *----------------------------------------------------------------------------*/
3064 
3065 float16 float16_default_nan(float_status *status)
3066 {
3067     FloatParts p = parts_default_nan(status);
3068     p.frac >>= float16_params.frac_shift;
3069     return float16_pack_raw(p);
3070 }
3071 
3072 float32 float32_default_nan(float_status *status)
3073 {
3074     FloatParts p = parts_default_nan(status);
3075     p.frac >>= float32_params.frac_shift;
3076     return float32_pack_raw(p);
3077 }
3078 
3079 float64 float64_default_nan(float_status *status)
3080 {
3081     FloatParts p = parts_default_nan(status);
3082     p.frac >>= float64_params.frac_shift;
3083     return float64_pack_raw(p);
3084 }
3085 
3086 float128 float128_default_nan(float_status *status)
3087 {
3088     FloatParts p = parts_default_nan(status);
3089     float128 r;
3090 
3091     /* Extrapolate from the choices made by parts_default_nan to fill
3092      * in the quad-floating format.  If the low bit is set, assume we
3093      * want to set all non-snan bits.
3094      */
3095     r.low = -(p.frac & 1);
3096     r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
3097     r.high |= LIT64(0x7FFF000000000000);
3098     r.high |= (uint64_t)p.sign << 63;
3099 
3100     return r;
3101 }
3102 
3103 /*----------------------------------------------------------------------------
3104 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3105 *----------------------------------------------------------------------------*/
3106 
3107 float16 float16_silence_nan(float16 a, float_status *status)
3108 {
3109     FloatParts p = float16_unpack_raw(a);
3110     p.frac <<= float16_params.frac_shift;
3111     p = parts_silence_nan(p, status);
3112     p.frac >>= float16_params.frac_shift;
3113     return float16_pack_raw(p);
3114 }
3115 
3116 float32 float32_silence_nan(float32 a, float_status *status)
3117 {
3118     FloatParts p = float32_unpack_raw(a);
3119     p.frac <<= float32_params.frac_shift;
3120     p = parts_silence_nan(p, status);
3121     p.frac >>= float32_params.frac_shift;
3122     return float32_pack_raw(p);
3123 }
3124 
3125 float64 float64_silence_nan(float64 a, float_status *status)
3126 {
3127     FloatParts p = float64_unpack_raw(a);
3128     p.frac <<= float64_params.frac_shift;
3129     p = parts_silence_nan(p, status);
3130     p.frac >>= float64_params.frac_shift;
3131     return float64_pack_raw(p);
3132 }
3133 
3134 /*----------------------------------------------------------------------------
3135 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3136 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3137 | input.  If `zSign' is 1, the input is negated before being converted to an
3138 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
3139 | is simply rounded to an integer, with the inexact exception raised if the
3140 | input cannot be represented exactly as an integer.  However, if the fixed-
3141 | point input is too large, the invalid exception is raised and the largest
3142 | positive or negative integer is returned.
3143 *----------------------------------------------------------------------------*/
3144 
3145 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
3146 {
3147     int8_t roundingMode;
3148     flag roundNearestEven;
3149     int8_t roundIncrement, roundBits;
3150     int32_t z;
3151 
3152     roundingMode = status->float_rounding_mode;
3153     roundNearestEven = ( roundingMode == float_round_nearest_even );
3154     switch (roundingMode) {
3155     case float_round_nearest_even:
3156     case float_round_ties_away:
3157         roundIncrement = 0x40;
3158         break;
3159     case float_round_to_zero:
3160         roundIncrement = 0;
3161         break;
3162     case float_round_up:
3163         roundIncrement = zSign ? 0 : 0x7f;
3164         break;
3165     case float_round_down:
3166         roundIncrement = zSign ? 0x7f : 0;
3167         break;
3168     default:
3169         abort();
3170     }
3171     roundBits = absZ & 0x7F;
3172     absZ = ( absZ + roundIncrement )>>7;
3173     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3174     z = absZ;
3175     if ( zSign ) z = - z;
3176     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
3177         float_raise(float_flag_invalid, status);
3178         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3179     }
3180     if (roundBits) {
3181         status->float_exception_flags |= float_flag_inexact;
3182     }
3183     return z;
3184 
3185 }
3186 
3187 /*----------------------------------------------------------------------------
3188 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3189 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3190 | and returns the properly rounded 64-bit integer corresponding to the input.
3191 | If `zSign' is 1, the input is negated before being converted to an integer.
3192 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3193 | the inexact exception raised if the input cannot be represented exactly as
3194 | an integer.  However, if the fixed-point input is too large, the invalid
3195 | exception is raised and the largest positive or negative integer is
3196 | returned.
3197 *----------------------------------------------------------------------------*/
3198 
3199 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
3200                                float_status *status)
3201 {
3202     int8_t roundingMode;
3203     flag roundNearestEven, increment;
3204     int64_t z;
3205 
3206     roundingMode = status->float_rounding_mode;
3207     roundNearestEven = ( roundingMode == float_round_nearest_even );
3208     switch (roundingMode) {
3209     case float_round_nearest_even:
3210     case float_round_ties_away:
3211         increment = ((int64_t) absZ1 < 0);
3212         break;
3213     case float_round_to_zero:
3214         increment = 0;
3215         break;
3216     case float_round_up:
3217         increment = !zSign && absZ1;
3218         break;
3219     case float_round_down:
3220         increment = zSign && absZ1;
3221         break;
3222     default:
3223         abort();
3224     }
3225     if ( increment ) {
3226         ++absZ0;
3227         if ( absZ0 == 0 ) goto overflow;
3228         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
3229     }
3230     z = absZ0;
3231     if ( zSign ) z = - z;
3232     if ( z && ( ( z < 0 ) ^ zSign ) ) {
3233  overflow:
3234         float_raise(float_flag_invalid, status);
3235         return
3236               zSign ? (int64_t) LIT64( 0x8000000000000000 )
3237             : LIT64( 0x7FFFFFFFFFFFFFFF );
3238     }
3239     if (absZ1) {
3240         status->float_exception_flags |= float_flag_inexact;
3241     }
3242     return z;
3243 
3244 }
3245 
3246 /*----------------------------------------------------------------------------
3247 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3248 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3249 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3250 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
3251 | with the inexact exception raised if the input cannot be represented exactly
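| as an integer.  However, if the fixed-point input is too large, the invalid
| exception is raised and the largest unsigned integer is returned.  If
| `zSign' is nonzero and the rounded magnitude is nonzero, the invalid
| exception is raised and zero is returned.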
3254 *----------------------------------------------------------------------------*/
3255 
3256 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
3257                                 uint64_t absZ1, float_status *status)
3258 {
3259     int8_t roundingMode;
3260     flag roundNearestEven, increment;
3261 
3262     roundingMode = status->float_rounding_mode;
3263     roundNearestEven = (roundingMode == float_round_nearest_even);
3264     switch (roundingMode) {
3265     case float_round_nearest_even:
3266     case float_round_ties_away:
3267         increment = ((int64_t)absZ1 < 0);
3268         break;
3269     case float_round_to_zero:
3270         increment = 0;
3271         break;
3272     case float_round_up:
3273         increment = !zSign && absZ1;
3274         break;
3275     case float_round_down:
3276         increment = zSign && absZ1;
3277         break;
3278     default:
3279         abort();
3280     }
3281     if (increment) {
3282         ++absZ0;
3283         if (absZ0 == 0) {
3284             float_raise(float_flag_invalid, status);
3285             return LIT64(0xFFFFFFFFFFFFFFFF);
3286         }
3287         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
3288     }
3289 
3290     if (zSign && absZ0) {
3291         float_raise(float_flag_invalid, status);
3292         return 0;
3293     }
3294 
3295     if (absZ1) {
3296         status->float_exception_flags |= float_flag_inexact;
3297     }
3298     return absZ0;
3299 }
3300 
3301 /*----------------------------------------------------------------------------
3302 | If `a' is denormal and we are in flush-to-zero mode then set the
3303 | input-denormal exception and return zero. Otherwise just return the value.
3304 *----------------------------------------------------------------------------*/
3305 float32 float32_squash_input_denormal(float32 a, float_status *status)
3306 {
3307     if (status->flush_inputs_to_zero) {
3308         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
3309             float_raise(float_flag_input_denormal, status);
3310             return make_float32(float32_val(a) & 0x80000000);
3311         }
3312     }
3313     return a;
3314 }
3315 
3316 /*----------------------------------------------------------------------------
3317 | Normalizes the subnormal single-precision floating-point value represented
3318 | by the denormalized significand `aSig'.  The normalized exponent and
3319 | significand are stored at the locations pointed to by `zExpPtr' and
3320 | `zSigPtr', respectively.
3321 *----------------------------------------------------------------------------*/
3322 
static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /* Left shift that leaves exactly 8 leading zero bits in the result. */
    int8_t shift = clz32(aSig) - 8;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
3333 
3334 /*----------------------------------------------------------------------------
3335 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3336 | and significand `zSig', and returns the proper single-precision floating-
3337 | point value corresponding to the abstract input.  Ordinarily, the abstract
3338 | value is simply rounded and packed into the single-precision format, with
3339 | the inexact exception raised if the abstract input cannot be represented
3340 | exactly.  However, if the abstract value is too large, the overflow and
3341 | inexact exceptions are raised and an infinity or maximal finite value is
3342 | returned.  If the abstract value is too small, the input value is rounded to
3343 | a subnormal number, and the underflow and inexact exceptions are raised if
3344 | the abstract input cannot be represented exactly as a subnormal single-
3345 | precision floating-point number.
3346 |     The input significand `zSig' has its binary point between bits 30
3347 | and 29, which is 7 bits to the left of the usual location.  This shifted
3348 | significand must be normalized or smaller.  If `zSig' is not normalized,
3349 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3350 | and it must not require rounding.  In the usual case that `zSig' is
3351 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3352 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3353 | Binary Floating-Point Arithmetic.
3354 *----------------------------------------------------------------------------*/
3355 
3356 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3357                                    float_status *status)
3358 {
3359     int8_t roundingMode;
3360     flag roundNearestEven;
3361     int8_t roundIncrement, roundBits;
3362     flag isTiny;
3363 
3364     roundingMode = status->float_rounding_mode;
3365     roundNearestEven = ( roundingMode == float_round_nearest_even );
3366     switch (roundingMode) {
3367     case float_round_nearest_even:
3368     case float_round_ties_away:
3369         roundIncrement = 0x40;
3370         break;
3371     case float_round_to_zero:
3372         roundIncrement = 0;
3373         break;
3374     case float_round_up:
3375         roundIncrement = zSign ? 0 : 0x7f;
3376         break;
3377     case float_round_down:
3378         roundIncrement = zSign ? 0x7f : 0;
3379         break;
3380     default:
3381         abort();
3382         break;
3383     }
3384     roundBits = zSig & 0x7F;
3385     if ( 0xFD <= (uint16_t) zExp ) {
3386         if (    ( 0xFD < zExp )
3387              || (    ( zExp == 0xFD )
3388                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
3389            ) {
3390             float_raise(float_flag_overflow | float_flag_inexact, status);
3391             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
3392         }
3393         if ( zExp < 0 ) {
3394             if (status->flush_to_zero) {
3395                 float_raise(float_flag_output_denormal, status);
3396                 return packFloat32(zSign, 0, 0);
3397             }
3398             isTiny =
3399                 (status->float_detect_tininess
3400                  == float_tininess_before_rounding)
3401                 || ( zExp < -1 )
3402                 || ( zSig + roundIncrement < 0x80000000 );
3403             shift32RightJamming( zSig, - zExp, &zSig );
3404             zExp = 0;
3405             roundBits = zSig & 0x7F;
3406             if (isTiny && roundBits) {
3407                 float_raise(float_flag_underflow, status);
3408             }
3409         }
3410     }
3411     if (roundBits) {
3412         status->float_exception_flags |= float_flag_inexact;
3413     }
3414     zSig = ( zSig + roundIncrement )>>7;
3415     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3416     if ( zSig == 0 ) zExp = 0;
3417     return packFloat32( zSign, zExp, zSig );
3418 
3419 }
3420 
3421 /*----------------------------------------------------------------------------
3422 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3423 | and significand `zSig', and returns the proper single-precision floating-
3424 | point value corresponding to the abstract input.  This routine is just like
3425 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3426 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3427 | floating-point exponent.
3428 *----------------------------------------------------------------------------*/
3429 
3430 static float32
3431  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3432                               float_status *status)
3433 {
3434     int8_t shiftCount;
3435 
3436     shiftCount = clz32(zSig) - 1;
3437     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
3438                                status);
3439 
3440 }
3441 
3442 /*----------------------------------------------------------------------------
3443 | If `a' is denormal and we are in flush-to-zero mode then set the
3444 | input-denormal exception and return zero. Otherwise just return the value.
3445 *----------------------------------------------------------------------------*/
3446 float64 float64_squash_input_denormal(float64 a, float_status *status)
3447 {
3448     if (status->flush_inputs_to_zero) {
3449         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
3450             float_raise(float_flag_input_denormal, status);
3451             return make_float64(float64_val(a) & (1ULL << 63));
3452         }
3453     }
3454     return a;
3455 }
3456 
3457 /*----------------------------------------------------------------------------
3458 | Normalizes the subnormal double-precision floating-point value represented
3459 | by the denormalized significand `aSig'.  The normalized exponent and
3460 | significand are stored at the locations pointed to by `zExpPtr' and
3461 | `zSigPtr', respectively.
3462 *----------------------------------------------------------------------------*/
3463 
static void normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr,
                                      uint64_t *zSigPtr)
{
    /* Shift that leaves 11 leading zero bits above the significand. */
    const int8_t normShift = clz64(aSig) - 11;

    *zSigPtr = aSig << normShift;
    /* Every position shifted lowers the exponent, starting from 1. */
    *zExpPtr = 1 - normShift;
}
3474 
3475 /*----------------------------------------------------------------------------
3476 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3477 | double-precision floating-point value, returning the result.  After being
3478 | shifted into the proper positions, the three fields are simply added
3479 | together to form the result.  This means that any integer portion of `zSig'
3480 | will be added into the exponent.  Since a properly normalized significand
3481 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3482 | than the desired result exponent whenever `zSig' is a complete, normalized
3483 | significand.
3484 *----------------------------------------------------------------------------*/
3485 
3486 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
3487 {
3488 
3489     return make_float64(
3490         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
3491 
3492 }
3493 
3494 /*----------------------------------------------------------------------------
3495 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3496 | and significand `zSig', and returns the proper double-precision floating-
3497 | point value corresponding to the abstract input.  Ordinarily, the abstract
3498 | value is simply rounded and packed into the double-precision format, with
3499 | the inexact exception raised if the abstract input cannot be represented
3500 | exactly.  However, if the abstract value is too large, the overflow and
3501 | inexact exceptions are raised and an infinity or maximal finite value is
3502 | returned.  If the abstract value is too small, the input value is rounded to
3503 | a subnormal number, and the underflow and inexact exceptions are raised if
3504 | the abstract input cannot be represented exactly as a subnormal double-
3505 | precision floating-point number.
3506 |     The input significand `zSig' has its binary point between bits 62
3507 | and 61, which is 10 bits to the left of the usual location.  This shifted
3508 | significand must be normalized or smaller.  If `zSig' is not normalized,
3509 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3510 | and it must not require rounding.  In the usual case that `zSig' is
3511 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3512 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3513 | Binary Floating-Point Arithmetic.
3514 *----------------------------------------------------------------------------*/
3515 
3516 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3517                                    float_status *status)
3518 {
3519     int8_t roundingMode;
3520     flag roundNearestEven;
3521     int roundIncrement, roundBits;
3522     flag isTiny;
3523 
3524     roundingMode = status->float_rounding_mode;
3525     roundNearestEven = ( roundingMode == float_round_nearest_even );
3526     switch (roundingMode) {
3527     case float_round_nearest_even:
3528     case float_round_ties_away:
3529         roundIncrement = 0x200;
3530         break;
3531     case float_round_to_zero:
3532         roundIncrement = 0;
3533         break;
3534     case float_round_up:
3535         roundIncrement = zSign ? 0 : 0x3ff;
3536         break;
3537     case float_round_down:
3538         roundIncrement = zSign ? 0x3ff : 0;
3539         break;
3540     case float_round_to_odd:
3541         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3542         break;
3543     default:
3544         abort();
3545     }
3546     roundBits = zSig & 0x3FF;
3547     if ( 0x7FD <= (uint16_t) zExp ) {
3548         if (    ( 0x7FD < zExp )
3549              || (    ( zExp == 0x7FD )
3550                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
3551            ) {
3552             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3553                                    roundIncrement != 0;
3554             float_raise(float_flag_overflow | float_flag_inexact, status);
3555             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
3556         }
3557         if ( zExp < 0 ) {
3558             if (status->flush_to_zero) {
3559                 float_raise(float_flag_output_denormal, status);
3560                 return packFloat64(zSign, 0, 0);
3561             }
3562             isTiny =
3563                    (status->float_detect_tininess
3564                     == float_tininess_before_rounding)
3565                 || ( zExp < -1 )
3566                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
3567             shift64RightJamming( zSig, - zExp, &zSig );
3568             zExp = 0;
3569             roundBits = zSig & 0x3FF;
3570             if (isTiny && roundBits) {
3571                 float_raise(float_flag_underflow, status);
3572             }
3573             if (roundingMode == float_round_to_odd) {
3574                 /*
3575                  * For round-to-odd case, the roundIncrement depends on
3576                  * zSig which just changed.
3577                  */
3578                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3579             }
3580         }
3581     }
3582     if (roundBits) {
3583         status->float_exception_flags |= float_flag_inexact;
3584     }
3585     zSig = ( zSig + roundIncrement )>>10;
3586     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
3587     if ( zSig == 0 ) zExp = 0;
3588     return packFloat64( zSign, zExp, zSig );
3589 
3590 }
3591 
3592 /*----------------------------------------------------------------------------
3593 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3594 | and significand `zSig', and returns the proper double-precision floating-
3595 | point value corresponding to the abstract input.  This routine is just like
3596 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3597 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3598 | floating-point exponent.
3599 *----------------------------------------------------------------------------*/
3600 
3601 static float64
3602  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3603                               float_status *status)
3604 {
3605     int8_t shiftCount;
3606 
3607     shiftCount = clz64(zSig) - 1;
3608     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
3609                                status);
3610 
3611 }
3612 
3613 /*----------------------------------------------------------------------------
3614 | Normalizes the subnormal extended double-precision floating-point value
3615 | represented by the denormalized significand `aSig'.  The normalized exponent
3616 | and significand are stored at the locations pointed to by `zExpPtr' and
3617 | `zSigPtr', respectively.
3618 *----------------------------------------------------------------------------*/
3619 
void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    /* Shift that brings the leading set bit of `aSig' up to bit 63. */
    const int8_t normShift = clz64(aSig);

    *zSigPtr = aSig << normShift;
    /* Every position shifted lowers the exponent, starting from 1. */
    *zExpPtr = 1 - normShift;
}
3629 
3630 /*----------------------------------------------------------------------------
3631 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3632 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3633 | and returns the proper extended double-precision floating-point value
3634 | corresponding to the abstract input.  Ordinarily, the abstract value is
3635 | rounded and packed into the extended double-precision format, with the
3636 | inexact exception raised if the abstract input cannot be represented
3637 | exactly.  However, if the abstract value is too large, the overflow and
3638 | inexact exceptions are raised and an infinity or maximal finite value is
3639 | returned.  If the abstract value is too small, the input value is rounded to
3640 | a subnormal number, and the underflow and inexact exceptions are raised if
3641 | the abstract input cannot be represented exactly as a subnormal extended
3642 | double-precision floating-point number.
3643 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
3644 | number of bits as single or double precision, respectively.  Otherwise, the
3645 | result is rounded to the full precision of the extended double-precision
3646 | format.
3647 |     The input significand must be normalized or smaller.  If the input
3648 | significand is not normalized, `zExp' must be 0; in that case, the result
3649 | returned is a subnormal number, and it must not require rounding.  The
3650 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3651 | Floating-Point Arithmetic.
3652 *----------------------------------------------------------------------------*/
3653 
floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * Select the rounding path.  For reduced precision, roundMask covers
     * the low bits of zSig0 that are discarded and roundIncrement starts
     * out as half of (roundMask + 1).  Any roundingPrecision other than
     * 32 or 64 is handled as full 80-bit precision.
     */
    if ( roundingPrecision == 80 ) goto precision80;
    if ( roundingPrecision == 64 ) {
        roundIncrement = LIT64( 0x0000000000000400 );
        roundMask = LIT64( 0x00000000000007FF );
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = LIT64( 0x0000008000000000 );
        roundMask = LIT64( 0x000000FFFFFFFFFF );
    }
    else {
        goto precision80;
    }
    /* Reduced precision: fold all of zSig1 into a sticky bit 0 of zSig0. */
    zSig0 |= ( zSig1 != 0 );
    /*
     * The nearest modes keep the half-way increment; directed modes use
     * either nothing or the whole mask.  Any other mode aborts.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /* Unsigned compare: true when zExp <= 0 or zExp >= 0x7FFE. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        /*
         * Overflow: exponent too large, or at the largest exponent the
         * rounding addition would carry out of the top of zSig0.
         */
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            /*
             * Tiny if tininess is detected before rounding, if zExp is
             * negative, or if adding the increment does not wrap zSig0.
             */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ( zSig0 <= zSig0 + roundIncrement );
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            /* The discarded bits must be re-read after the shift. */
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                status->float_exception_flags |= float_flag_inexact;
            }
            zSig0 += roundIncrement;
            /* Rounding carried into bit 63: use an exponent field of 1. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            /*
             * On an exact tie in nearest-even mode, widen the mask by one
             * bit so the lowest kept bit is cleared as well (even result).
             */
            roundIncrement = roundMask + 1;
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    /* Reduced precision, exponent in the normal range. */
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig0 += roundIncrement;
    /* The addition wrapped: bump the exponent, significand is 1.0. */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = LIT64( 0x8000000000000000 );
    }
    /* Same tie-to-even mask widening as in the subnormal case above. */
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    /* A zero significand is packed with a zero exponent. */
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /*
     * Full precision: zSig1 holds the bits below the result.  Decide
     * whether zSig0 has to be incremented; the nearest modes look at the
     * top bit of zSig1, the directed modes at whether zSig1 is non-zero.
     * Any other mode aborts.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    /* Unsigned compare: true when zExp <= 0 or zExp >= 0x7FFE. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
                  && increment
                )
           ) {
            /* Full precision: ~roundMask below is an all-ones significand. */
            roundMask = 0;
            /*
             * Also entered by `goto overflow' from the reduced-precision
             * path, in which case roundMask is that precision's mask.
             */
 overflow:
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /*
             * Modes that round towards zero for this sign return the
             * value with exponent 0x7FFE and significand ~roundMask;
             * all other modes return infinity.
             */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            /*
             * NOTE(review): unlike the reduced-precision path, this branch
             * does not consult status->flush_to_zero - confirm intended.
             */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ! increment
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                status->float_exception_flags |= float_flag_inexact;
            }
            /* zSig1 changed with the shift, so redo the increment decision. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Exact tie in nearest-even mode: clear bit 0 (even result). */
                zSig0 &=
                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
                /* Rounding carried into bit 63: use an exponent field of 1. */
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    /* Full precision, exponent in the normal range. */
    if (zSig1) {
        status->float_exception_flags |= float_flag_inexact;
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* The increment wrapped: bump the exponent, significand is 1.0. */
            ++zExp;
            zSig0 = LIT64( 0x8000000000000000 );
        }
        else {
            /* Exact tie in nearest-even mode: clear bit 0 (even result). */
            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
        }
    }
    else {
        /* A zero significand is packed with a zero exponent. */
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}
3842 
3843 /*----------------------------------------------------------------------------
3844 | Takes an abstract floating-point value having sign `zSign', exponent
3845 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
3846 | and returns the proper extended double-precision floating-point value
3847 | corresponding to the abstract input.  This routine is just like
3848 | `roundAndPackFloatx80' except that the input significand does not have to be
3849 | normalized.
3850 *----------------------------------------------------------------------------*/
3851 
3852 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
3853                                        flag zSign, int32_t zExp,
3854                                        uint64_t zSig0, uint64_t zSig1,
3855                                        float_status *status)
3856 {
3857     int8_t shiftCount;
3858 
3859     if ( zSig0 == 0 ) {
3860         zSig0 = zSig1;
3861         zSig1 = 0;
3862         zExp -= 64;
3863     }
3864     shiftCount = clz64(zSig0);
3865     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3866     zExp -= shiftCount;
3867     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
3868                                 zSig0, zSig1, status);
3869 
3870 }
3871 
3872 /*----------------------------------------------------------------------------
3873 | Returns the least-significant 64 fraction bits of the quadruple-precision
3874 | floating-point value `a'.
3875 *----------------------------------------------------------------------------*/
3876 
3877 static inline uint64_t extractFloat128Frac1( float128 a )
3878 {
3879 
3880     return a.low;
3881 
3882 }
3883 
3884 /*----------------------------------------------------------------------------
3885 | Returns the most-significant 48 fraction bits of the quadruple-precision
3886 | floating-point value `a'.
3887 *----------------------------------------------------------------------------*/
3888 
3889 static inline uint64_t extractFloat128Frac0( float128 a )
3890 {
3891 
3892     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
3893 
3894 }
3895 
3896 /*----------------------------------------------------------------------------
3897 | Returns the exponent bits of the quadruple-precision floating-point value
3898 | `a'.
3899 *----------------------------------------------------------------------------*/
3900 
3901 static inline int32_t extractFloat128Exp( float128 a )
3902 {
3903 
3904     return ( a.high>>48 ) & 0x7FFF;
3905 
3906 }
3907 
3908 /*----------------------------------------------------------------------------
3909 | Returns the sign bit of the quadruple-precision floating-point value `a'.
3910 *----------------------------------------------------------------------------*/
3911 
3912 static inline flag extractFloat128Sign( float128 a )
3913 {
3914 
3915     return a.high>>63;
3916 
3917 }
3918 
3919 /*----------------------------------------------------------------------------
3920 | Normalizes the subnormal quadruple-precision floating-point value
3921 | represented by the denormalized significand formed by the concatenation of
3922 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
3923 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
3924 | significand are stored at the location pointed to by `zSig0Ptr', and the
3925 | least significant 64 bits of the normalized significand are stored at the
3926 | location pointed to by `zSig1Ptr'.
3927 *----------------------------------------------------------------------------*/
3928 
static void normalizeFloat128Subnormal(uint64_t aSig0, uint64_t aSig1,
                                       int32_t *zExpPtr,
                                       uint64_t *zSig0Ptr,
                                       uint64_t *zSig1Ptr)
{
    int8_t normShift;

    if (aSig0 != 0) {
        /* Leading bit is in the upper word: one 128-bit left shift. */
        normShift = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, normShift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - normShift;
        return;
    }

    /* Upper word is empty: the lower word provides all the bits. */
    normShift = clz64(aSig1) - 15;
    if (normShift >= 0) {
        /* Everything fits in the upper result word. */
        *zSig0Ptr = aSig1 << normShift;
        *zSig1Ptr = 0;
    } else {
        /* Split aSig1 across both result words. */
        *zSig0Ptr = aSig1 >> (-normShift);
        *zSig1Ptr = aSig1 << (normShift & 63);
    }
    *zExpPtr = -normShift - 63;
}
3959 
3960 /*----------------------------------------------------------------------------
3961 | Packs the sign `zSign', the exponent `zExp', and the significand formed
3962 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
3963 | floating-point value, returning the result.  After being shifted into the
3964 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
3965 | added together to form the most significant 32 bits of the result.  This
3966 | means that any integer portion of `zSig0' will be added into the exponent.
3967 | Since a properly normalized significand will have an integer portion equal
3968 | to 1, the `zExp' input should be 1 less than the desired result exponent
3969 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
3970 | significand.
3971 *----------------------------------------------------------------------------*/
3972 
3973 static inline float128
3974  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
3975 {
3976     float128 z;
3977 
3978     z.low = zSig1;
3979     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
3980     return z;
3981 
3982 }
3983 
3984 /*----------------------------------------------------------------------------
3985 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3986 | and extended significand formed by the concatenation of `zSig0', `zSig1',
3987 | and `zSig2', and returns the proper quadruple-precision floating-point value
3988 | corresponding to the abstract input.  Ordinarily, the abstract value is
3989 | simply rounded and packed into the quadruple-precision format, with the
3990 | inexact exception raised if the abstract input cannot be represented
3991 | exactly.  However, if the abstract value is too large, the overflow and
3992 | inexact exceptions are raised and an infinity or maximal finite value is
3993 | returned.  If the abstract value is too small, the input value is rounded to
3994 | a subnormal number, and the underflow and inexact exceptions are raised if
3995 | the abstract input cannot be represented exactly as a subnormal quadruple-
3996 | precision floating-point number.
3997 |     The input significand must be normalized or smaller.  If the input
3998 | significand is not normalized, `zExp' must be 0; in that case, the result
3999 | returned is a subnormal number, and it must not require rounding.  In the
4000 | usual case that the input significand is normalized, `zExp' must be 1 less
4001 | than the ``true'' floating-point exponent.  The handling of underflow and
4002 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4003 *----------------------------------------------------------------------------*/
4004 
4005 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
4006                                      uint64_t zSig0, uint64_t zSig1,
4007                                      uint64_t zSig2, float_status *status)
4008 {
4009     int8_t roundingMode;
4010     flag roundNearestEven, increment, isTiny;
4011 
4012     roundingMode = status->float_rounding_mode;
4013     roundNearestEven = ( roundingMode == float_round_nearest_even );
4014     switch (roundingMode) {
4015     case float_round_nearest_even:
4016     case float_round_ties_away:
4017         increment = ((int64_t)zSig2 < 0);
4018         break;
4019     case float_round_to_zero:
4020         increment = 0;
4021         break;
4022     case float_round_up:
4023         increment = !zSign && zSig2;
4024         break;
4025     case float_round_down:
4026         increment = zSign && zSig2;
4027         break;
4028     case float_round_to_odd:
4029         increment = !(zSig1 & 0x1) && zSig2;
4030         break;
4031     default:
4032         abort();
4033     }
4034     if ( 0x7FFD <= (uint32_t) zExp ) {
4035         if (    ( 0x7FFD < zExp )
4036              || (    ( zExp == 0x7FFD )
4037                   && eq128(
4038                          LIT64( 0x0001FFFFFFFFFFFF ),
4039                          LIT64( 0xFFFFFFFFFFFFFFFF ),
4040                          zSig0,
4041                          zSig1
4042                      )
4043                   && increment
4044                 )
4045            ) {
4046             float_raise(float_flag_overflow | float_flag_inexact, status);
4047             if (    ( roundingMode == float_round_to_zero )
4048                  || ( zSign && ( roundingMode == float_round_up ) )
4049                  || ( ! zSign && ( roundingMode == float_round_down ) )
4050                  || (roundingMode == float_round_to_odd)
4051                ) {
4052                 return
4053                     packFloat128(
4054                         zSign,
4055                         0x7FFE,
4056                         LIT64( 0x0000FFFFFFFFFFFF ),
4057                         LIT64( 0xFFFFFFFFFFFFFFFF )
4058                     );
4059             }
4060             return packFloat128( zSign, 0x7FFF, 0, 0 );
4061         }
4062         if ( zExp < 0 ) {
4063             if (status->flush_to_zero) {
4064                 float_raise(float_flag_output_denormal, status);
4065                 return packFloat128(zSign, 0, 0, 0);
4066             }
4067             isTiny =
4068                    (status->float_detect_tininess
4069                     == float_tininess_before_rounding)
4070                 || ( zExp < -1 )
4071                 || ! increment
4072                 || lt128(
4073                        zSig0,
4074                        zSig1,
4075                        LIT64( 0x0001FFFFFFFFFFFF ),
4076                        LIT64( 0xFFFFFFFFFFFFFFFF )
4077                    );
4078             shift128ExtraRightJamming(
4079                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4080             zExp = 0;
4081             if (isTiny && zSig2) {
4082                 float_raise(float_flag_underflow, status);
4083             }
4084             switch (roundingMode) {
4085             case float_round_nearest_even:
4086             case float_round_ties_away:
4087                 increment = ((int64_t)zSig2 < 0);
4088                 break;
4089             case float_round_to_zero:
4090                 increment = 0;
4091                 break;
4092             case float_round_up:
4093                 increment = !zSign && zSig2;
4094                 break;
4095             case float_round_down:
4096                 increment = zSign && zSig2;
4097                 break;
4098             case float_round_to_odd:
4099                 increment = !(zSig1 & 0x1) && zSig2;
4100                 break;
4101             default:
4102                 abort();
4103             }
4104         }
4105     }
4106     if (zSig2) {
4107         status->float_exception_flags |= float_flag_inexact;
4108     }
4109     if ( increment ) {
4110         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4111         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
4112     }
4113     else {
4114         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4115     }
4116     return packFloat128( zSign, zExp, zSig0, zSig1 );
4117 
4118 }
4119 
4120 /*----------------------------------------------------------------------------
4121 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4122 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4123 | returns the proper quadruple-precision floating-point value corresponding
4124 | to the abstract input.  This routine is just like `roundAndPackFloat128'
4125 | except that the input significand has fewer bits and does not have to be
4126 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
4127 | point exponent.
4128 *----------------------------------------------------------------------------*/
4129 
4130 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
4131                                               uint64_t zSig0, uint64_t zSig1,
4132                                               float_status *status)
4133 {
4134     int8_t shiftCount;
4135     uint64_t zSig2;
4136 
4137     if ( zSig0 == 0 ) {
4138         zSig0 = zSig1;
4139         zSig1 = 0;
4140         zExp -= 64;
4141     }
4142     shiftCount = clz64(zSig0) - 15;
4143     if ( 0 <= shiftCount ) {
4144         zSig2 = 0;
4145         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4146     }
4147     else {
4148         shift128ExtraRightJamming(
4149             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4150     }
4151     zExp -= shiftCount;
4152     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4153 
4154 }
4155 
4156 
4157 /*----------------------------------------------------------------------------
4158 | Returns the result of converting the 32-bit two's complement integer `a'
4159 | to the extended double-precision floating-point format.  The conversion
4160 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4161 | Arithmetic.
4162 *----------------------------------------------------------------------------*/
4163 
4164 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4165 {
4166     flag zSign;
4167     uint32_t absA;
4168     int8_t shiftCount;
4169     uint64_t zSig;
4170 
4171     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4172     zSign = ( a < 0 );
4173     absA = zSign ? - a : a;
4174     shiftCount = clz32(absA) + 32;
4175     zSig = absA;
4176     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4177 
4178 }
4179 
4180 /*----------------------------------------------------------------------------
4181 | Returns the result of converting the 32-bit two's complement integer `a' to
4182 | the quadruple-precision floating-point format.  The conversion is performed
4183 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4184 *----------------------------------------------------------------------------*/
4185 
4186 float128 int32_to_float128(int32_t a, float_status *status)
4187 {
4188     flag zSign;
4189     uint32_t absA;
4190     int8_t shiftCount;
4191     uint64_t zSig0;
4192 
4193     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4194     zSign = ( a < 0 );
4195     absA = zSign ? - a : a;
4196     shiftCount = clz32(absA) + 17;
4197     zSig0 = absA;
4198     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4199 
4200 }
4201 
4202 /*----------------------------------------------------------------------------
4203 | Returns the result of converting the 64-bit two's complement integer `a'
4204 | to the extended double-precision floating-point format.  The conversion
4205 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4206 | Arithmetic.
4207 *----------------------------------------------------------------------------*/
4208 
4209 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4210 {
4211     flag zSign;
4212     uint64_t absA;
4213     int8_t shiftCount;
4214 
4215     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4216     zSign = ( a < 0 );
4217     absA = zSign ? - a : a;
4218     shiftCount = clz64(absA);
4219     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4220 
4221 }
4222 
4223 /*----------------------------------------------------------------------------
4224 | Returns the result of converting the 64-bit two's complement integer `a' to
4225 | the quadruple-precision floating-point format.  The conversion is performed
4226 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4227 *----------------------------------------------------------------------------*/
4228 
4229 float128 int64_to_float128(int64_t a, float_status *status)
4230 {
4231     flag zSign;
4232     uint64_t absA;
4233     int8_t shiftCount;
4234     int32_t zExp;
4235     uint64_t zSig0, zSig1;
4236 
4237     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4238     zSign = ( a < 0 );
4239     absA = zSign ? - a : a;
4240     shiftCount = clz64(absA) + 49;
4241     zExp = 0x406E - shiftCount;
4242     if ( 64 <= shiftCount ) {
4243         zSig1 = 0;
4244         zSig0 = absA;
4245         shiftCount -= 64;
4246     }
4247     else {
4248         zSig1 = absA;
4249         zSig0 = 0;
4250     }
4251     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4252     return packFloat128( zSign, zExp, zSig0, zSig1 );
4253 
4254 }
4255 
4256 /*----------------------------------------------------------------------------
4257 | Returns the result of converting the 64-bit unsigned integer `a'
4258 | to the quadruple-precision floating-point format.  The conversion is performed
4259 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4260 *----------------------------------------------------------------------------*/
4261 
4262 float128 uint64_to_float128(uint64_t a, float_status *status)
4263 {
4264     if (a == 0) {
4265         return float128_zero;
4266     }
4267     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4268 }
4269 
4270 /*----------------------------------------------------------------------------
4271 | Returns the result of converting the single-precision floating-point value
4272 | `a' to the extended double-precision floating-point format.  The conversion
4273 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4274 | Arithmetic.
4275 *----------------------------------------------------------------------------*/
4276 
4277 floatx80 float32_to_floatx80(float32 a, float_status *status)
4278 {
4279     flag aSign;
4280     int aExp;
4281     uint32_t aSig;
4282 
4283     a = float32_squash_input_denormal(a, status);
4284     aSig = extractFloat32Frac( a );
4285     aExp = extractFloat32Exp( a );
4286     aSign = extractFloat32Sign( a );
4287     if ( aExp == 0xFF ) {
4288         if (aSig) {
4289             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
4290         }
4291         return packFloatx80(aSign,
4292                             floatx80_infinity_high,
4293                             floatx80_infinity_low);
4294     }
4295     if ( aExp == 0 ) {
4296         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4297         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4298     }
4299     aSig |= 0x00800000;
4300     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4301 
4302 }
4303 
4304 /*----------------------------------------------------------------------------
4305 | Returns the result of converting the single-precision floating-point value
4306 | `a' to the double-precision floating-point format.  The conversion is
4307 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4308 | Arithmetic.
4309 *----------------------------------------------------------------------------*/
4310 
4311 float128 float32_to_float128(float32 a, float_status *status)
4312 {
4313     flag aSign;
4314     int aExp;
4315     uint32_t aSig;
4316 
4317     a = float32_squash_input_denormal(a, status);
4318     aSig = extractFloat32Frac( a );
4319     aExp = extractFloat32Exp( a );
4320     aSign = extractFloat32Sign( a );
4321     if ( aExp == 0xFF ) {
4322         if (aSig) {
4323             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4324         }
4325         return packFloat128( aSign, 0x7FFF, 0, 0 );
4326     }
4327     if ( aExp == 0 ) {
4328         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4329         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4330         --aExp;
4331     }
4332     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4333 
4334 }
4335 
4336 /*----------------------------------------------------------------------------
4337 | Returns the remainder of the single-precision floating-point value `a'
4338 | with respect to the corresponding value `b'.  The operation is performed
4339 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4340 *----------------------------------------------------------------------------*/
4341 
4342 float32 float32_rem(float32 a, float32 b, float_status *status)
4343 {
4344     flag aSign, zSign;
4345     int aExp, bExp, expDiff;
4346     uint32_t aSig, bSig;
4347     uint32_t q;
4348     uint64_t aSig64, bSig64, q64;
4349     uint32_t alternateASig;
4350     int32_t sigMean;
4351     a = float32_squash_input_denormal(a, status);
4352     b = float32_squash_input_denormal(b, status);
4353 
4354     aSig = extractFloat32Frac( a );
4355     aExp = extractFloat32Exp( a );
4356     aSign = extractFloat32Sign( a );
4357     bSig = extractFloat32Frac( b );
4358     bExp = extractFloat32Exp( b );
4359     if ( aExp == 0xFF ) {
4360         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
4361             return propagateFloat32NaN(a, b, status);
4362         }
4363         float_raise(float_flag_invalid, status);
4364         return float32_default_nan(status);
4365     }
4366     if ( bExp == 0xFF ) {
4367         if (bSig) {
4368             return propagateFloat32NaN(a, b, status);
4369         }
4370         return a;
4371     }
4372     if ( bExp == 0 ) {
4373         if ( bSig == 0 ) {
4374             float_raise(float_flag_invalid, status);
4375             return float32_default_nan(status);
4376         }
4377         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
4378     }
4379     if ( aExp == 0 ) {
4380         if ( aSig == 0 ) return a;
4381         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4382     }
4383     expDiff = aExp - bExp;
4384     aSig |= 0x00800000;
4385     bSig |= 0x00800000;
4386     if ( expDiff < 32 ) {
4387         aSig <<= 8;
4388         bSig <<= 8;
4389         if ( expDiff < 0 ) {
4390             if ( expDiff < -1 ) return a;
4391             aSig >>= 1;
4392         }
4393         q = ( bSig <= aSig );
4394         if ( q ) aSig -= bSig;
4395         if ( 0 < expDiff ) {
4396             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
4397             q >>= 32 - expDiff;
4398             bSig >>= 2;
4399             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4400         }
4401         else {
4402             aSig >>= 2;
4403             bSig >>= 2;
4404         }
4405     }
4406     else {
4407         if ( bSig <= aSig ) aSig -= bSig;
4408         aSig64 = ( (uint64_t) aSig )<<40;
4409         bSig64 = ( (uint64_t) bSig )<<40;
4410         expDiff -= 64;
4411         while ( 0 < expDiff ) {
4412             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4413             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4414             aSig64 = - ( ( bSig * q64 )<<38 );
4415             expDiff -= 62;
4416         }
4417         expDiff += 64;
4418         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4419         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4420         q = q64>>( 64 - expDiff );
4421         bSig <<= 6;
4422         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
4423     }
4424     do {
4425         alternateASig = aSig;
4426         ++q;
4427         aSig -= bSig;
4428     } while ( 0 <= (int32_t) aSig );
4429     sigMean = aSig + alternateASig;
4430     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4431         aSig = alternateASig;
4432     }
4433     zSign = ( (int32_t) aSig < 0 );
4434     if ( zSign ) aSig = - aSig;
4435     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
4436 }
4437 
4438 
4439 
4440 /*----------------------------------------------------------------------------
4441 | Returns the binary exponential of the single-precision floating-point value
4442 | `a'. The operation is performed according to the IEC/IEEE Standard for
4443 | Binary Floating-Point Arithmetic.
4444 |
4445 | Uses the following identities:
4446 |
4447 | 1. -------------------------------------------------------------------------
4448 |      x    x*ln(2)
4449 |     2  = e
4450 |
4451 | 2. -------------------------------------------------------------------------
4452 |                      2     3     4     5           n
4453 |      x        x     x     x     x     x           x
4454 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
4455 |               1!    2!    3!    4!    5!          n!
4456 *----------------------------------------------------------------------------*/
4457 
4458 static const float64 float32_exp2_coefficients[15] =
4459 {
4460     const_float64( 0x3ff0000000000000ll ), /*  1 */
4461     const_float64( 0x3fe0000000000000ll ), /*  2 */
4462     const_float64( 0x3fc5555555555555ll ), /*  3 */
4463     const_float64( 0x3fa5555555555555ll ), /*  4 */
4464     const_float64( 0x3f81111111111111ll ), /*  5 */
4465     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
4466     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
4467     const_float64( 0x3efa01a01a01a01all ), /*  8 */
4468     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
4469     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
4470     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
4471     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
4472     const_float64( 0x3de6124613a86d09ll ), /* 13 */
4473     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
4474     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
4475 };
4476 
4477 float32 float32_exp2(float32 a, float_status *status)
4478 {
4479     flag aSign;
4480     int aExp;
4481     uint32_t aSig;
4482     float64 r, x, xn;
4483     int i;
4484     a = float32_squash_input_denormal(a, status);
4485 
4486     aSig = extractFloat32Frac( a );
4487     aExp = extractFloat32Exp( a );
4488     aSign = extractFloat32Sign( a );
4489 
4490     if ( aExp == 0xFF) {
4491         if (aSig) {
4492             return propagateFloat32NaN(a, float32_zero, status);
4493         }
4494         return (aSign) ? float32_zero : a;
4495     }
4496     if (aExp == 0) {
4497         if (aSig == 0) return float32_one;
4498     }
4499 
4500     float_raise(float_flag_inexact, status);
4501 
4502     /* ******************************* */
4503     /* using float64 for approximation */
4504     /* ******************************* */
4505     x = float32_to_float64(a, status);
4506     x = float64_mul(x, float64_ln2, status);
4507 
4508     xn = x;
4509     r = float64_one;
4510     for (i = 0 ; i < 15 ; i++) {
4511         float64 f;
4512 
4513         f = float64_mul(xn, float32_exp2_coefficients[i], status);
4514         r = float64_add(r, f, status);
4515 
4516         xn = float64_mul(xn, x, status);
4517     }
4518 
4519     return float64_to_float32(r, status);
4520 }
4521 
4522 /*----------------------------------------------------------------------------
4523 | Returns the binary log of the single-precision floating-point value `a'.
4524 | The operation is performed according to the IEC/IEEE Standard for Binary
4525 | Floating-Point Arithmetic.
4526 *----------------------------------------------------------------------------*/
4527 float32 float32_log2(float32 a, float_status *status)
4528 {
4529     flag aSign, zSign;
4530     int aExp;
4531     uint32_t aSig, zSig, i;
4532 
4533     a = float32_squash_input_denormal(a, status);
4534     aSig = extractFloat32Frac( a );
4535     aExp = extractFloat32Exp( a );
4536     aSign = extractFloat32Sign( a );
4537 
4538     if ( aExp == 0 ) {
4539         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
4540         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4541     }
4542     if ( aSign ) {
4543         float_raise(float_flag_invalid, status);
4544         return float32_default_nan(status);
4545     }
4546     if ( aExp == 0xFF ) {
4547         if (aSig) {
4548             return propagateFloat32NaN(a, float32_zero, status);
4549         }
4550         return a;
4551     }
4552 
4553     aExp -= 0x7F;
4554     aSig |= 0x00800000;
4555     zSign = aExp < 0;
4556     zSig = aExp << 23;
4557 
4558     for (i = 1 << 22; i > 0; i >>= 1) {
4559         aSig = ( (uint64_t)aSig * aSig ) >> 23;
4560         if ( aSig & 0x01000000 ) {
4561             aSig >>= 1;
4562             zSig |= i;
4563         }
4564     }
4565 
4566     if ( zSign )
4567         zSig = -zSig;
4568 
4569     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
4570 }
4571 
4572 /*----------------------------------------------------------------------------
4573 | Returns 1 if the single-precision floating-point value `a' is equal to
4574 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4575 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4576 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4577 *----------------------------------------------------------------------------*/
4578 
4579 int float32_eq(float32 a, float32 b, float_status *status)
4580 {
4581     uint32_t av, bv;
4582     a = float32_squash_input_denormal(a, status);
4583     b = float32_squash_input_denormal(b, status);
4584 
4585     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4586          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4587        ) {
4588         float_raise(float_flag_invalid, status);
4589         return 0;
4590     }
4591     av = float32_val(a);
4592     bv = float32_val(b);
4593     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4594 }
4595 
4596 /*----------------------------------------------------------------------------
4597 | Returns 1 if the single-precision floating-point value `a' is less than
4598 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
4599 | exception is raised if either operand is a NaN.  The comparison is performed
4600 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4601 *----------------------------------------------------------------------------*/
4602 
4603 int float32_le(float32 a, float32 b, float_status *status)
4604 {
4605     flag aSign, bSign;
4606     uint32_t av, bv;
4607     a = float32_squash_input_denormal(a, status);
4608     b = float32_squash_input_denormal(b, status);
4609 
4610     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4611          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4612        ) {
4613         float_raise(float_flag_invalid, status);
4614         return 0;
4615     }
4616     aSign = extractFloat32Sign( a );
4617     bSign = extractFloat32Sign( b );
4618     av = float32_val(a);
4619     bv = float32_val(b);
4620     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4621     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4622 
4623 }
4624 
4625 /*----------------------------------------------------------------------------
4626 | Returns 1 if the single-precision floating-point value `a' is less than
4627 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4628 | raised if either operand is a NaN.  The comparison is performed according
4629 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4630 *----------------------------------------------------------------------------*/
4631 
4632 int float32_lt(float32 a, float32 b, float_status *status)
4633 {
4634     flag aSign, bSign;
4635     uint32_t av, bv;
4636     a = float32_squash_input_denormal(a, status);
4637     b = float32_squash_input_denormal(b, status);
4638 
4639     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4640          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4641        ) {
4642         float_raise(float_flag_invalid, status);
4643         return 0;
4644     }
4645     aSign = extractFloat32Sign( a );
4646     bSign = extractFloat32Sign( b );
4647     av = float32_val(a);
4648     bv = float32_val(b);
4649     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4650     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4651 
4652 }
4653 
4654 /*----------------------------------------------------------------------------
4655 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4656 | be compared, and 0 otherwise.  The invalid exception is raised if either
4657 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4658 | Standard for Binary Floating-Point Arithmetic.
4659 *----------------------------------------------------------------------------*/
4660 
4661 int float32_unordered(float32 a, float32 b, float_status *status)
4662 {
4663     a = float32_squash_input_denormal(a, status);
4664     b = float32_squash_input_denormal(b, status);
4665 
4666     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4667          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4668        ) {
4669         float_raise(float_flag_invalid, status);
4670         return 1;
4671     }
4672     return 0;
4673 }
4674 
4675 /*----------------------------------------------------------------------------
4676 | Returns 1 if the single-precision floating-point value `a' is equal to
4677 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4678 | exception.  The comparison is performed according to the IEC/IEEE Standard
4679 | for Binary Floating-Point Arithmetic.
4680 *----------------------------------------------------------------------------*/
4681 
4682 int float32_eq_quiet(float32 a, float32 b, float_status *status)
4683 {
4684     a = float32_squash_input_denormal(a, status);
4685     b = float32_squash_input_denormal(b, status);
4686 
4687     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4688          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4689        ) {
4690         if (float32_is_signaling_nan(a, status)
4691          || float32_is_signaling_nan(b, status)) {
4692             float_raise(float_flag_invalid, status);
4693         }
4694         return 0;
4695     }
4696     return ( float32_val(a) == float32_val(b) ) ||
4697             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
4698 }
4699 
4700 /*----------------------------------------------------------------------------
4701 | Returns 1 if the single-precision floating-point value `a' is less than or
4702 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4703 | cause an exception.  Otherwise, the comparison is performed according to the
4704 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4705 *----------------------------------------------------------------------------*/
4706 
4707 int float32_le_quiet(float32 a, float32 b, float_status *status)
4708 {
4709     flag aSign, bSign;
4710     uint32_t av, bv;
4711     a = float32_squash_input_denormal(a, status);
4712     b = float32_squash_input_denormal(b, status);
4713 
4714     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4715          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4716        ) {
4717         if (float32_is_signaling_nan(a, status)
4718          || float32_is_signaling_nan(b, status)) {
4719             float_raise(float_flag_invalid, status);
4720         }
4721         return 0;
4722     }
4723     aSign = extractFloat32Sign( a );
4724     bSign = extractFloat32Sign( b );
4725     av = float32_val(a);
4726     bv = float32_val(b);
4727     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4728     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4729 
4730 }
4731 
4732 /*----------------------------------------------------------------------------
4733 | Returns 1 if the single-precision floating-point value `a' is less than
4734 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4735 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4736 | Standard for Binary Floating-Point Arithmetic.
4737 *----------------------------------------------------------------------------*/
4738 
4739 int float32_lt_quiet(float32 a, float32 b, float_status *status)
4740 {
4741     flag aSign, bSign;
4742     uint32_t av, bv;
4743     a = float32_squash_input_denormal(a, status);
4744     b = float32_squash_input_denormal(b, status);
4745 
4746     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4747          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4748        ) {
4749         if (float32_is_signaling_nan(a, status)
4750          || float32_is_signaling_nan(b, status)) {
4751             float_raise(float_flag_invalid, status);
4752         }
4753         return 0;
4754     }
4755     aSign = extractFloat32Sign( a );
4756     bSign = extractFloat32Sign( b );
4757     av = float32_val(a);
4758     bv = float32_val(b);
4759     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4760     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4761 
4762 }
4763 
4764 /*----------------------------------------------------------------------------
4765 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4766 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4767 | comparison is performed according to the IEC/IEEE Standard for Binary
4768 | Floating-Point Arithmetic.
4769 *----------------------------------------------------------------------------*/
4770 
4771 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
4772 {
4773     a = float32_squash_input_denormal(a, status);
4774     b = float32_squash_input_denormal(b, status);
4775 
4776     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4777          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4778        ) {
4779         if (float32_is_signaling_nan(a, status)
4780          || float32_is_signaling_nan(b, status)) {
4781             float_raise(float_flag_invalid, status);
4782         }
4783         return 1;
4784     }
4785     return 0;
4786 }
4787 
4788 /*----------------------------------------------------------------------------
4789 | If `a' is denormal and we are in flush-to-zero mode then set the
4790 | input-denormal exception and return zero. Otherwise just return the value.
4791 *----------------------------------------------------------------------------*/
4792 float16 float16_squash_input_denormal(float16 a, float_status *status)
4793 {
4794     if (status->flush_inputs_to_zero) {
4795         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
4796             float_raise(float_flag_input_denormal, status);
4797             return make_float16(float16_val(a) & 0x8000);
4798         }
4799     }
4800     return a;
4801 }
4802 
4803 /*----------------------------------------------------------------------------
4804 | Returns the result of converting the double-precision floating-point value
4805 | `a' to the extended double-precision floating-point format.  The conversion
4806 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4807 | Arithmetic.
4808 *----------------------------------------------------------------------------*/
4809 
4810 floatx80 float64_to_floatx80(float64 a, float_status *status)
4811 {
4812     flag aSign;
4813     int aExp;
4814     uint64_t aSig;
4815 
4816     a = float64_squash_input_denormal(a, status);
4817     aSig = extractFloat64Frac( a );
4818     aExp = extractFloat64Exp( a );
4819     aSign = extractFloat64Sign( a );
4820     if ( aExp == 0x7FF ) {
4821         if (aSig) {
4822             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4823         }
4824         return packFloatx80(aSign,
4825                             floatx80_infinity_high,
4826                             floatx80_infinity_low);
4827     }
4828     if ( aExp == 0 ) {
4829         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4830         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4831     }
4832     return
4833         packFloatx80(
4834             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4835 
4836 }
4837 
4838 /*----------------------------------------------------------------------------
4839 | Returns the result of converting the double-precision floating-point value
4840 | `a' to the quadruple-precision floating-point format.  The conversion is
4841 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4842 | Arithmetic.
4843 *----------------------------------------------------------------------------*/
4844 
4845 float128 float64_to_float128(float64 a, float_status *status)
4846 {
4847     flag aSign;
4848     int aExp;
4849     uint64_t aSig, zSig0, zSig1;
4850 
4851     a = float64_squash_input_denormal(a, status);
4852     aSig = extractFloat64Frac( a );
4853     aExp = extractFloat64Exp( a );
4854     aSign = extractFloat64Sign( a );
4855     if ( aExp == 0x7FF ) {
4856         if (aSig) {
4857             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4858         }
4859         return packFloat128( aSign, 0x7FFF, 0, 0 );
4860     }
4861     if ( aExp == 0 ) {
4862         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4863         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4864         --aExp;
4865     }
4866     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4867     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4868 
4869 }
4870 
4871 
4872 /*----------------------------------------------------------------------------
4873 | Returns the remainder of the double-precision floating-point value `a'
4874 | with respect to the corresponding value `b'.  The operation is performed
4875 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4876 *----------------------------------------------------------------------------*/
4877 
4878 float64 float64_rem(float64 a, float64 b, float_status *status)
4879 {
4880     flag aSign, zSign;
4881     int aExp, bExp, expDiff;
4882     uint64_t aSig, bSig;
4883     uint64_t q, alternateASig;
4884     int64_t sigMean;
4885 
4886     a = float64_squash_input_denormal(a, status);
4887     b = float64_squash_input_denormal(b, status);
4888     aSig = extractFloat64Frac( a );
4889     aExp = extractFloat64Exp( a );
4890     aSign = extractFloat64Sign( a );
4891     bSig = extractFloat64Frac( b );
4892     bExp = extractFloat64Exp( b );
4893     if ( aExp == 0x7FF ) {
4894         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4895             return propagateFloat64NaN(a, b, status);
4896         }
4897         float_raise(float_flag_invalid, status);
4898         return float64_default_nan(status);
4899     }
4900     if ( bExp == 0x7FF ) {
4901         if (bSig) {
4902             return propagateFloat64NaN(a, b, status);
4903         }
4904         return a;
4905     }
4906     if ( bExp == 0 ) {
4907         if ( bSig == 0 ) {
4908             float_raise(float_flag_invalid, status);
4909             return float64_default_nan(status);
4910         }
4911         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4912     }
4913     if ( aExp == 0 ) {
4914         if ( aSig == 0 ) return a;
4915         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4916     }
4917     expDiff = aExp - bExp;
4918     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4919     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4920     if ( expDiff < 0 ) {
4921         if ( expDiff < -1 ) return a;
4922         aSig >>= 1;
4923     }
4924     q = ( bSig <= aSig );
4925     if ( q ) aSig -= bSig;
4926     expDiff -= 64;
4927     while ( 0 < expDiff ) {
4928         q = estimateDiv128To64( aSig, 0, bSig );
4929         q = ( 2 < q ) ? q - 2 : 0;
4930         aSig = - ( ( bSig>>2 ) * q );
4931         expDiff -= 62;
4932     }
4933     expDiff += 64;
4934     if ( 0 < expDiff ) {
4935         q = estimateDiv128To64( aSig, 0, bSig );
4936         q = ( 2 < q ) ? q - 2 : 0;
4937         q >>= 64 - expDiff;
4938         bSig >>= 2;
4939         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4940     }
4941     else {
4942         aSig >>= 2;
4943         bSig >>= 2;
4944     }
4945     do {
4946         alternateASig = aSig;
4947         ++q;
4948         aSig -= bSig;
4949     } while ( 0 <= (int64_t) aSig );
4950     sigMean = aSig + alternateASig;
4951     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4952         aSig = alternateASig;
4953     }
4954     zSign = ( (int64_t) aSig < 0 );
4955     if ( zSign ) aSig = - aSig;
4956     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4957 
4958 }
4959 
4960 /*----------------------------------------------------------------------------
4961 | Returns the binary log of the double-precision floating-point value `a'.
4962 | The operation is performed according to the IEC/IEEE Standard for Binary
4963 | Floating-Point Arithmetic.
4964 *----------------------------------------------------------------------------*/
4965 float64 float64_log2(float64 a, float_status *status)
4966 {
4967     flag aSign, zSign;
4968     int aExp;
4969     uint64_t aSig, aSig0, aSig1, zSig, i;
4970     a = float64_squash_input_denormal(a, status);
4971 
4972     aSig = extractFloat64Frac( a );
4973     aExp = extractFloat64Exp( a );
4974     aSign = extractFloat64Sign( a );
4975 
4976     if ( aExp == 0 ) {
4977         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4978         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4979     }
4980     if ( aSign ) {
4981         float_raise(float_flag_invalid, status);
4982         return float64_default_nan(status);
4983     }
4984     if ( aExp == 0x7FF ) {
4985         if (aSig) {
4986             return propagateFloat64NaN(a, float64_zero, status);
4987         }
4988         return a;
4989     }
4990 
4991     aExp -= 0x3FF;
4992     aSig |= LIT64( 0x0010000000000000 );
4993     zSign = aExp < 0;
4994     zSig = (uint64_t)aExp << 52;
4995     for (i = 1LL << 51; i > 0; i >>= 1) {
4996         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4997         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4998         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4999             aSig >>= 1;
5000             zSig |= i;
5001         }
5002     }
5003 
5004     if ( zSign )
5005         zSig = -zSig;
5006     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5007 }
5008 
5009 /*----------------------------------------------------------------------------
5010 | Returns 1 if the double-precision floating-point value `a' is equal to the
5011 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
5012 | if either operand is a NaN.  Otherwise, the comparison is performed
5013 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5014 *----------------------------------------------------------------------------*/
5015 
5016 int float64_eq(float64 a, float64 b, float_status *status)
5017 {
5018     uint64_t av, bv;
5019     a = float64_squash_input_denormal(a, status);
5020     b = float64_squash_input_denormal(b, status);
5021 
5022     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5023          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5024        ) {
5025         float_raise(float_flag_invalid, status);
5026         return 0;
5027     }
5028     av = float64_val(a);
5029     bv = float64_val(b);
5030     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5031 
5032 }
5033 
5034 /*----------------------------------------------------------------------------
5035 | Returns 1 if the double-precision floating-point value `a' is less than or
5036 | equal to the corresponding value `b', and 0 otherwise.  The invalid
5037 | exception is raised if either operand is a NaN.  The comparison is performed
5038 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5039 *----------------------------------------------------------------------------*/
5040 
5041 int float64_le(float64 a, float64 b, float_status *status)
5042 {
5043     flag aSign, bSign;
5044     uint64_t av, bv;
5045     a = float64_squash_input_denormal(a, status);
5046     b = float64_squash_input_denormal(b, status);
5047 
5048     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5049          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5050        ) {
5051         float_raise(float_flag_invalid, status);
5052         return 0;
5053     }
5054     aSign = extractFloat64Sign( a );
5055     bSign = extractFloat64Sign( b );
5056     av = float64_val(a);
5057     bv = float64_val(b);
5058     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5059     return ( av == bv ) || ( aSign ^ ( av < bv ) );
5060 
5061 }
5062 
5063 /*----------------------------------------------------------------------------
5064 | Returns 1 if the double-precision floating-point value `a' is less than
5065 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5066 | raised if either operand is a NaN.  The comparison is performed according
5067 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5068 *----------------------------------------------------------------------------*/
5069 
5070 int float64_lt(float64 a, float64 b, float_status *status)
5071 {
5072     flag aSign, bSign;
5073     uint64_t av, bv;
5074 
5075     a = float64_squash_input_denormal(a, status);
5076     b = float64_squash_input_denormal(b, status);
5077     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5078          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5079        ) {
5080         float_raise(float_flag_invalid, status);
5081         return 0;
5082     }
5083     aSign = extractFloat64Sign( a );
5084     bSign = extractFloat64Sign( b );
5085     av = float64_val(a);
5086     bv = float64_val(b);
5087     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5088     return ( av != bv ) && ( aSign ^ ( av < bv ) );
5089 
5090 }
5091 
5092 /*----------------------------------------------------------------------------
5093 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5094 | be compared, and 0 otherwise.  The invalid exception is raised if either
5095 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
5096 | Standard for Binary Floating-Point Arithmetic.
5097 *----------------------------------------------------------------------------*/
5098 
5099 int float64_unordered(float64 a, float64 b, float_status *status)
5100 {
5101     a = float64_squash_input_denormal(a, status);
5102     b = float64_squash_input_denormal(b, status);
5103 
5104     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5105          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5106        ) {
5107         float_raise(float_flag_invalid, status);
5108         return 1;
5109     }
5110     return 0;
5111 }
5112 
5113 /*----------------------------------------------------------------------------
5114 | Returns 1 if the double-precision floating-point value `a' is equal to the
5115 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5116 | exception.The comparison is performed according to the IEC/IEEE Standard
5117 | for Binary Floating-Point Arithmetic.
5118 *----------------------------------------------------------------------------*/
5119 
5120 int float64_eq_quiet(float64 a, float64 b, float_status *status)
5121 {
5122     uint64_t av, bv;
5123     a = float64_squash_input_denormal(a, status);
5124     b = float64_squash_input_denormal(b, status);
5125 
5126     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5127          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5128        ) {
5129         if (float64_is_signaling_nan(a, status)
5130          || float64_is_signaling_nan(b, status)) {
5131             float_raise(float_flag_invalid, status);
5132         }
5133         return 0;
5134     }
5135     av = float64_val(a);
5136     bv = float64_val(b);
5137     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5138 
5139 }
5140 
5141 /*----------------------------------------------------------------------------
5142 | Returns 1 if the double-precision floating-point value `a' is less than or
5143 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5144 | cause an exception.  Otherwise, the comparison is performed according to the
5145 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5146 *----------------------------------------------------------------------------*/
5147 
5148 int float64_le_quiet(float64 a, float64 b, float_status *status)
5149 {
5150     flag aSign, bSign;
5151     uint64_t av, bv;
5152     a = float64_squash_input_denormal(a, status);
5153     b = float64_squash_input_denormal(b, status);
5154 
5155     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5156          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5157        ) {
5158         if (float64_is_signaling_nan(a, status)
5159          || float64_is_signaling_nan(b, status)) {
5160             float_raise(float_flag_invalid, status);
5161         }
5162         return 0;
5163     }
5164     aSign = extractFloat64Sign( a );
5165     bSign = extractFloat64Sign( b );
5166     av = float64_val(a);
5167     bv = float64_val(b);
5168     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5169     return ( av == bv ) || ( aSign ^ ( av < bv ) );
5170 
5171 }
5172 
5173 /*----------------------------------------------------------------------------
5174 | Returns 1 if the double-precision floating-point value `a' is less than
5175 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5176 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5177 | Standard for Binary Floating-Point Arithmetic.
5178 *----------------------------------------------------------------------------*/
5179 
5180 int float64_lt_quiet(float64 a, float64 b, float_status *status)
5181 {
5182     flag aSign, bSign;
5183     uint64_t av, bv;
5184     a = float64_squash_input_denormal(a, status);
5185     b = float64_squash_input_denormal(b, status);
5186 
5187     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5188          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5189        ) {
5190         if (float64_is_signaling_nan(a, status)
5191          || float64_is_signaling_nan(b, status)) {
5192             float_raise(float_flag_invalid, status);
5193         }
5194         return 0;
5195     }
5196     aSign = extractFloat64Sign( a );
5197     bSign = extractFloat64Sign( b );
5198     av = float64_val(a);
5199     bv = float64_val(b);
5200     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5201     return ( av != bv ) && ( aSign ^ ( av < bv ) );
5202 
5203 }
5204 
5205 /*----------------------------------------------------------------------------
5206 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5207 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
5208 | comparison is performed according to the IEC/IEEE Standard for Binary
5209 | Floating-Point Arithmetic.
5210 *----------------------------------------------------------------------------*/
5211 
5212 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
5213 {
5214     a = float64_squash_input_denormal(a, status);
5215     b = float64_squash_input_denormal(b, status);
5216 
5217     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5218          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5219        ) {
5220         if (float64_is_signaling_nan(a, status)
5221          || float64_is_signaling_nan(b, status)) {
5222             float_raise(float_flag_invalid, status);
5223         }
5224         return 1;
5225     }
5226     return 0;
5227 }
5228 
5229 /*----------------------------------------------------------------------------
5230 | Returns the result of converting the extended double-precision floating-
5231 | point value `a' to the 32-bit two's complement integer format.  The
5232 | conversion is performed according to the IEC/IEEE Standard for Binary
5233 | Floating-Point Arithmetic---which means in particular that the conversion
5234 | is rounded according to the current rounding mode.  If `a' is a NaN, the
5235 | largest positive integer is returned.  Otherwise, if the conversion
5236 | overflows, the largest integer with the same sign as `a' is returned.
5237 *----------------------------------------------------------------------------*/
5238 
5239 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5240 {
5241     flag aSign;
5242     int32_t aExp, shiftCount;
5243     uint64_t aSig;
5244 
5245     if (floatx80_invalid_encoding(a)) {
5246         float_raise(float_flag_invalid, status);
5247         return 1 << 31;
5248     }
5249     aSig = extractFloatx80Frac( a );
5250     aExp = extractFloatx80Exp( a );
5251     aSign = extractFloatx80Sign( a );
5252     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5253     shiftCount = 0x4037 - aExp;
5254     if ( shiftCount <= 0 ) shiftCount = 1;
5255     shift64RightJamming( aSig, shiftCount, &aSig );
5256     return roundAndPackInt32(aSign, aSig, status);
5257 
5258 }
5259 
5260 /*----------------------------------------------------------------------------
5261 | Returns the result of converting the extended double-precision floating-
5262 | point value `a' to the 32-bit two's complement integer format.  The
5263 | conversion is performed according to the IEC/IEEE Standard for Binary
5264 | Floating-Point Arithmetic, except that the conversion is always rounded
5265 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5266 | Otherwise, if the conversion overflows, the largest integer with the same
5267 | sign as `a' is returned.
5268 *----------------------------------------------------------------------------*/
5269 
5270 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5271 {
5272     flag aSign;
5273     int32_t aExp, shiftCount;
5274     uint64_t aSig, savedASig;
5275     int32_t z;
5276 
5277     if (floatx80_invalid_encoding(a)) {
5278         float_raise(float_flag_invalid, status);
5279         return 1 << 31;
5280     }
5281     aSig = extractFloatx80Frac( a );
5282     aExp = extractFloatx80Exp( a );
5283     aSign = extractFloatx80Sign( a );
5284     if ( 0x401E < aExp ) {
5285         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5286         goto invalid;
5287     }
5288     else if ( aExp < 0x3FFF ) {
5289         if (aExp || aSig) {
5290             status->float_exception_flags |= float_flag_inexact;
5291         }
5292         return 0;
5293     }
5294     shiftCount = 0x403E - aExp;
5295     savedASig = aSig;
5296     aSig >>= shiftCount;
5297     z = aSig;
5298     if ( aSign ) z = - z;
5299     if ( ( z < 0 ) ^ aSign ) {
5300  invalid:
5301         float_raise(float_flag_invalid, status);
5302         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5303     }
5304     if ( ( aSig<<shiftCount ) != savedASig ) {
5305         status->float_exception_flags |= float_flag_inexact;
5306     }
5307     return z;
5308 
5309 }
5310 
5311 /*----------------------------------------------------------------------------
5312 | Returns the result of converting the extended double-precision floating-
5313 | point value `a' to the 64-bit two's complement integer format.  The
5314 | conversion is performed according to the IEC/IEEE Standard for Binary
5315 | Floating-Point Arithmetic---which means in particular that the conversion
5316 | is rounded according to the current rounding mode.  If `a' is a NaN,
5317 | the largest positive integer is returned.  Otherwise, if the conversion
5318 | overflows, the largest integer with the same sign as `a' is returned.
5319 *----------------------------------------------------------------------------*/
5320 
5321 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5322 {
5323     flag aSign;
5324     int32_t aExp, shiftCount;
5325     uint64_t aSig, aSigExtra;
5326 
5327     if (floatx80_invalid_encoding(a)) {
5328         float_raise(float_flag_invalid, status);
5329         return 1ULL << 63;
5330     }
5331     aSig = extractFloatx80Frac( a );
5332     aExp = extractFloatx80Exp( a );
5333     aSign = extractFloatx80Sign( a );
5334     shiftCount = 0x403E - aExp;
5335     if ( shiftCount <= 0 ) {
5336         if ( shiftCount ) {
5337             float_raise(float_flag_invalid, status);
5338             if (!aSign || floatx80_is_any_nan(a)) {
5339                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5340             }
5341             return (int64_t) LIT64( 0x8000000000000000 );
5342         }
5343         aSigExtra = 0;
5344     }
5345     else {
5346         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5347     }
5348     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5349 
5350 }
5351 
5352 /*----------------------------------------------------------------------------
5353 | Returns the result of converting the extended double-precision floating-
5354 | point value `a' to the 64-bit two's complement integer format.  The
5355 | conversion is performed according to the IEC/IEEE Standard for Binary
5356 | Floating-Point Arithmetic, except that the conversion is always rounded
5357 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5358 | Otherwise, if the conversion overflows, the largest integer with the same
5359 | sign as `a' is returned.
5360 *----------------------------------------------------------------------------*/
5361 
5362 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5363 {
5364     flag aSign;
5365     int32_t aExp, shiftCount;
5366     uint64_t aSig;
5367     int64_t z;
5368 
5369     if (floatx80_invalid_encoding(a)) {
5370         float_raise(float_flag_invalid, status);
5371         return 1ULL << 63;
5372     }
5373     aSig = extractFloatx80Frac( a );
5374     aExp = extractFloatx80Exp( a );
5375     aSign = extractFloatx80Sign( a );
5376     shiftCount = aExp - 0x403E;
5377     if ( 0 <= shiftCount ) {
5378         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
5379         if ( ( a.high != 0xC03E ) || aSig ) {
5380             float_raise(float_flag_invalid, status);
5381             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5382                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5383             }
5384         }
5385         return (int64_t) LIT64( 0x8000000000000000 );
5386     }
5387     else if ( aExp < 0x3FFF ) {
5388         if (aExp | aSig) {
5389             status->float_exception_flags |= float_flag_inexact;
5390         }
5391         return 0;
5392     }
5393     z = aSig>>( - shiftCount );
5394     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5395         status->float_exception_flags |= float_flag_inexact;
5396     }
5397     if ( aSign ) z = - z;
5398     return z;
5399 
5400 }
5401 
5402 /*----------------------------------------------------------------------------
5403 | Returns the result of converting the extended double-precision floating-
5404 | point value `a' to the single-precision floating-point format.  The
5405 | conversion is performed according to the IEC/IEEE Standard for Binary
5406 | Floating-Point Arithmetic.
5407 *----------------------------------------------------------------------------*/
5408 
5409 float32 floatx80_to_float32(floatx80 a, float_status *status)
5410 {
5411     flag aSign;
5412     int32_t aExp;
5413     uint64_t aSig;
5414 
5415     if (floatx80_invalid_encoding(a)) {
5416         float_raise(float_flag_invalid, status);
5417         return float32_default_nan(status);
5418     }
5419     aSig = extractFloatx80Frac( a );
5420     aExp = extractFloatx80Exp( a );
5421     aSign = extractFloatx80Sign( a );
5422     if ( aExp == 0x7FFF ) {
5423         if ( (uint64_t) ( aSig<<1 ) ) {
5424             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5425         }
5426         return packFloat32( aSign, 0xFF, 0 );
5427     }
5428     shift64RightJamming( aSig, 33, &aSig );
5429     if ( aExp || aSig ) aExp -= 0x3F81;
5430     return roundAndPackFloat32(aSign, aExp, aSig, status);
5431 
5432 }
5433 
5434 /*----------------------------------------------------------------------------
5435 | Returns the result of converting the extended double-precision floating-
5436 | point value `a' to the double-precision floating-point format.  The
5437 | conversion is performed according to the IEC/IEEE Standard for Binary
5438 | Floating-Point Arithmetic.
5439 *----------------------------------------------------------------------------*/
5440 
5441 float64 floatx80_to_float64(floatx80 a, float_status *status)
5442 {
5443     flag aSign;
5444     int32_t aExp;
5445     uint64_t aSig, zSig;
5446 
5447     if (floatx80_invalid_encoding(a)) {
5448         float_raise(float_flag_invalid, status);
5449         return float64_default_nan(status);
5450     }
5451     aSig = extractFloatx80Frac( a );
5452     aExp = extractFloatx80Exp( a );
5453     aSign = extractFloatx80Sign( a );
5454     if ( aExp == 0x7FFF ) {
5455         if ( (uint64_t) ( aSig<<1 ) ) {
5456             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5457         }
5458         return packFloat64( aSign, 0x7FF, 0 );
5459     }
5460     shift64RightJamming( aSig, 1, &zSig );
5461     if ( aExp || aSig ) aExp -= 0x3C01;
5462     return roundAndPackFloat64(aSign, aExp, zSig, status);
5463 
5464 }
5465 
5466 /*----------------------------------------------------------------------------
5467 | Returns the result of converting the extended double-precision floating-
5468 | point value `a' to the quadruple-precision floating-point format.  The
5469 | conversion is performed according to the IEC/IEEE Standard for Binary
5470 | Floating-Point Arithmetic.
5471 *----------------------------------------------------------------------------*/
5472 
5473 float128 floatx80_to_float128(floatx80 a, float_status *status)
5474 {
5475     flag aSign;
5476     int aExp;
5477     uint64_t aSig, zSig0, zSig1;
5478 
5479     if (floatx80_invalid_encoding(a)) {
5480         float_raise(float_flag_invalid, status);
5481         return float128_default_nan(status);
5482     }
5483     aSig = extractFloatx80Frac( a );
5484     aExp = extractFloatx80Exp( a );
5485     aSign = extractFloatx80Sign( a );
5486     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5487         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5488     }
5489     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5490     return packFloat128( aSign, aExp, zSig0, zSig1 );
5491 
5492 }
5493 
5494 /*----------------------------------------------------------------------------
5495 | Rounds the extended double-precision floating-point value `a'
5496 | to the precision provided by floatx80_rounding_precision and returns the
5497 | result as an extended double-precision floating-point value.
5498 | The operation is performed according to the IEC/IEEE Standard for Binary
5499 | Floating-Point Arithmetic.
5500 *----------------------------------------------------------------------------*/
5501 
5502 floatx80 floatx80_round(floatx80 a, float_status *status)
5503 {
5504     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5505                                 extractFloatx80Sign(a),
5506                                 extractFloatx80Exp(a),
5507                                 extractFloatx80Frac(a), 0, status);
5508 }
5509 
5510 /*----------------------------------------------------------------------------
5511 | Rounds the extended double-precision floating-point value `a' to an integer,
5512 | and returns the result as an extended quadruple-precision floating-point
5513 | value.  The operation is performed according to the IEC/IEEE Standard for
5514 | Binary Floating-Point Arithmetic.
5515 *----------------------------------------------------------------------------*/
5516 
5517 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5518 {
5519     flag aSign;
5520     int32_t aExp;
5521     uint64_t lastBitMask, roundBitsMask;
5522     floatx80 z;
5523 
5524     if (floatx80_invalid_encoding(a)) {
5525         float_raise(float_flag_invalid, status);
5526         return floatx80_default_nan(status);
5527     }
5528     aExp = extractFloatx80Exp( a );
5529     if ( 0x403E <= aExp ) {
5530         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5531             return propagateFloatx80NaN(a, a, status);
5532         }
5533         return a;
5534     }
5535     if ( aExp < 0x3FFF ) {
5536         if (    ( aExp == 0 )
5537              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5538             return a;
5539         }
5540         status->float_exception_flags |= float_flag_inexact;
5541         aSign = extractFloatx80Sign( a );
5542         switch (status->float_rounding_mode) {
5543          case float_round_nearest_even:
5544             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5545                ) {
5546                 return
5547                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5548             }
5549             break;
5550         case float_round_ties_away:
5551             if (aExp == 0x3FFE) {
5552                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5553             }
5554             break;
5555          case float_round_down:
5556             return
5557                   aSign ?
5558                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5559                 : packFloatx80( 0, 0, 0 );
5560          case float_round_up:
5561             return
5562                   aSign ? packFloatx80( 1, 0, 0 )
5563                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5564         }
5565         return packFloatx80( aSign, 0, 0 );
5566     }
5567     lastBitMask = 1;
5568     lastBitMask <<= 0x403E - aExp;
5569     roundBitsMask = lastBitMask - 1;
5570     z = a;
5571     switch (status->float_rounding_mode) {
5572     case float_round_nearest_even:
5573         z.low += lastBitMask>>1;
5574         if ((z.low & roundBitsMask) == 0) {
5575             z.low &= ~lastBitMask;
5576         }
5577         break;
5578     case float_round_ties_away:
5579         z.low += lastBitMask >> 1;
5580         break;
5581     case float_round_to_zero:
5582         break;
5583     case float_round_up:
5584         if (!extractFloatx80Sign(z)) {
5585             z.low += roundBitsMask;
5586         }
5587         break;
5588     case float_round_down:
5589         if (extractFloatx80Sign(z)) {
5590             z.low += roundBitsMask;
5591         }
5592         break;
5593     default:
5594         abort();
5595     }
5596     z.low &= ~ roundBitsMask;
5597     if ( z.low == 0 ) {
5598         ++z.high;
5599         z.low = LIT64( 0x8000000000000000 );
5600     }
5601     if (z.low != a.low) {
5602         status->float_exception_flags |= float_flag_inexact;
5603     }
5604     return z;
5605 
5606 }
5607 
5608 /*----------------------------------------------------------------------------
5609 | Returns the result of adding the absolute values of the extended double-
5610 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5611 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5612 | The addition is performed according to the IEC/IEEE Standard for Binary
5613 | Floating-Point Arithmetic.
5614 *----------------------------------------------------------------------------*/
5615 
5616 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5617                                 float_status *status)
5618 {
5619     int32_t aExp, bExp, zExp;
5620     uint64_t aSig, bSig, zSig0, zSig1;
5621     int32_t expDiff;
5622 
5623     aSig = extractFloatx80Frac( a );
5624     aExp = extractFloatx80Exp( a );
5625     bSig = extractFloatx80Frac( b );
5626     bExp = extractFloatx80Exp( b );
5627     expDiff = aExp - bExp;
5628     if ( 0 < expDiff ) {
5629         if ( aExp == 0x7FFF ) {
5630             if ((uint64_t)(aSig << 1)) {
5631                 return propagateFloatx80NaN(a, b, status);
5632             }
5633             return a;
5634         }
5635         if ( bExp == 0 ) --expDiff;
5636         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5637         zExp = aExp;
5638     }
5639     else if ( expDiff < 0 ) {
5640         if ( bExp == 0x7FFF ) {
5641             if ((uint64_t)(bSig << 1)) {
5642                 return propagateFloatx80NaN(a, b, status);
5643             }
5644             return packFloatx80(zSign,
5645                                 floatx80_infinity_high,
5646                                 floatx80_infinity_low);
5647         }
5648         if ( aExp == 0 ) ++expDiff;
5649         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5650         zExp = bExp;
5651     }
5652     else {
5653         if ( aExp == 0x7FFF ) {
5654             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5655                 return propagateFloatx80NaN(a, b, status);
5656             }
5657             return a;
5658         }
5659         zSig1 = 0;
5660         zSig0 = aSig + bSig;
5661         if ( aExp == 0 ) {
5662             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5663             goto roundAndPack;
5664         }
5665         zExp = aExp;
5666         goto shiftRight1;
5667     }
5668     zSig0 = aSig + bSig;
5669     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5670  shiftRight1:
5671     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5672     zSig0 |= LIT64( 0x8000000000000000 );
5673     ++zExp;
5674  roundAndPack:
5675     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5676                                 zSign, zExp, zSig0, zSig1, status);
5677 }
5678 
5679 /*----------------------------------------------------------------------------
5680 | Returns the result of subtracting the absolute values of the extended
5681 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5682 | difference is negated before being returned.  `zSign' is ignored if the
5683 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5684 | Standard for Binary Floating-Point Arithmetic.
5685 *----------------------------------------------------------------------------*/
5686 
5687 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5688                                 float_status *status)
5689 {
5690     int32_t aExp, bExp, zExp;
5691     uint64_t aSig, bSig, zSig0, zSig1;
5692     int32_t expDiff;
5693 
5694     aSig = extractFloatx80Frac( a );
5695     aExp = extractFloatx80Exp( a );
5696     bSig = extractFloatx80Frac( b );
5697     bExp = extractFloatx80Exp( b );
5698     expDiff = aExp - bExp;
5699     if ( 0 < expDiff ) goto aExpBigger;
5700     if ( expDiff < 0 ) goto bExpBigger;
5701     if ( aExp == 0x7FFF ) {
5702         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5703             return propagateFloatx80NaN(a, b, status);
5704         }
5705         float_raise(float_flag_invalid, status);
5706         return floatx80_default_nan(status);
5707     }
5708     if ( aExp == 0 ) {
5709         aExp = 1;
5710         bExp = 1;
5711     }
5712     zSig1 = 0;
5713     if ( bSig < aSig ) goto aBigger;
5714     if ( aSig < bSig ) goto bBigger;
5715     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5716  bExpBigger:
5717     if ( bExp == 0x7FFF ) {
5718         if ((uint64_t)(bSig << 1)) {
5719             return propagateFloatx80NaN(a, b, status);
5720         }
5721         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5722                             floatx80_infinity_low);
5723     }
5724     if ( aExp == 0 ) ++expDiff;
5725     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5726  bBigger:
5727     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5728     zExp = bExp;
5729     zSign ^= 1;
5730     goto normalizeRoundAndPack;
5731  aExpBigger:
5732     if ( aExp == 0x7FFF ) {
5733         if ((uint64_t)(aSig << 1)) {
5734             return propagateFloatx80NaN(a, b, status);
5735         }
5736         return a;
5737     }
5738     if ( bExp == 0 ) --expDiff;
5739     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5740  aBigger:
5741     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5742     zExp = aExp;
5743  normalizeRoundAndPack:
5744     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5745                                          zSign, zExp, zSig0, zSig1, status);
5746 }
5747 
5748 /*----------------------------------------------------------------------------
5749 | Returns the result of adding the extended double-precision floating-point
5750 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5751 | Standard for Binary Floating-Point Arithmetic.
5752 *----------------------------------------------------------------------------*/
5753 
5754 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5755 {
5756     flag aSign, bSign;
5757 
5758     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5759         float_raise(float_flag_invalid, status);
5760         return floatx80_default_nan(status);
5761     }
5762     aSign = extractFloatx80Sign( a );
5763     bSign = extractFloatx80Sign( b );
5764     if ( aSign == bSign ) {
5765         return addFloatx80Sigs(a, b, aSign, status);
5766     }
5767     else {
5768         return subFloatx80Sigs(a, b, aSign, status);
5769     }
5770 
5771 }
5772 
5773 /*----------------------------------------------------------------------------
5774 | Returns the result of subtracting the extended double-precision floating-
5775 | point values `a' and `b'.  The operation is performed according to the
5776 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5777 *----------------------------------------------------------------------------*/
5778 
5779 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5780 {
5781     flag aSign, bSign;
5782 
5783     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5784         float_raise(float_flag_invalid, status);
5785         return floatx80_default_nan(status);
5786     }
5787     aSign = extractFloatx80Sign( a );
5788     bSign = extractFloatx80Sign( b );
5789     if ( aSign == bSign ) {
5790         return subFloatx80Sigs(a, b, aSign, status);
5791     }
5792     else {
5793         return addFloatx80Sigs(a, b, aSign, status);
5794     }
5795 
5796 }
5797 
5798 /*----------------------------------------------------------------------------
5799 | Returns the result of multiplying the extended double-precision floating-
5800 | point values `a' and `b'.  The operation is performed according to the
5801 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5802 *----------------------------------------------------------------------------*/
5803 
5804 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5805 {
5806     flag aSign, bSign, zSign;
5807     int32_t aExp, bExp, zExp;
5808     uint64_t aSig, bSig, zSig0, zSig1;
5809 
5810     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5811         float_raise(float_flag_invalid, status);
5812         return floatx80_default_nan(status);
5813     }
5814     aSig = extractFloatx80Frac( a );
5815     aExp = extractFloatx80Exp( a );
5816     aSign = extractFloatx80Sign( a );
5817     bSig = extractFloatx80Frac( b );
5818     bExp = extractFloatx80Exp( b );
5819     bSign = extractFloatx80Sign( b );
5820     zSign = aSign ^ bSign;
5821     if ( aExp == 0x7FFF ) {
5822         if (    (uint64_t) ( aSig<<1 )
5823              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5824             return propagateFloatx80NaN(a, b, status);
5825         }
5826         if ( ( bExp | bSig ) == 0 ) goto invalid;
5827         return packFloatx80(zSign, floatx80_infinity_high,
5828                                    floatx80_infinity_low);
5829     }
5830     if ( bExp == 0x7FFF ) {
5831         if ((uint64_t)(bSig << 1)) {
5832             return propagateFloatx80NaN(a, b, status);
5833         }
5834         if ( ( aExp | aSig ) == 0 ) {
5835  invalid:
5836             float_raise(float_flag_invalid, status);
5837             return floatx80_default_nan(status);
5838         }
5839         return packFloatx80(zSign, floatx80_infinity_high,
5840                                    floatx80_infinity_low);
5841     }
5842     if ( aExp == 0 ) {
5843         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5844         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5845     }
5846     if ( bExp == 0 ) {
5847         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5848         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5849     }
5850     zExp = aExp + bExp - 0x3FFE;
5851     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5852     if ( 0 < (int64_t) zSig0 ) {
5853         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5854         --zExp;
5855     }
5856     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5857                                 zSign, zExp, zSig0, zSig1, status);
5858 }
5859 
5860 /*----------------------------------------------------------------------------
5861 | Returns the result of dividing the extended double-precision floating-point
5862 | value `a' by the corresponding value `b'.  The operation is performed
5863 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5864 *----------------------------------------------------------------------------*/
5865 
5866 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5867 {
5868     flag aSign, bSign, zSign;
5869     int32_t aExp, bExp, zExp;
5870     uint64_t aSig, bSig, zSig0, zSig1;
5871     uint64_t rem0, rem1, rem2, term0, term1, term2;
5872 
5873     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5874         float_raise(float_flag_invalid, status);
5875         return floatx80_default_nan(status);
5876     }
5877     aSig = extractFloatx80Frac( a );
5878     aExp = extractFloatx80Exp( a );
5879     aSign = extractFloatx80Sign( a );
5880     bSig = extractFloatx80Frac( b );
5881     bExp = extractFloatx80Exp( b );
5882     bSign = extractFloatx80Sign( b );
5883     zSign = aSign ^ bSign;
5884     if ( aExp == 0x7FFF ) {
5885         if ((uint64_t)(aSig << 1)) {
5886             return propagateFloatx80NaN(a, b, status);
5887         }
5888         if ( bExp == 0x7FFF ) {
5889             if ((uint64_t)(bSig << 1)) {
5890                 return propagateFloatx80NaN(a, b, status);
5891             }
5892             goto invalid;
5893         }
5894         return packFloatx80(zSign, floatx80_infinity_high,
5895                                    floatx80_infinity_low);
5896     }
5897     if ( bExp == 0x7FFF ) {
5898         if ((uint64_t)(bSig << 1)) {
5899             return propagateFloatx80NaN(a, b, status);
5900         }
5901         return packFloatx80( zSign, 0, 0 );
5902     }
5903     if ( bExp == 0 ) {
5904         if ( bSig == 0 ) {
5905             if ( ( aExp | aSig ) == 0 ) {
5906  invalid:
5907                 float_raise(float_flag_invalid, status);
5908                 return floatx80_default_nan(status);
5909             }
5910             float_raise(float_flag_divbyzero, status);
5911             return packFloatx80(zSign, floatx80_infinity_high,
5912                                        floatx80_infinity_low);
5913         }
5914         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5915     }
5916     if ( aExp == 0 ) {
5917         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5918         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5919     }
5920     zExp = aExp - bExp + 0x3FFE;
5921     rem1 = 0;
5922     if ( bSig <= aSig ) {
5923         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5924         ++zExp;
5925     }
5926     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5927     mul64To128( bSig, zSig0, &term0, &term1 );
5928     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5929     while ( (int64_t) rem0 < 0 ) {
5930         --zSig0;
5931         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5932     }
5933     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5934     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5935         mul64To128( bSig, zSig1, &term1, &term2 );
5936         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5937         while ( (int64_t) rem1 < 0 ) {
5938             --zSig1;
5939             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5940         }
5941         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5942     }
5943     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5944                                 zSign, zExp, zSig0, zSig1, status);
5945 }
5946 
5947 /*----------------------------------------------------------------------------
5948 | Returns the remainder of the extended double-precision floating-point value
5949 | `a' with respect to the corresponding value `b'.  The operation is performed
5950 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5951 *----------------------------------------------------------------------------*/
5952 
5953 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5954 {
5955     flag aSign, zSign;
5956     int32_t aExp, bExp, expDiff;
5957     uint64_t aSig0, aSig1, bSig;
5958     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5959 
5960     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5961         float_raise(float_flag_invalid, status);
5962         return floatx80_default_nan(status);
5963     }
5964     aSig0 = extractFloatx80Frac( a );
5965     aExp = extractFloatx80Exp( a );
5966     aSign = extractFloatx80Sign( a );
5967     bSig = extractFloatx80Frac( b );
5968     bExp = extractFloatx80Exp( b );
5969     if ( aExp == 0x7FFF ) {
5970         if (    (uint64_t) ( aSig0<<1 )
5971              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5972             return propagateFloatx80NaN(a, b, status);
5973         }
5974         goto invalid;
5975     }
5976     if ( bExp == 0x7FFF ) {
5977         if ((uint64_t)(bSig << 1)) {
5978             return propagateFloatx80NaN(a, b, status);
5979         }
5980         return a;
5981     }
5982     if ( bExp == 0 ) {
5983         if ( bSig == 0 ) {
5984  invalid:
5985             float_raise(float_flag_invalid, status);
5986             return floatx80_default_nan(status);
5987         }
5988         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5989     }
5990     if ( aExp == 0 ) {
5991         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5992         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5993     }
5994     bSig |= LIT64( 0x8000000000000000 );
5995     zSign = aSign;
5996     expDiff = aExp - bExp;
5997     aSig1 = 0;
5998     if ( expDiff < 0 ) {
5999         if ( expDiff < -1 ) return a;
6000         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
6001         expDiff = 0;
6002     }
6003     q = ( bSig <= aSig0 );
6004     if ( q ) aSig0 -= bSig;
6005     expDiff -= 64;
6006     while ( 0 < expDiff ) {
6007         q = estimateDiv128To64( aSig0, aSig1, bSig );
6008         q = ( 2 < q ) ? q - 2 : 0;
6009         mul64To128( bSig, q, &term0, &term1 );
6010         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6011         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
6012         expDiff -= 62;
6013     }
6014     expDiff += 64;
6015     if ( 0 < expDiff ) {
6016         q = estimateDiv128To64( aSig0, aSig1, bSig );
6017         q = ( 2 < q ) ? q - 2 : 0;
6018         q >>= 64 - expDiff;
6019         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
6020         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6021         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
6022         while ( le128( term0, term1, aSig0, aSig1 ) ) {
6023             ++q;
6024             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6025         }
6026     }
6027     else {
6028         term1 = 0;
6029         term0 = bSig;
6030     }
6031     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
6032     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
6033          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
6034               && ( q & 1 ) )
6035        ) {
6036         aSig0 = alternateASig0;
6037         aSig1 = alternateASig1;
6038         zSign = ! zSign;
6039     }
6040     return
6041         normalizeRoundAndPackFloatx80(
6042             80, zSign, bExp + expDiff, aSig0, aSig1, status);
6043 
6044 }
6045 
6046 /*----------------------------------------------------------------------------
6047 | Returns the square root of the extended double-precision floating-point
6048 | value `a'.  The operation is performed according to the IEC/IEEE Standard
6049 | for Binary Floating-Point Arithmetic.
6050 *----------------------------------------------------------------------------*/
6051 
6052 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
6053 {
6054     flag aSign;
6055     int32_t aExp, zExp;
6056     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
6057     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6058 
6059     if (floatx80_invalid_encoding(a)) {
6060         float_raise(float_flag_invalid, status);
6061         return floatx80_default_nan(status);
6062     }
6063     aSig0 = extractFloatx80Frac( a );
6064     aExp = extractFloatx80Exp( a );
6065     aSign = extractFloatx80Sign( a );
6066     if ( aExp == 0x7FFF ) {
6067         if ((uint64_t)(aSig0 << 1)) {
6068             return propagateFloatx80NaN(a, a, status);
6069         }
6070         if ( ! aSign ) return a;
6071         goto invalid;
6072     }
6073     if ( aSign ) {
6074         if ( ( aExp | aSig0 ) == 0 ) return a;
6075  invalid:
6076         float_raise(float_flag_invalid, status);
6077         return floatx80_default_nan(status);
6078     }
6079     if ( aExp == 0 ) {
6080         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
6081         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6082     }
6083     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
6084     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
6085     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
6086     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6087     doubleZSig0 = zSig0<<1;
6088     mul64To128( zSig0, zSig0, &term0, &term1 );
6089     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6090     while ( (int64_t) rem0 < 0 ) {
6091         --zSig0;
6092         doubleZSig0 -= 2;
6093         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6094     }
6095     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6096     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
6097         if ( zSig1 == 0 ) zSig1 = 1;
6098         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6099         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6100         mul64To128( zSig1, zSig1, &term2, &term3 );
6101         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6102         while ( (int64_t) rem1 < 0 ) {
6103             --zSig1;
6104             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6105             term3 |= 1;
6106             term2 |= doubleZSig0;
6107             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6108         }
6109         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6110     }
6111     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
6112     zSig0 |= doubleZSig0;
6113     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6114                                 0, zExp, zSig0, zSig1, status);
6115 }
6116 
6117 /*----------------------------------------------------------------------------
6118 | Returns 1 if the extended double-precision floating-point value `a' is equal
6119 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
6120 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6121 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6122 *----------------------------------------------------------------------------*/
6123 
6124 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
6125 {
6126 
6127     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6128         || (extractFloatx80Exp(a) == 0x7FFF
6129             && (uint64_t) (extractFloatx80Frac(a) << 1))
6130         || (extractFloatx80Exp(b) == 0x7FFF
6131             && (uint64_t) (extractFloatx80Frac(b) << 1))
6132        ) {
6133         float_raise(float_flag_invalid, status);
6134         return 0;
6135     }
6136     return
6137            ( a.low == b.low )
6138         && (    ( a.high == b.high )
6139              || (    ( a.low == 0 )
6140                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6141            );
6142 
6143 }
6144 
6145 /*----------------------------------------------------------------------------
6146 | Returns 1 if the extended double-precision floating-point value `a' is
6147 | less than or equal to the corresponding value `b', and 0 otherwise.  The
6148 | invalid exception is raised if either operand is a NaN.  The comparison is
6149 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6150 | Arithmetic.
6151 *----------------------------------------------------------------------------*/
6152 
6153 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
6154 {
6155     flag aSign, bSign;
6156 
6157     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6158         || (extractFloatx80Exp(a) == 0x7FFF
6159             && (uint64_t) (extractFloatx80Frac(a) << 1))
6160         || (extractFloatx80Exp(b) == 0x7FFF
6161             && (uint64_t) (extractFloatx80Frac(b) << 1))
6162        ) {
6163         float_raise(float_flag_invalid, status);
6164         return 0;
6165     }
6166     aSign = extractFloatx80Sign( a );
6167     bSign = extractFloatx80Sign( b );
6168     if ( aSign != bSign ) {
6169         return
6170                aSign
6171             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6172                  == 0 );
6173     }
6174     return
6175           aSign ? le128( b.high, b.low, a.high, a.low )
6176         : le128( a.high, a.low, b.high, b.low );
6177 
6178 }
6179 
6180 /*----------------------------------------------------------------------------
6181 | Returns 1 if the extended double-precision floating-point value `a' is
6182 | less than the corresponding value `b', and 0 otherwise.  The invalid
6183 | exception is raised if either operand is a NaN.  The comparison is performed
6184 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6185 *----------------------------------------------------------------------------*/
6186 
6187 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
6188 {
6189     flag aSign, bSign;
6190 
6191     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6192         || (extractFloatx80Exp(a) == 0x7FFF
6193             && (uint64_t) (extractFloatx80Frac(a) << 1))
6194         || (extractFloatx80Exp(b) == 0x7FFF
6195             && (uint64_t) (extractFloatx80Frac(b) << 1))
6196        ) {
6197         float_raise(float_flag_invalid, status);
6198         return 0;
6199     }
6200     aSign = extractFloatx80Sign( a );
6201     bSign = extractFloatx80Sign( b );
6202     if ( aSign != bSign ) {
6203         return
6204                aSign
6205             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6206                  != 0 );
6207     }
6208     return
6209           aSign ? lt128( b.high, b.low, a.high, a.low )
6210         : lt128( a.high, a.low, b.high, b.low );
6211 
6212 }
6213 
6214 /*----------------------------------------------------------------------------
6215 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6216 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
6217 | either operand is a NaN.   The comparison is performed according to the
6218 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6219 *----------------------------------------------------------------------------*/
6220 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
6221 {
6222     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6223         || (extractFloatx80Exp(a) == 0x7FFF
6224             && (uint64_t) (extractFloatx80Frac(a) << 1))
6225         || (extractFloatx80Exp(b) == 0x7FFF
6226             && (uint64_t) (extractFloatx80Frac(b) << 1))
6227        ) {
6228         float_raise(float_flag_invalid, status);
6229         return 1;
6230     }
6231     return 0;
6232 }
6233 
6234 /*----------------------------------------------------------------------------
6235 | Returns 1 if the extended double-precision floating-point value `a' is
6236 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6237 | cause an exception.  The comparison is performed according to the IEC/IEEE
6238 | Standard for Binary Floating-Point Arithmetic.
6239 *----------------------------------------------------------------------------*/
6240 
6241 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
6242 {
6243 
6244     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6245         float_raise(float_flag_invalid, status);
6246         return 0;
6247     }
6248     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6249               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6250          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6251               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6252        ) {
6253         if (floatx80_is_signaling_nan(a, status)
6254          || floatx80_is_signaling_nan(b, status)) {
6255             float_raise(float_flag_invalid, status);
6256         }
6257         return 0;
6258     }
6259     return
6260            ( a.low == b.low )
6261         && (    ( a.high == b.high )
6262              || (    ( a.low == 0 )
6263                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6264            );
6265 
6266 }
6267 
6268 /*----------------------------------------------------------------------------
6269 | Returns 1 if the extended double-precision floating-point value `a' is less
6270 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
6271 | do not cause an exception.  Otherwise, the comparison is performed according
6272 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6273 *----------------------------------------------------------------------------*/
6274 
6275 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
6276 {
6277     flag aSign, bSign;
6278 
6279     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6280         float_raise(float_flag_invalid, status);
6281         return 0;
6282     }
6283     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6284               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6285          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6286               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6287        ) {
6288         if (floatx80_is_signaling_nan(a, status)
6289          || floatx80_is_signaling_nan(b, status)) {
6290             float_raise(float_flag_invalid, status);
6291         }
6292         return 0;
6293     }
6294     aSign = extractFloatx80Sign( a );
6295     bSign = extractFloatx80Sign( b );
6296     if ( aSign != bSign ) {
6297         return
6298                aSign
6299             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6300                  == 0 );
6301     }
6302     return
6303           aSign ? le128( b.high, b.low, a.high, a.low )
6304         : le128( a.high, a.low, b.high, b.low );
6305 
6306 }
6307 
6308 /*----------------------------------------------------------------------------
6309 | Returns 1 if the extended double-precision floating-point value `a' is less
6310 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
6311 | an exception.  Otherwise, the comparison is performed according to the
6312 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6313 *----------------------------------------------------------------------------*/
6314 
6315 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
6316 {
6317     flag aSign, bSign;
6318 
6319     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6320         float_raise(float_flag_invalid, status);
6321         return 0;
6322     }
6323     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6324               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6325          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6326               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6327        ) {
6328         if (floatx80_is_signaling_nan(a, status)
6329          || floatx80_is_signaling_nan(b, status)) {
6330             float_raise(float_flag_invalid, status);
6331         }
6332         return 0;
6333     }
6334     aSign = extractFloatx80Sign( a );
6335     bSign = extractFloatx80Sign( b );
6336     if ( aSign != bSign ) {
6337         return
6338                aSign
6339             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6340                  != 0 );
6341     }
6342     return
6343           aSign ? lt128( b.high, b.low, a.high, a.low )
6344         : lt128( a.high, a.low, b.high, b.low );
6345 
6346 }
6347 
6348 /*----------------------------------------------------------------------------
6349 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6350 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
6351 | The comparison is performed according to the IEC/IEEE Standard for Binary
6352 | Floating-Point Arithmetic.
6353 *----------------------------------------------------------------------------*/
6354 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
6355 {
6356     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6357         float_raise(float_flag_invalid, status);
6358         return 1;
6359     }
6360     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6361               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6362          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6363               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6364        ) {
6365         if (floatx80_is_signaling_nan(a, status)
6366          || floatx80_is_signaling_nan(b, status)) {
6367             float_raise(float_flag_invalid, status);
6368         }
6369         return 1;
6370     }
6371     return 0;
6372 }
6373 
6374 /*----------------------------------------------------------------------------
6375 | Returns the result of converting the quadruple-precision floating-point
6376 | value `a' to the 32-bit two's complement integer format.  The conversion
6377 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6378 | Arithmetic---which means in particular that the conversion is rounded
6379 | according to the current rounding mode.  If `a' is a NaN, the largest
6380 | positive integer is returned.  Otherwise, if the conversion overflows, the
6381 | largest integer with the same sign as `a' is returned.
6382 *----------------------------------------------------------------------------*/
6383 
6384 int32_t float128_to_int32(float128 a, float_status *status)
6385 {
6386     flag aSign;
6387     int32_t aExp, shiftCount;
6388     uint64_t aSig0, aSig1;
6389 
6390     aSig1 = extractFloat128Frac1( a );
6391     aSig0 = extractFloat128Frac0( a );
6392     aExp = extractFloat128Exp( a );
6393     aSign = extractFloat128Sign( a );
6394     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6395     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6396     aSig0 |= ( aSig1 != 0 );
6397     shiftCount = 0x4028 - aExp;
6398     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6399     return roundAndPackInt32(aSign, aSig0, status);
6400 
6401 }
6402 
6403 /*----------------------------------------------------------------------------
6404 | Returns the result of converting the quadruple-precision floating-point
6405 | value `a' to the 32-bit two's complement integer format.  The conversion
6406 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6407 | Arithmetic, except that the conversion is always rounded toward zero.  If
6408 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6409 | conversion overflows, the largest integer with the same sign as `a' is
6410 | returned.
6411 *----------------------------------------------------------------------------*/
6412 
6413 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6414 {
6415     flag aSign;
6416     int32_t aExp, shiftCount;
6417     uint64_t aSig0, aSig1, savedASig;
6418     int32_t z;
6419 
6420     aSig1 = extractFloat128Frac1( a );
6421     aSig0 = extractFloat128Frac0( a );
6422     aExp = extractFloat128Exp( a );
6423     aSign = extractFloat128Sign( a );
6424     aSig0 |= ( aSig1 != 0 );
6425     if ( 0x401E < aExp ) {
6426         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6427         goto invalid;
6428     }
6429     else if ( aExp < 0x3FFF ) {
6430         if (aExp || aSig0) {
6431             status->float_exception_flags |= float_flag_inexact;
6432         }
6433         return 0;
6434     }
6435     aSig0 |= LIT64( 0x0001000000000000 );
6436     shiftCount = 0x402F - aExp;
6437     savedASig = aSig0;
6438     aSig0 >>= shiftCount;
6439     z = aSig0;
6440     if ( aSign ) z = - z;
6441     if ( ( z < 0 ) ^ aSign ) {
6442  invalid:
6443         float_raise(float_flag_invalid, status);
6444         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
6445     }
6446     if ( ( aSig0<<shiftCount ) != savedASig ) {
6447         status->float_exception_flags |= float_flag_inexact;
6448     }
6449     return z;
6450 
6451 }
6452 
6453 /*----------------------------------------------------------------------------
6454 | Returns the result of converting the quadruple-precision floating-point
6455 | value `a' to the 64-bit two's complement integer format.  The conversion
6456 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6457 | Arithmetic---which means in particular that the conversion is rounded
6458 | according to the current rounding mode.  If `a' is a NaN, the largest
6459 | positive integer is returned.  Otherwise, if the conversion overflows, the
6460 | largest integer with the same sign as `a' is returned.
6461 *----------------------------------------------------------------------------*/
6462 
6463 int64_t float128_to_int64(float128 a, float_status *status)
6464 {
6465     flag aSign;
6466     int32_t aExp, shiftCount;
6467     uint64_t aSig0, aSig1;
6468 
6469     aSig1 = extractFloat128Frac1( a );
6470     aSig0 = extractFloat128Frac0( a );
6471     aExp = extractFloat128Exp( a );
6472     aSign = extractFloat128Sign( a );
6473     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6474     shiftCount = 0x402F - aExp;
6475     if ( shiftCount <= 0 ) {
6476         if ( 0x403E < aExp ) {
6477             float_raise(float_flag_invalid, status);
6478             if (    ! aSign
6479                  || (    ( aExp == 0x7FFF )
6480                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
6481                     )
6482                ) {
6483                 return LIT64( 0x7FFFFFFFFFFFFFFF );
6484             }
6485             return (int64_t) LIT64( 0x8000000000000000 );
6486         }
6487         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6488     }
6489     else {
6490         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6491     }
6492     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6493 
6494 }
6495 
6496 /*----------------------------------------------------------------------------
6497 | Returns the result of converting the quadruple-precision floating-point
6498 | value `a' to the 64-bit two's complement integer format.  The conversion
6499 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6500 | Arithmetic, except that the conversion is always rounded toward zero.
6501 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6502 | the conversion overflows, the largest integer with the same sign as `a' is
6503 | returned.
6504 *----------------------------------------------------------------------------*/
6505 
6506 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6507 {
6508     flag aSign;
6509     int32_t aExp, shiftCount;
6510     uint64_t aSig0, aSig1;
6511     int64_t z;
6512 
6513     aSig1 = extractFloat128Frac1( a );
6514     aSig0 = extractFloat128Frac0( a );
6515     aExp = extractFloat128Exp( a );
6516     aSign = extractFloat128Sign( a );
6517     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6518     shiftCount = aExp - 0x402F;
6519     if ( 0 < shiftCount ) {
6520         if ( 0x403E <= aExp ) {
6521             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
6522             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
6523                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
6524                 if (aSig1) {
6525                     status->float_exception_flags |= float_flag_inexact;
6526                 }
6527             }
6528             else {
6529                 float_raise(float_flag_invalid, status);
6530                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6531                     return LIT64( 0x7FFFFFFFFFFFFFFF );
6532                 }
6533             }
6534             return (int64_t) LIT64( 0x8000000000000000 );
6535         }
6536         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6537         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6538             status->float_exception_flags |= float_flag_inexact;
6539         }
6540     }
6541     else {
6542         if ( aExp < 0x3FFF ) {
6543             if ( aExp | aSig0 | aSig1 ) {
6544                 status->float_exception_flags |= float_flag_inexact;
6545             }
6546             return 0;
6547         }
6548         z = aSig0>>( - shiftCount );
6549         if (    aSig1
6550              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6551             status->float_exception_flags |= float_flag_inexact;
6552         }
6553     }
6554     if ( aSign ) z = - z;
6555     return z;
6556 
6557 }
6558 
6559 /*----------------------------------------------------------------------------
6560 | Returns the result of converting the quadruple-precision floating-point value
6561 | `a' to the 64-bit unsigned integer format.  The conversion is
6562 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6563 | Arithmetic---which means in particular that the conversion is rounded
6564 | according to the current rounding mode.  If `a' is a NaN, the largest
6565 | positive integer is returned.  If the conversion overflows, the
6566 | largest unsigned integer is returned.  If 'a' is negative, the value is
6567 | rounded and zero is returned; negative values that do not round to zero
6568 | will raise the inexact exception.
6569 *----------------------------------------------------------------------------*/
6570 
6571 uint64_t float128_to_uint64(float128 a, float_status *status)
6572 {
6573     flag aSign;
6574     int aExp;
6575     int shiftCount;
6576     uint64_t aSig0, aSig1;
6577 
6578     aSig0 = extractFloat128Frac0(a);
6579     aSig1 = extractFloat128Frac1(a);
6580     aExp = extractFloat128Exp(a);
6581     aSign = extractFloat128Sign(a);
6582     if (aSign && (aExp > 0x3FFE)) {
6583         float_raise(float_flag_invalid, status);
6584         if (float128_is_any_nan(a)) {
6585             return LIT64(0xFFFFFFFFFFFFFFFF);
6586         } else {
6587             return 0;
6588         }
6589     }
6590     if (aExp) {
6591         aSig0 |= LIT64(0x0001000000000000);
6592     }
6593     shiftCount = 0x402F - aExp;
6594     if (shiftCount <= 0) {
6595         if (0x403E < aExp) {
6596             float_raise(float_flag_invalid, status);
6597             return LIT64(0xFFFFFFFFFFFFFFFF);
6598         }
6599         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6600     } else {
6601         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6602     }
6603     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6604 }
6605 
6606 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6607 {
6608     uint64_t v;
6609     signed char current_rounding_mode = status->float_rounding_mode;
6610 
6611     set_float_rounding_mode(float_round_to_zero, status);
6612     v = float128_to_uint64(a, status);
6613     set_float_rounding_mode(current_rounding_mode, status);
6614 
6615     return v;
6616 }
6617 
6618 /*----------------------------------------------------------------------------
6619 | Returns the result of converting the quadruple-precision floating-point
6620 | value `a' to the 32-bit unsigned integer format.  The conversion
6621 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6622 | Arithmetic except that the conversion is always rounded toward zero.
6623 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6624 | if the conversion overflows, the largest unsigned integer is returned.
6625 | If 'a' is negative, the value is rounded and zero is returned; negative
6626 | values that do not round to zero will raise the inexact exception.
6627 *----------------------------------------------------------------------------*/
6628 
6629 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6630 {
6631     uint64_t v;
6632     uint32_t res;
6633     int old_exc_flags = get_float_exception_flags(status);
6634 
6635     v = float128_to_uint64_round_to_zero(a, status);
6636     if (v > 0xffffffff) {
6637         res = 0xffffffff;
6638     } else {
6639         return v;
6640     }
6641     set_float_exception_flags(old_exc_flags, status);
6642     float_raise(float_flag_invalid, status);
6643     return res;
6644 }
6645 
6646 /*----------------------------------------------------------------------------
6647 | Returns the result of converting the quadruple-precision floating-point
6648 | value `a' to the single-precision floating-point format.  The conversion
6649 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6650 | Arithmetic.
6651 *----------------------------------------------------------------------------*/
6652 
6653 float32 float128_to_float32(float128 a, float_status *status)
6654 {
6655     flag aSign;
6656     int32_t aExp;
6657     uint64_t aSig0, aSig1;
6658     uint32_t zSig;
6659 
6660     aSig1 = extractFloat128Frac1( a );
6661     aSig0 = extractFloat128Frac0( a );
6662     aExp = extractFloat128Exp( a );
6663     aSign = extractFloat128Sign( a );
6664     if ( aExp == 0x7FFF ) {
6665         if ( aSig0 | aSig1 ) {
6666             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6667         }
6668         return packFloat32( aSign, 0xFF, 0 );
6669     }
6670     aSig0 |= ( aSig1 != 0 );
6671     shift64RightJamming( aSig0, 18, &aSig0 );
6672     zSig = aSig0;
6673     if ( aExp || zSig ) {
6674         zSig |= 0x40000000;
6675         aExp -= 0x3F81;
6676     }
6677     return roundAndPackFloat32(aSign, aExp, zSig, status);
6678 
6679 }
6680 
6681 /*----------------------------------------------------------------------------
6682 | Returns the result of converting the quadruple-precision floating-point
6683 | value `a' to the double-precision floating-point format.  The conversion
6684 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6685 | Arithmetic.
6686 *----------------------------------------------------------------------------*/
6687 
6688 float64 float128_to_float64(float128 a, float_status *status)
6689 {
6690     flag aSign;
6691     int32_t aExp;
6692     uint64_t aSig0, aSig1;
6693 
6694     aSig1 = extractFloat128Frac1( a );
6695     aSig0 = extractFloat128Frac0( a );
6696     aExp = extractFloat128Exp( a );
6697     aSign = extractFloat128Sign( a );
6698     if ( aExp == 0x7FFF ) {
6699         if ( aSig0 | aSig1 ) {
6700             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6701         }
6702         return packFloat64( aSign, 0x7FF, 0 );
6703     }
6704     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6705     aSig0 |= ( aSig1 != 0 );
6706     if ( aExp || aSig0 ) {
6707         aSig0 |= LIT64( 0x4000000000000000 );
6708         aExp -= 0x3C01;
6709     }
6710     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6711 
6712 }
6713 
6714 /*----------------------------------------------------------------------------
6715 | Returns the result of converting the quadruple-precision floating-point
6716 | value `a' to the extended double-precision floating-point format.  The
6717 | conversion is performed according to the IEC/IEEE Standard for Binary
6718 | Floating-Point Arithmetic.
6719 *----------------------------------------------------------------------------*/
6720 
6721 floatx80 float128_to_floatx80(float128 a, float_status *status)
6722 {
6723     flag aSign;
6724     int32_t aExp;
6725     uint64_t aSig0, aSig1;
6726 
6727     aSig1 = extractFloat128Frac1( a );
6728     aSig0 = extractFloat128Frac0( a );
6729     aExp = extractFloat128Exp( a );
6730     aSign = extractFloat128Sign( a );
6731     if ( aExp == 0x7FFF ) {
6732         if ( aSig0 | aSig1 ) {
6733             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6734         }
6735         return packFloatx80(aSign, floatx80_infinity_high,
6736                                    floatx80_infinity_low);
6737     }
6738     if ( aExp == 0 ) {
6739         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6740         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6741     }
6742     else {
6743         aSig0 |= LIT64( 0x0001000000000000 );
6744     }
6745     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6746     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6747 
6748 }
6749 
6750 /*----------------------------------------------------------------------------
6751 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6752 | returns the result as a quadruple-precision floating-point value.  The
6753 | operation is performed according to the IEC/IEEE Standard for Binary
6754 | Floating-Point Arithmetic.
6755 *----------------------------------------------------------------------------*/
6756 
6757 float128 float128_round_to_int(float128 a, float_status *status)
6758 {
6759     flag aSign;
6760     int32_t aExp;
6761     uint64_t lastBitMask, roundBitsMask;
6762     float128 z;
6763 
6764     aExp = extractFloat128Exp( a );
6765     if ( 0x402F <= aExp ) {
6766         if ( 0x406F <= aExp ) {
6767             if (    ( aExp == 0x7FFF )
6768                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6769                ) {
6770                 return propagateFloat128NaN(a, a, status);
6771             }
6772             return a;
6773         }
6774         lastBitMask = 1;
6775         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6776         roundBitsMask = lastBitMask - 1;
6777         z = a;
6778         switch (status->float_rounding_mode) {
6779         case float_round_nearest_even:
6780             if ( lastBitMask ) {
6781                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6782                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6783             }
6784             else {
6785                 if ( (int64_t) z.low < 0 ) {
6786                     ++z.high;
6787                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6788                 }
6789             }
6790             break;
6791         case float_round_ties_away:
6792             if (lastBitMask) {
6793                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6794             } else {
6795                 if ((int64_t) z.low < 0) {
6796                     ++z.high;
6797                 }
6798             }
6799             break;
6800         case float_round_to_zero:
6801             break;
6802         case float_round_up:
6803             if (!extractFloat128Sign(z)) {
6804                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6805             }
6806             break;
6807         case float_round_down:
6808             if (extractFloat128Sign(z)) {
6809                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6810             }
6811             break;
6812         default:
6813             abort();
6814         }
6815         z.low &= ~ roundBitsMask;
6816     }
6817     else {
6818         if ( aExp < 0x3FFF ) {
6819             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6820             status->float_exception_flags |= float_flag_inexact;
6821             aSign = extractFloat128Sign( a );
6822             switch (status->float_rounding_mode) {
6823              case float_round_nearest_even:
6824                 if (    ( aExp == 0x3FFE )
6825                      && (   extractFloat128Frac0( a )
6826                           | extractFloat128Frac1( a ) )
6827                    ) {
6828                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6829                 }
6830                 break;
6831             case float_round_ties_away:
6832                 if (aExp == 0x3FFE) {
6833                     return packFloat128(aSign, 0x3FFF, 0, 0);
6834                 }
6835                 break;
6836              case float_round_down:
6837                 return
6838                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6839                     : packFloat128( 0, 0, 0, 0 );
6840              case float_round_up:
6841                 return
6842                       aSign ? packFloat128( 1, 0, 0, 0 )
6843                     : packFloat128( 0, 0x3FFF, 0, 0 );
6844             }
6845             return packFloat128( aSign, 0, 0, 0 );
6846         }
6847         lastBitMask = 1;
6848         lastBitMask <<= 0x402F - aExp;
6849         roundBitsMask = lastBitMask - 1;
6850         z.low = 0;
6851         z.high = a.high;
6852         switch (status->float_rounding_mode) {
6853         case float_round_nearest_even:
6854             z.high += lastBitMask>>1;
6855             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6856                 z.high &= ~ lastBitMask;
6857             }
6858             break;
6859         case float_round_ties_away:
6860             z.high += lastBitMask>>1;
6861             break;
6862         case float_round_to_zero:
6863             break;
6864         case float_round_up:
6865             if (!extractFloat128Sign(z)) {
6866                 z.high |= ( a.low != 0 );
6867                 z.high += roundBitsMask;
6868             }
6869             break;
6870         case float_round_down:
6871             if (extractFloat128Sign(z)) {
6872                 z.high |= (a.low != 0);
6873                 z.high += roundBitsMask;
6874             }
6875             break;
6876         default:
6877             abort();
6878         }
6879         z.high &= ~ roundBitsMask;
6880     }
6881     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6882         status->float_exception_flags |= float_flag_inexact;
6883     }
6884     return z;
6885 
6886 }
6887 
6888 /*----------------------------------------------------------------------------
6889 | Returns the result of adding the absolute values of the quadruple-precision
6890 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6891 | before being returned.  `zSign' is ignored if the result is a NaN.
6892 | The addition is performed according to the IEC/IEEE Standard for Binary
6893 | Floating-Point Arithmetic.
6894 *----------------------------------------------------------------------------*/
6895 
6896 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6897                                 float_status *status)
6898 {
6899     int32_t aExp, bExp, zExp;
6900     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6901     int32_t expDiff;
6902 
6903     aSig1 = extractFloat128Frac1( a );
6904     aSig0 = extractFloat128Frac0( a );
6905     aExp = extractFloat128Exp( a );
6906     bSig1 = extractFloat128Frac1( b );
6907     bSig0 = extractFloat128Frac0( b );
6908     bExp = extractFloat128Exp( b );
6909     expDiff = aExp - bExp;
6910     if ( 0 < expDiff ) {
6911         if ( aExp == 0x7FFF ) {
6912             if (aSig0 | aSig1) {
6913                 return propagateFloat128NaN(a, b, status);
6914             }
6915             return a;
6916         }
6917         if ( bExp == 0 ) {
6918             --expDiff;
6919         }
6920         else {
6921             bSig0 |= LIT64( 0x0001000000000000 );
6922         }
6923         shift128ExtraRightJamming(
6924             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6925         zExp = aExp;
6926     }
6927     else if ( expDiff < 0 ) {
6928         if ( bExp == 0x7FFF ) {
6929             if (bSig0 | bSig1) {
6930                 return propagateFloat128NaN(a, b, status);
6931             }
6932             return packFloat128( zSign, 0x7FFF, 0, 0 );
6933         }
6934         if ( aExp == 0 ) {
6935             ++expDiff;
6936         }
6937         else {
6938             aSig0 |= LIT64( 0x0001000000000000 );
6939         }
6940         shift128ExtraRightJamming(
6941             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6942         zExp = bExp;
6943     }
6944     else {
6945         if ( aExp == 0x7FFF ) {
6946             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6947                 return propagateFloat128NaN(a, b, status);
6948             }
6949             return a;
6950         }
6951         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6952         if ( aExp == 0 ) {
6953             if (status->flush_to_zero) {
6954                 if (zSig0 | zSig1) {
6955                     float_raise(float_flag_output_denormal, status);
6956                 }
6957                 return packFloat128(zSign, 0, 0, 0);
6958             }
6959             return packFloat128( zSign, 0, zSig0, zSig1 );
6960         }
6961         zSig2 = 0;
6962         zSig0 |= LIT64( 0x0002000000000000 );
6963         zExp = aExp;
6964         goto shiftRight1;
6965     }
6966     aSig0 |= LIT64( 0x0001000000000000 );
6967     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6968     --zExp;
6969     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6970     ++zExp;
6971  shiftRight1:
6972     shift128ExtraRightJamming(
6973         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6974  roundAndPack:
6975     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6976 
6977 }
6978 
6979 /*----------------------------------------------------------------------------
6980 | Returns the result of subtracting the absolute values of the quadruple-
6981 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6982 | difference is negated before being returned.  `zSign' is ignored if the
6983 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6984 | Standard for Binary Floating-Point Arithmetic.
6985 *----------------------------------------------------------------------------*/
6986 
6987 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6988                                 float_status *status)
6989 {
6990     int32_t aExp, bExp, zExp;
6991     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6992     int32_t expDiff;
6993 
6994     aSig1 = extractFloat128Frac1( a );
6995     aSig0 = extractFloat128Frac0( a );
6996     aExp = extractFloat128Exp( a );
6997     bSig1 = extractFloat128Frac1( b );
6998     bSig0 = extractFloat128Frac0( b );
6999     bExp = extractFloat128Exp( b );
7000     expDiff = aExp - bExp;
7001     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
7002     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
7003     if ( 0 < expDiff ) goto aExpBigger;
7004     if ( expDiff < 0 ) goto bExpBigger;
7005     if ( aExp == 0x7FFF ) {
7006         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7007             return propagateFloat128NaN(a, b, status);
7008         }
7009         float_raise(float_flag_invalid, status);
7010         return float128_default_nan(status);
7011     }
7012     if ( aExp == 0 ) {
7013         aExp = 1;
7014         bExp = 1;
7015     }
7016     if ( bSig0 < aSig0 ) goto aBigger;
7017     if ( aSig0 < bSig0 ) goto bBigger;
7018     if ( bSig1 < aSig1 ) goto aBigger;
7019     if ( aSig1 < bSig1 ) goto bBigger;
7020     return packFloat128(status->float_rounding_mode == float_round_down,
7021                         0, 0, 0);
7022  bExpBigger:
7023     if ( bExp == 0x7FFF ) {
7024         if (bSig0 | bSig1) {
7025             return propagateFloat128NaN(a, b, status);
7026         }
7027         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
7028     }
7029     if ( aExp == 0 ) {
7030         ++expDiff;
7031     }
7032     else {
7033         aSig0 |= LIT64( 0x4000000000000000 );
7034     }
7035     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7036     bSig0 |= LIT64( 0x4000000000000000 );
7037  bBigger:
7038     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
7039     zExp = bExp;
7040     zSign ^= 1;
7041     goto normalizeRoundAndPack;
7042  aExpBigger:
7043     if ( aExp == 0x7FFF ) {
7044         if (aSig0 | aSig1) {
7045             return propagateFloat128NaN(a, b, status);
7046         }
7047         return a;
7048     }
7049     if ( bExp == 0 ) {
7050         --expDiff;
7051     }
7052     else {
7053         bSig0 |= LIT64( 0x4000000000000000 );
7054     }
7055     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
7056     aSig0 |= LIT64( 0x4000000000000000 );
7057  aBigger:
7058     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7059     zExp = aExp;
7060  normalizeRoundAndPack:
7061     --zExp;
7062     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
7063                                          status);
7064 
7065 }
7066 
7067 /*----------------------------------------------------------------------------
7068 | Returns the result of adding the quadruple-precision floating-point values
7069 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
7070 | for Binary Floating-Point Arithmetic.
7071 *----------------------------------------------------------------------------*/
7072 
7073 float128 float128_add(float128 a, float128 b, float_status *status)
7074 {
7075     flag aSign, bSign;
7076 
7077     aSign = extractFloat128Sign( a );
7078     bSign = extractFloat128Sign( b );
7079     if ( aSign == bSign ) {
7080         return addFloat128Sigs(a, b, aSign, status);
7081     }
7082     else {
7083         return subFloat128Sigs(a, b, aSign, status);
7084     }
7085 
7086 }
7087 
7088 /*----------------------------------------------------------------------------
7089 | Returns the result of subtracting the quadruple-precision floating-point
7090 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7091 | Standard for Binary Floating-Point Arithmetic.
7092 *----------------------------------------------------------------------------*/
7093 
7094 float128 float128_sub(float128 a, float128 b, float_status *status)
7095 {
7096     flag aSign, bSign;
7097 
7098     aSign = extractFloat128Sign( a );
7099     bSign = extractFloat128Sign( b );
7100     if ( aSign == bSign ) {
7101         return subFloat128Sigs(a, b, aSign, status);
7102     }
7103     else {
7104         return addFloat128Sigs(a, b, aSign, status);
7105     }
7106 
7107 }
7108 
7109 /*----------------------------------------------------------------------------
7110 | Returns the result of multiplying the quadruple-precision floating-point
7111 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7112 | Standard for Binary Floating-Point Arithmetic.
7113 *----------------------------------------------------------------------------*/
7114 
7115 float128 float128_mul(float128 a, float128 b, float_status *status)
7116 {
7117     flag aSign, bSign, zSign;
7118     int32_t aExp, bExp, zExp;
7119     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
7120 
7121     aSig1 = extractFloat128Frac1( a );
7122     aSig0 = extractFloat128Frac0( a );
7123     aExp = extractFloat128Exp( a );
7124     aSign = extractFloat128Sign( a );
7125     bSig1 = extractFloat128Frac1( b );
7126     bSig0 = extractFloat128Frac0( b );
7127     bExp = extractFloat128Exp( b );
7128     bSign = extractFloat128Sign( b );
7129     zSign = aSign ^ bSign;
7130     if ( aExp == 0x7FFF ) {
7131         if (    ( aSig0 | aSig1 )
7132              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7133             return propagateFloat128NaN(a, b, status);
7134         }
7135         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
7136         return packFloat128( zSign, 0x7FFF, 0, 0 );
7137     }
7138     if ( bExp == 0x7FFF ) {
7139         if (bSig0 | bSig1) {
7140             return propagateFloat128NaN(a, b, status);
7141         }
7142         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7143  invalid:
7144             float_raise(float_flag_invalid, status);
7145             return float128_default_nan(status);
7146         }
7147         return packFloat128( zSign, 0x7FFF, 0, 0 );
7148     }
7149     if ( aExp == 0 ) {
7150         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7151         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7152     }
7153     if ( bExp == 0 ) {
7154         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7155         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7156     }
7157     zExp = aExp + bExp - 0x4000;
7158     aSig0 |= LIT64( 0x0001000000000000 );
7159     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
7160     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
7161     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
7162     zSig2 |= ( zSig3 != 0 );
7163     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
7164         shift128ExtraRightJamming(
7165             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7166         ++zExp;
7167     }
7168     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7169 
7170 }
7171 
7172 /*----------------------------------------------------------------------------
7173 | Returns the result of dividing the quadruple-precision floating-point value
7174 | `a' by the corresponding value `b'.  The operation is performed according to
7175 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7176 *----------------------------------------------------------------------------*/
7177 
7178 float128 float128_div(float128 a, float128 b, float_status *status)
7179 {
7180     flag aSign, bSign, zSign;
7181     int32_t aExp, bExp, zExp;
7182     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7183     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7184 
7185     aSig1 = extractFloat128Frac1( a );
7186     aSig0 = extractFloat128Frac0( a );
7187     aExp = extractFloat128Exp( a );
7188     aSign = extractFloat128Sign( a );
7189     bSig1 = extractFloat128Frac1( b );
7190     bSig0 = extractFloat128Frac0( b );
7191     bExp = extractFloat128Exp( b );
7192     bSign = extractFloat128Sign( b );
7193     zSign = aSign ^ bSign;
7194     if ( aExp == 0x7FFF ) {
7195         if (aSig0 | aSig1) {
7196             return propagateFloat128NaN(a, b, status);
7197         }
7198         if ( bExp == 0x7FFF ) {
7199             if (bSig0 | bSig1) {
7200                 return propagateFloat128NaN(a, b, status);
7201             }
7202             goto invalid;
7203         }
7204         return packFloat128( zSign, 0x7FFF, 0, 0 );
7205     }
7206     if ( bExp == 0x7FFF ) {
7207         if (bSig0 | bSig1) {
7208             return propagateFloat128NaN(a, b, status);
7209         }
7210         return packFloat128( zSign, 0, 0, 0 );
7211     }
7212     if ( bExp == 0 ) {
7213         if ( ( bSig0 | bSig1 ) == 0 ) {
7214             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7215  invalid:
7216                 float_raise(float_flag_invalid, status);
7217                 return float128_default_nan(status);
7218             }
7219             float_raise(float_flag_divbyzero, status);
7220             return packFloat128( zSign, 0x7FFF, 0, 0 );
7221         }
7222         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7223     }
7224     if ( aExp == 0 ) {
7225         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7226         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7227     }
7228     zExp = aExp - bExp + 0x3FFD;
7229     shortShift128Left(
7230         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
7231     shortShift128Left(
7232         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
7233     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
7234         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
7235         ++zExp;
7236     }
7237     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
7238     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
7239     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
7240     while ( (int64_t) rem0 < 0 ) {
7241         --zSig0;
7242         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
7243     }
7244     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
7245     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
7246         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
7247         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
7248         while ( (int64_t) rem1 < 0 ) {
7249             --zSig1;
7250             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
7251         }
7252         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7253     }
7254     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
7255     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7256 
7257 }
7258 
7259 /*----------------------------------------------------------------------------
7260 | Returns the remainder of the quadruple-precision floating-point value `a'
7261 | with respect to the corresponding value `b'.  The operation is performed
7262 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7263 *----------------------------------------------------------------------------*/
7264 
7265 float128 float128_rem(float128 a, float128 b, float_status *status)
7266 {
7267     flag aSign, zSign;
7268     int32_t aExp, bExp, expDiff;
7269     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
7270     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
7271     int64_t sigMean0;
7272 
7273     aSig1 = extractFloat128Frac1( a );
7274     aSig0 = extractFloat128Frac0( a );
7275     aExp = extractFloat128Exp( a );
7276     aSign = extractFloat128Sign( a );
7277     bSig1 = extractFloat128Frac1( b );
7278     bSig0 = extractFloat128Frac0( b );
7279     bExp = extractFloat128Exp( b );
7280     if ( aExp == 0x7FFF ) {
7281         if (    ( aSig0 | aSig1 )
7282              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7283             return propagateFloat128NaN(a, b, status);
7284         }
7285         goto invalid;
7286     }
7287     if ( bExp == 0x7FFF ) {
7288         if (bSig0 | bSig1) {
7289             return propagateFloat128NaN(a, b, status);
7290         }
7291         return a;
7292     }
7293     if ( bExp == 0 ) {
7294         if ( ( bSig0 | bSig1 ) == 0 ) {
7295  invalid:
7296             float_raise(float_flag_invalid, status);
7297             return float128_default_nan(status);
7298         }
7299         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7300     }
7301     if ( aExp == 0 ) {
7302         if ( ( aSig0 | aSig1 ) == 0 ) return a;
7303         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7304     }
7305     expDiff = aExp - bExp;
7306     if ( expDiff < -1 ) return a;
7307     shortShift128Left(
7308         aSig0 | LIT64( 0x0001000000000000 ),
7309         aSig1,
7310         15 - ( expDiff < 0 ),
7311         &aSig0,
7312         &aSig1
7313     );
7314     shortShift128Left(
7315         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
7316     q = le128( bSig0, bSig1, aSig0, aSig1 );
7317     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7318     expDiff -= 64;
7319     while ( 0 < expDiff ) {
7320         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7321         q = ( 4 < q ) ? q - 4 : 0;
7322         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7323         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
7324         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
7325         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
7326         expDiff -= 61;
7327     }
7328     if ( -64 < expDiff ) {
7329         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7330         q = ( 4 < q ) ? q - 4 : 0;
7331         q >>= - expDiff;
7332         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7333         expDiff += 52;
7334         if ( expDiff < 0 ) {
7335             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7336         }
7337         else {
7338             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
7339         }
7340         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7341         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
7342     }
7343     else {
7344         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
7345         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7346     }
7347     do {
7348         alternateASig0 = aSig0;
7349         alternateASig1 = aSig1;
7350         ++q;
7351         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7352     } while ( 0 <= (int64_t) aSig0 );
7353     add128(
7354         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
7355     if (    ( sigMean0 < 0 )
7356          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
7357         aSig0 = alternateASig0;
7358         aSig1 = alternateASig1;
7359     }
7360     zSign = ( (int64_t) aSig0 < 0 );
7361     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
7362     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
7363                                          status);
7364 }
7365 
7366 /*----------------------------------------------------------------------------
7367 | Returns the square root of the quadruple-precision floating-point value `a'.
7368 | The operation is performed according to the IEC/IEEE Standard for Binary
7369 | Floating-Point Arithmetic.
7370 *----------------------------------------------------------------------------*/
7371 
7372 float128 float128_sqrt(float128 a, float_status *status)
7373 {
7374     flag aSign;
7375     int32_t aExp, zExp;
7376     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
7377     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7378 
7379     aSig1 = extractFloat128Frac1( a );
7380     aSig0 = extractFloat128Frac0( a );
7381     aExp = extractFloat128Exp( a );
7382     aSign = extractFloat128Sign( a );
7383     if ( aExp == 0x7FFF ) {
7384         if (aSig0 | aSig1) {
7385             return propagateFloat128NaN(a, a, status);
7386         }
7387         if ( ! aSign ) return a;
7388         goto invalid;
7389     }
7390     if ( aSign ) {
7391         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
7392  invalid:
7393         float_raise(float_flag_invalid, status);
7394         return float128_default_nan(status);
7395     }
7396     if ( aExp == 0 ) {
7397         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7398         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7399     }
7400     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7401     aSig0 |= LIT64( 0x0001000000000000 );
7402     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7403     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7404     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7405     doubleZSig0 = zSig0<<1;
7406     mul64To128( zSig0, zSig0, &term0, &term1 );
7407     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7408     while ( (int64_t) rem0 < 0 ) {
7409         --zSig0;
7410         doubleZSig0 -= 2;
7411         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7412     }
7413     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7414     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7415         if ( zSig1 == 0 ) zSig1 = 1;
7416         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7417         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7418         mul64To128( zSig1, zSig1, &term2, &term3 );
7419         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7420         while ( (int64_t) rem1 < 0 ) {
7421             --zSig1;
7422             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7423             term3 |= 1;
7424             term2 |= doubleZSig0;
7425             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7426         }
7427         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7428     }
7429     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7430     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7431 
7432 }
7433 
7434 /*----------------------------------------------------------------------------
7435 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7436 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7437 | raised if either operand is a NaN.  Otherwise, the comparison is performed
7438 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7439 *----------------------------------------------------------------------------*/
7440 
7441 int float128_eq(float128 a, float128 b, float_status *status)
7442 {
7443 
7444     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7445               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7446          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7447               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7448        ) {
7449         float_raise(float_flag_invalid, status);
7450         return 0;
7451     }
7452     return
7453            ( a.low == b.low )
7454         && (    ( a.high == b.high )
7455              || (    ( a.low == 0 )
7456                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7457            );
7458 
7459 }
7460 
7461 /*----------------------------------------------------------------------------
7462 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7463 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
7464 | exception is raised if either operand is a NaN.  The comparison is performed
7465 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7466 *----------------------------------------------------------------------------*/
7467 
7468 int float128_le(float128 a, float128 b, float_status *status)
7469 {
7470     flag aSign, bSign;
7471 
7472     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7473               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7474          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7475               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7476        ) {
7477         float_raise(float_flag_invalid, status);
7478         return 0;
7479     }
7480     aSign = extractFloat128Sign( a );
7481     bSign = extractFloat128Sign( b );
7482     if ( aSign != bSign ) {
7483         return
7484                aSign
7485             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7486                  == 0 );
7487     }
7488     return
7489           aSign ? le128( b.high, b.low, a.high, a.low )
7490         : le128( a.high, a.low, b.high, b.low );
7491 
7492 }
7493 
7494 /*----------------------------------------------------------------------------
7495 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7496 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7497 | raised if either operand is a NaN.  The comparison is performed according
7498 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7499 *----------------------------------------------------------------------------*/
7500 
7501 int float128_lt(float128 a, float128 b, float_status *status)
7502 {
7503     flag aSign, bSign;
7504 
7505     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7506               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7507          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7508               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7509        ) {
7510         float_raise(float_flag_invalid, status);
7511         return 0;
7512     }
7513     aSign = extractFloat128Sign( a );
7514     bSign = extractFloat128Sign( b );
7515     if ( aSign != bSign ) {
7516         return
7517                aSign
7518             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7519                  != 0 );
7520     }
7521     return
7522           aSign ? lt128( b.high, b.low, a.high, a.low )
7523         : lt128( a.high, a.low, b.high, b.low );
7524 
7525 }
7526 
7527 /*----------------------------------------------------------------------------
7528 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7529 | be compared, and 0 otherwise.  The invalid exception is raised if either
7530 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7531 | Standard for Binary Floating-Point Arithmetic.
7532 *----------------------------------------------------------------------------*/
7533 
7534 int float128_unordered(float128 a, float128 b, float_status *status)
7535 {
7536     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7537               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7538          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7539               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7540        ) {
7541         float_raise(float_flag_invalid, status);
7542         return 1;
7543     }
7544     return 0;
7545 }
7546 
7547 /*----------------------------------------------------------------------------
7548 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7549 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7550 | exception.  The comparison is performed according to the IEC/IEEE Standard
7551 | for Binary Floating-Point Arithmetic.
7552 *----------------------------------------------------------------------------*/
7553 
7554 int float128_eq_quiet(float128 a, float128 b, float_status *status)
7555 {
7556 
7557     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7558               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7559          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7560               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7561        ) {
7562         if (float128_is_signaling_nan(a, status)
7563          || float128_is_signaling_nan(b, status)) {
7564             float_raise(float_flag_invalid, status);
7565         }
7566         return 0;
7567     }
7568     return
7569            ( a.low == b.low )
7570         && (    ( a.high == b.high )
7571              || (    ( a.low == 0 )
7572                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7573            );
7574 
7575 }
7576 
7577 /*----------------------------------------------------------------------------
7578 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7579 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
7580 | cause an exception.  Otherwise, the comparison is performed according to the
7581 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7582 *----------------------------------------------------------------------------*/
7583 
7584 int float128_le_quiet(float128 a, float128 b, float_status *status)
7585 {
7586     flag aSign, bSign;
7587 
7588     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7589               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7590          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7591               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7592        ) {
7593         if (float128_is_signaling_nan(a, status)
7594          || float128_is_signaling_nan(b, status)) {
7595             float_raise(float_flag_invalid, status);
7596         }
7597         return 0;
7598     }
7599     aSign = extractFloat128Sign( a );
7600     bSign = extractFloat128Sign( b );
7601     if ( aSign != bSign ) {
7602         return
7603                aSign
7604             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7605                  == 0 );
7606     }
7607     return
7608           aSign ? le128( b.high, b.low, a.high, a.low )
7609         : le128( a.high, a.low, b.high, b.low );
7610 
7611 }
7612 
7613 /*----------------------------------------------------------------------------
7614 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7615 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7616 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7617 | Standard for Binary Floating-Point Arithmetic.
7618 *----------------------------------------------------------------------------*/
7619 
7620 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7621 {
7622     flag aSign, bSign;
7623 
7624     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7625               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7626          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7627               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7628        ) {
7629         if (float128_is_signaling_nan(a, status)
7630          || float128_is_signaling_nan(b, status)) {
7631             float_raise(float_flag_invalid, status);
7632         }
7633         return 0;
7634     }
7635     aSign = extractFloat128Sign( a );
7636     bSign = extractFloat128Sign( b );
7637     if ( aSign != bSign ) {
7638         return
7639                aSign
7640             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7641                  != 0 );
7642     }
7643     return
7644           aSign ? lt128( b.high, b.low, a.high, a.low )
7645         : lt128( a.high, a.low, b.high, b.low );
7646 
7647 }
7648 
7649 /*----------------------------------------------------------------------------
7650 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7651 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7652 | comparison is performed according to the IEC/IEEE Standard for Binary
7653 | Floating-Point Arithmetic.
7654 *----------------------------------------------------------------------------*/
7655 
7656 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7657 {
7658     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7659               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7660          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7661               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7662        ) {
7663         if (float128_is_signaling_nan(a, status)
7664          || float128_is_signaling_nan(b, status)) {
7665             float_raise(float_flag_invalid, status);
7666         }
7667         return 1;
7668     }
7669     return 0;
7670 }
7671 
7672 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7673                                             int is_quiet, float_status *status)
7674 {
7675     flag aSign, bSign;
7676 
7677     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7678         float_raise(float_flag_invalid, status);
7679         return float_relation_unordered;
7680     }
7681     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7682           ( extractFloatx80Frac( a )<<1 ) ) ||
7683         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7684           ( extractFloatx80Frac( b )<<1 ) )) {
7685         if (!is_quiet ||
7686             floatx80_is_signaling_nan(a, status) ||
7687             floatx80_is_signaling_nan(b, status)) {
7688             float_raise(float_flag_invalid, status);
7689         }
7690         return float_relation_unordered;
7691     }
7692     aSign = extractFloatx80Sign( a );
7693     bSign = extractFloatx80Sign( b );
7694     if ( aSign != bSign ) {
7695 
7696         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7697              ( ( a.low | b.low ) == 0 ) ) {
7698             /* zero case */
7699             return float_relation_equal;
7700         } else {
7701             return 1 - (2 * aSign);
7702         }
7703     } else {
7704         if (a.low == b.low && a.high == b.high) {
7705             return float_relation_equal;
7706         } else {
7707             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7708         }
7709     }
7710 }
7711 
7712 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7713 {
7714     return floatx80_compare_internal(a, b, 0, status);
7715 }
7716 
7717 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7718 {
7719     return floatx80_compare_internal(a, b, 1, status);
7720 }
7721 
7722 static inline int float128_compare_internal(float128 a, float128 b,
7723                                             int is_quiet, float_status *status)
7724 {
7725     flag aSign, bSign;
7726 
7727     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7728           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7729         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7730           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7731         if (!is_quiet ||
7732             float128_is_signaling_nan(a, status) ||
7733             float128_is_signaling_nan(b, status)) {
7734             float_raise(float_flag_invalid, status);
7735         }
7736         return float_relation_unordered;
7737     }
7738     aSign = extractFloat128Sign( a );
7739     bSign = extractFloat128Sign( b );
7740     if ( aSign != bSign ) {
7741         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7742             /* zero case */
7743             return float_relation_equal;
7744         } else {
7745             return 1 - (2 * aSign);
7746         }
7747     } else {
7748         if (a.low == b.low && a.high == b.high) {
7749             return float_relation_equal;
7750         } else {
7751             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7752         }
7753     }
7754 }
7755 
7756 int float128_compare(float128 a, float128 b, float_status *status)
7757 {
7758     return float128_compare_internal(a, b, 0, status);
7759 }
7760 
7761 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7762 {
7763     return float128_compare_internal(a, b, 1, status);
7764 }
7765 
7766 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7767 {
7768     flag aSign;
7769     int32_t aExp;
7770     uint64_t aSig;
7771 
7772     if (floatx80_invalid_encoding(a)) {
7773         float_raise(float_flag_invalid, status);
7774         return floatx80_default_nan(status);
7775     }
7776     aSig = extractFloatx80Frac( a );
7777     aExp = extractFloatx80Exp( a );
7778     aSign = extractFloatx80Sign( a );
7779 
7780     if ( aExp == 0x7FFF ) {
7781         if ( aSig<<1 ) {
7782             return propagateFloatx80NaN(a, a, status);
7783         }
7784         return a;
7785     }
7786 
7787     if (aExp == 0) {
7788         if (aSig == 0) {
7789             return a;
7790         }
7791         aExp++;
7792     }
7793 
7794     if (n > 0x10000) {
7795         n = 0x10000;
7796     } else if (n < -0x10000) {
7797         n = -0x10000;
7798     }
7799 
7800     aExp += n;
7801     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7802                                          aSign, aExp, aSig, 0, status);
7803 }
7804 
7805 float128 float128_scalbn(float128 a, int n, float_status *status)
7806 {
7807     flag aSign;
7808     int32_t aExp;
7809     uint64_t aSig0, aSig1;
7810 
7811     aSig1 = extractFloat128Frac1( a );
7812     aSig0 = extractFloat128Frac0( a );
7813     aExp = extractFloat128Exp( a );
7814     aSign = extractFloat128Sign( a );
7815     if ( aExp == 0x7FFF ) {
7816         if ( aSig0 | aSig1 ) {
7817             return propagateFloat128NaN(a, a, status);
7818         }
7819         return a;
7820     }
7821     if (aExp != 0) {
7822         aSig0 |= LIT64( 0x0001000000000000 );
7823     } else if (aSig0 == 0 && aSig1 == 0) {
7824         return a;
7825     } else {
7826         aExp++;
7827     }
7828 
7829     if (n > 0x10000) {
7830         n = 0x10000;
7831     } else if (n < -0x10000) {
7832         n = -0x10000;
7833     }
7834 
7835     aExp += n - 1;
7836     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7837                                          , status);
7838 
7839 }
7840