xref: /qemu/fpu/softfloat.c (revision f131bae8a7b7ed1928cc94c69df291db609c316a)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89 
90 /* We only need stdlib for abort() */
91 
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations.  (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98 
99 /*
100  * Hardfloat
101  *
102  * Fast emulation of guest FP instructions is challenging for two reasons.
103  * First, FP instruction semantics are similar but not identical, particularly
104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
105  * exception flags is not trivial: reading the host's flags register with a
106  * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
 * and trapping on every FP exception is neither fast nor pleasant to work
 * with.
108  *
109  * We address these challenges by leveraging the host FPU for a subset of the
110  * operations. To do this we expand on the idea presented in this paper:
111  *
112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114  *
115  * The idea is thus to leverage the host FPU to (1) compute FP operations
116  * and (2) identify whether FP exceptions occurred while avoiding
117  * expensive exception flag register accesses.
118  *
119  * An important optimization shown in the paper is that given that exception
120  * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121  * This is particularly useful for the inexact flag, which is very frequently
122  * raised in floating-point workloads.
123  *
124  * We optimize the code further by deferring to soft-fp whenever FP exception
125  * detection might get hairy. Two examples: (1) when at least one operand is
126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127  * and the result is < the minimum normal.
128  */
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
130     static inline void name(soft_t *a, float_status *s)                 \
131     {                                                                   \
132         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
133             *a = soft_t ## _set_sign(soft_t ## _zero,                   \
134                                      soft_t ## _is_neg(*a));            \
135             s->float_exception_flags |= float_flag_input_denormal;      \
136         }                                                               \
137     }
138 
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
141 #undef GEN_INPUT_FLUSH__NOCHECK
142 
143 #define GEN_INPUT_FLUSH1(name, soft_t)                  \
144     static inline void name(soft_t *a, float_status *s) \
145     {                                                   \
146         if (likely(!s->flush_inputs_to_zero)) {         \
147             return;                                     \
148         }                                               \
149         soft_t ## _input_flush__nocheck(a, s);          \
150     }
151 
152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
154 #undef GEN_INPUT_FLUSH1
155 
156 #define GEN_INPUT_FLUSH2(name, soft_t)                                  \
157     static inline void name(soft_t *a, soft_t *b, float_status *s)      \
158     {                                                                   \
159         if (likely(!s->flush_inputs_to_zero)) {                         \
160             return;                                                     \
161         }                                                               \
162         soft_t ## _input_flush__nocheck(a, s);                          \
163         soft_t ## _input_flush__nocheck(b, s);                          \
164     }
165 
166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
168 #undef GEN_INPUT_FLUSH2
169 
170 #define GEN_INPUT_FLUSH3(name, soft_t)                                  \
171     static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
172     {                                                                   \
173         if (likely(!s->flush_inputs_to_zero)) {                         \
174             return;                                                     \
175         }                                                               \
176         soft_t ## _input_flush__nocheck(a, s);                          \
177         soft_t ## _input_flush__nocheck(b, s);                          \
178         soft_t ## _input_flush__nocheck(c, s);                          \
179     }
180 
181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
183 #undef GEN_INPUT_FLUSH3
184 
/*
 * Choose whether to use fpclassify (1) or the float32/64_* primitives (0) in
 * the generated hardfloat functions. Each combination of number of inputs
 * and float size gets its own value.
 *
 * The float32 variants never use fpclassify; the float64 variants use it
 * only when building for an x86_64 host.
 */
#define QEMU_HARDFLOAT_1F32_USE_FP 0
#define QEMU_HARDFLOAT_2F32_USE_FP 0
#define QEMU_HARDFLOAT_3F32_USE_FP 0

#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif
205 
/*
 * QEMU_HARDFLOAT_USE_ISINF selects isinf() (1) over
 * float{32,64}_is_infinity (0) when !USE_FP.
 * On x86_64/aarch64, isinf() can yield a ~6% speedup; on power64 however it
 * reduces fp-bench performance by up to 50%, so every other host keeps the
 * softfloat primitive.
 */
#if !defined(__x86_64__) && !defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF   0
#else
# define QEMU_HARDFLOAT_USE_ISINF   1
#endif
217 
/*
 * Some targets clear the FP flags before most FP operations. This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set. Building with -ffast-math disables hardfloat as well (with a
 * compile-time warning).
 *
 * QEMU_NO_HARDFLOAT is 1 when hardfloat is disabled and 0 otherwise;
 * QEMU_SOFTFLOAT_ATTR additionally carries noinline only when hardfloat is
 * enabled.
 */
#if defined(__FAST_MATH__)
# warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
#endif

#if !defined(TARGET_PPC) && !defined(__FAST_MATH__)
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#else
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#endif
234 
235 static inline bool can_use_fpu(const float_status *s)
236 {
237     if (QEMU_NO_HARDFLOAT) {
238         return false;
239     }
240     return likely(s->float_exception_flags & float_flag_inexact &&
241                   s->float_rounding_mode == float_round_nearest_even);
242 }
243 
244 /*
245  * Hardfloat generation functions. Each operation can have two flavors:
246  * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247  * most condition checks, or native ones (e.g. fpclassify).
248  *
249  * The flavor is chosen by the callers. Instead of using macros, we rely on the
250  * compiler to propagate constants and inline everything into the callers.
251  *
252  * We only generate functions for operations with two inputs, since only
253  * these are common enough to justify consolidating them into common code.
254  */
255 
/*
 * Overlay of a softfloat float32 and a host float, letting the hardfloat
 * code hand the same value either to softfloat primitives (.s) or to host
 * FP operations (.h).
 */
typedef union {
    float32 s;  /* softfloat view */
    float h;    /* host view */
} union_float32;
260 
/*
 * Overlay of a softfloat float64 and a host double, letting the hardfloat
 * code hand the same value either to softfloat primitives (.s) or to host
 * FP operations (.h).
 */
typedef union {
    float64 s;  /* softfloat view */
    double h;   /* host view */
} union_float64;
265 
/* Two-operand predicates (used as the pre/post/fast_test hooks below). */
typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

/* Two-operand softfloat operations; these take the float_status. */
typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
/* Two-operand host operations on native float/double. */
typedef float   (*hard_f32_op2_fn)(float a, float b);
typedef double  (*hard_f64_op2_fn)(double a, double b);
273 
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
278         /*
279          * Not using a temp variable for consecutive fpclassify calls ends up
280          * generating faster code.
281          */
282         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284     }
285     return float32_is_zero_or_normal(a.s) &&
286            float32_is_zero_or_normal(b.s);
287 }
288 
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291     if (QEMU_HARDFLOAT_2F64_USE_FP) {
292         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294     }
295     return float64_is_zero_or_normal(a.s) &&
296            float64_is_zero_or_normal(b.s);
297 }
298 
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303     if (QEMU_HARDFLOAT_3F32_USE_FP) {
304         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307     }
308     return float32_is_zero_or_normal(a.s) &&
309            float32_is_zero_or_normal(b.s) &&
310            float32_is_zero_or_normal(c.s);
311 }
312 
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316     if (QEMU_HARDFLOAT_3F64_USE_FP) {
317         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320     }
321     return float64_is_zero_or_normal(a.s) &&
322            float64_is_zero_or_normal(b.s) &&
323            float64_is_zero_or_normal(c.s);
324 }
325 
326 static inline bool f32_is_inf(union_float32 a)
327 {
328     if (QEMU_HARDFLOAT_USE_ISINF) {
329         return isinf(a.h);
330     }
331     return float32_is_infinity(a.s);
332 }
333 
334 static inline bool f64_is_inf(union_float64 a)
335 {
336     if (QEMU_HARDFLOAT_USE_ISINF) {
337         return isinf(a.h);
338     }
339     return float64_is_infinity(a.s);
340 }
341 
342 /* Note: @fast_test and @post can be NULL */
343 static inline float32
344 float32_gen2(float32 xa, float32 xb, float_status *s,
345              hard_f32_op2_fn hard, soft_f32_op2_fn soft,
346              f32_check_fn pre, f32_check_fn post,
347              f32_check_fn fast_test, soft_f32_op2_fn fast_op)
348 {
349     union_float32 ua, ub, ur;
350 
351     ua.s = xa;
352     ub.s = xb;
353 
354     if (unlikely(!can_use_fpu(s))) {
355         goto soft;
356     }
357 
358     float32_input_flush2(&ua.s, &ub.s, s);
359     if (unlikely(!pre(ua, ub))) {
360         goto soft;
361     }
362     if (fast_test && fast_test(ua, ub)) {
363         return fast_op(ua.s, ub.s, s);
364     }
365 
366     ur.h = hard(ua.h, ub.h);
367     if (unlikely(f32_is_inf(ur))) {
368         s->float_exception_flags |= float_flag_overflow;
369     } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
370         if (post == NULL || post(ua, ub)) {
371             goto soft;
372         }
373     }
374     return ur.s;
375 
376  soft:
377     return soft(ua.s, ub.s, s);
378 }
379 
380 static inline float64
381 float64_gen2(float64 xa, float64 xb, float_status *s,
382              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
383              f64_check_fn pre, f64_check_fn post,
384              f64_check_fn fast_test, soft_f64_op2_fn fast_op)
385 {
386     union_float64 ua, ub, ur;
387 
388     ua.s = xa;
389     ub.s = xb;
390 
391     if (unlikely(!can_use_fpu(s))) {
392         goto soft;
393     }
394 
395     float64_input_flush2(&ua.s, &ub.s, s);
396     if (unlikely(!pre(ua, ub))) {
397         goto soft;
398     }
399     if (fast_test && fast_test(ua, ub)) {
400         return fast_op(ua.s, ub.s, s);
401     }
402 
403     ur.h = hard(ua.h, ub.h);
404     if (unlikely(f64_is_inf(ur))) {
405         s->float_exception_flags |= float_flag_overflow;
406     } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
407         if (post == NULL || post(ua, ub)) {
408             goto soft;
409         }
410     }
411     return ur.s;
412 
413  soft:
414     return soft(ua.s, ub.s, s);
415 }
416 
417 /*----------------------------------------------------------------------------
418 | Returns the fraction bits of the half-precision floating-point value `a'.
419 *----------------------------------------------------------------------------*/
420 
421 static inline uint32_t extractFloat16Frac(float16 a)
422 {
423     return float16_val(a) & 0x3ff;
424 }
425 
426 /*----------------------------------------------------------------------------
427 | Returns the exponent bits of the half-precision floating-point value `a'.
428 *----------------------------------------------------------------------------*/
429 
430 static inline int extractFloat16Exp(float16 a)
431 {
432     return (float16_val(a) >> 10) & 0x1f;
433 }
434 
435 /*----------------------------------------------------------------------------
436 | Returns the fraction bits of the single-precision floating-point value `a'.
437 *----------------------------------------------------------------------------*/
438 
439 static inline uint32_t extractFloat32Frac(float32 a)
440 {
441     return float32_val(a) & 0x007FFFFF;
442 }
443 
444 /*----------------------------------------------------------------------------
445 | Returns the exponent bits of the single-precision floating-point value `a'.
446 *----------------------------------------------------------------------------*/
447 
448 static inline int extractFloat32Exp(float32 a)
449 {
450     return (float32_val(a) >> 23) & 0xFF;
451 }
452 
453 /*----------------------------------------------------------------------------
454 | Returns the sign bit of the single-precision floating-point value `a'.
455 *----------------------------------------------------------------------------*/
456 
457 static inline flag extractFloat32Sign(float32 a)
458 {
459     return float32_val(a) >> 31;
460 }
461 
462 /*----------------------------------------------------------------------------
463 | Returns the fraction bits of the double-precision floating-point value `a'.
464 *----------------------------------------------------------------------------*/
465 
466 static inline uint64_t extractFloat64Frac(float64 a)
467 {
468     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
469 }
470 
471 /*----------------------------------------------------------------------------
472 | Returns the exponent bits of the double-precision floating-point value `a'.
473 *----------------------------------------------------------------------------*/
474 
475 static inline int extractFloat64Exp(float64 a)
476 {
477     return (float64_val(a) >> 52) & 0x7FF;
478 }
479 
480 /*----------------------------------------------------------------------------
481 | Returns the sign bit of the double-precision floating-point value `a'.
482 *----------------------------------------------------------------------------*/
483 
484 static inline flag extractFloat64Sign(float64 a)
485 {
486     return float64_val(a) >> 63;
487 }
488 
/*
 * Classify a floating point number. The NaN classes are deliberately the
 * last enumerators: everything from float_class_qnan onward is a NaN, so
 * cls >= float_class_qnan tests for any NaN. Do not reorder.
 *
 * The enum is packed so that it occupies as little space as possible inside
 * FloatParts.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,  /* raw parts, not yet canonicalized */
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;
502 
503 /* Simple helpers for checking if, or what kind of, NaN we have */
504 static inline __attribute__((unused)) bool is_nan(FloatClass c)
505 {
506     return unlikely(c >= float_class_qnan);
507 }
508 
509 static inline __attribute__((unused)) bool is_snan(FloatClass c)
510 {
511     return c == float_class_snan;
512 }
513 
514 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
515 {
516     return c == float_class_qnan;
517 }
518 
/*
 * Structure holding all of the decomposed parts of a float. The
 * exponent is unbiased and the fraction is normalized. All
 * calculations are done with a 64 bit fraction and then rounded as
 * appropriate for the final format.
 *
 * Thanks to the packed FloatClass a decent compiler should be able to
 * fit the whole structure into registers and avoid using the stack
 * for parameter passing.
 *
 * Note that the raw pack/unpack helpers also use this structure to carry the
 * still-biased exponent field and the unshifted fraction field, with cls set
 * to float_class_unclassified.
 */

typedef struct {
    uint64_t frac;   /* fraction bits */
    int32_t  exp;    /* exponent */
    FloatClass cls;  /* classification of the value */
    bool sign;       /* sign bit: true when set */
} FloatParts;
536 
/*
 * Layout of the 64-bit decomposed fraction: the binary point sits at bit 62,
 * the implicit (leading one) bit is bit 62, and bit 63 is the overflow bit
 * that catches a carry out of the implicit bit.
 */
#define DECOMPOSED_BINARY_POINT    (64 - 2)
#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
#define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
540 
/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;      /* all bits below frac_lsb */
    uint64_t roundeven_mask;  /* round_mask plus frac_lsb itself */
    bool arm_althp;
} FloatFmt;
566 
/*
 * Expand the FloatFmt fields that follow from the size of the exponent (E)
 * and fraction (F) fields, as a list of designated initializers.
 *
 *   exp_bias       = (2^E - 1) / 2
 *   exp_max        = 2^E - 1 (the all-ones exponent field)
 *   frac_shift     = distance from the top of the fraction field to
 *                    DECOMPOSED_BINARY_POINT
 *   frac_lsb       = the fraction's least significant bit once shifted
 *   frac_lsbm1     = the bit just below frac_lsb
 *   round_mask     = all bits below frac_lsb
 *   roundeven_mask = round_mask plus frac_lsb
 *
 * Both arguments are parenthesized at every use so that expression
 * arguments (e.g. FLOAT_PARAMS(4 + 4, 20 + 3)) expand correctly.
 */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size       = (E),                                           \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                         \
    .exp_max        = (1 << (E)) - 1,                                \
    .frac_size      = (F),                                           \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                 \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),       \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1), \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1, \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1
578 
/* Half precision: 5 exponent bits, 10 fraction bits. */
static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

/* Same field sizes as float16_params, with the arm_althp modifier set. */
static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

/* Single precision: 8 exponent bits, 23 fraction bits. */
static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

/* Double precision: 11 exponent bits, 52 fraction bits. */
static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};
595 
596 /* Unpack a float to parts, but do not canonicalize.  */
597 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
598 {
599     const int sign_pos = fmt.frac_size + fmt.exp_size;
600 
601     return (FloatParts) {
602         .cls = float_class_unclassified,
603         .sign = extract64(raw, sign_pos, 1),
604         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
605         .frac = extract64(raw, 0, fmt.frac_size),
606     };
607 }
608 
609 static inline FloatParts float16_unpack_raw(float16 f)
610 {
611     return unpack_raw(float16_params, f);
612 }
613 
614 static inline FloatParts float32_unpack_raw(float32 f)
615 {
616     return unpack_raw(float32_params, f);
617 }
618 
619 static inline FloatParts float64_unpack_raw(float64 f)
620 {
621     return unpack_raw(float64_params, f);
622 }
623 
624 /* Pack a float from parts, but do not canonicalize.  */
625 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
626 {
627     const int sign_pos = fmt.frac_size + fmt.exp_size;
628     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
629     return deposit64(ret, sign_pos, 1, p.sign);
630 }
631 
632 static inline float16 float16_pack_raw(FloatParts p)
633 {
634     return make_float16(pack_raw(float16_params, p));
635 }
636 
637 static inline float32 float32_pack_raw(FloatParts p)
638 {
639     return make_float32(pack_raw(float32_params, p));
640 }
641 
642 static inline float64 float64_pack_raw(FloatParts p)
643 {
644     return make_float64(pack_raw(float64_params, p));
645 }
646 
647 /*----------------------------------------------------------------------------
648 | Functions and definitions to determine:  (1) whether tininess for underflow
649 | is detected before or after rounding by default, (2) what (if anything)
650 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
651 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
652 | are propagated from function inputs to output.  These details are target-
653 | specific.
654 *----------------------------------------------------------------------------*/
655 #include "softfloat-specialize.h"
656 
657 /* Canonicalize EXP and FRAC, setting CLS.  */
658 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
659                                   float_status *status)
660 {
661     if (part.exp == parm->exp_max && !parm->arm_althp) {
662         if (part.frac == 0) {
663             part.cls = float_class_inf;
664         } else {
665             part.frac <<= parm->frac_shift;
666             part.cls = (parts_is_snan_frac(part.frac, status)
667                         ? float_class_snan : float_class_qnan);
668         }
669     } else if (part.exp == 0) {
670         if (likely(part.frac == 0)) {
671             part.cls = float_class_zero;
672         } else if (status->flush_inputs_to_zero) {
673             float_raise(float_flag_input_denormal, status);
674             part.cls = float_class_zero;
675             part.frac = 0;
676         } else {
677             int shift = clz64(part.frac) - 1;
678             part.cls = float_class_normal;
679             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
680             part.frac <<= shift;
681         }
682     } else {
683         part.cls = float_class_normal;
684         part.exp -= parm->exp_bias;
685         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
686     }
687     return part;
688 }
689 
/* Round and uncanonicalize a floating-point number by parts. There
 * are FRAC_SHIFT bits that may require rounding at the bottom of the
 * fraction; these bits will be removed. The exponent will be biased
 * by EXP_BIAS; for a finite result the biased exponent ends up in
 * [0, EXP_MAX-1] (or up to EXP_MAX for arm_althp formats).
 *
 * p:    canonical parts to round
 * s:    float status supplying the rounding mode, flush_to_zero and
 *       tininess-detection settings; receives the accumulated flags
 * parm: parameters of the destination format
 *
 * Returns parts whose exp/frac hold raw field values ready for packing.
 * p.cls may be updated to zero or inf when rounding produces those.
 */

static FloatParts round_canonical(FloatParts p, float_status *s,
                                  const FloatFmt *parm)
{
    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
    const uint64_t round_mask = parm->round_mask;
    const uint64_t roundeven_mask = parm->roundeven_mask;
    const int exp_max = parm->exp_max;
    const int frac_shift = parm->frac_shift;
    uint64_t frac, inc;
    int exp, flags = 0;
    bool overflow_norm;

    frac = p.frac;
    exp = p.exp;

    switch (p.cls) {
    case float_class_normal:
        /*
         * Per rounding mode, pick the increment added to the fraction
         * before the low bits are dropped, and whether an overflow yields
         * the largest finite value (overflow_norm) rather than infinity.
         */
        switch (s->float_rounding_mode) {
        case float_round_nearest_even:
            overflow_norm = false;
            /*
             * Add half an lsb, except when the lsb is clear and the bits
             * below it are exactly one half (a tie on an even value).
             */
            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
            break;
        case float_round_ties_away:
            overflow_norm = false;
            inc = frac_lsbm1;
            break;
        case float_round_to_zero:
            overflow_norm = true;
            inc = 0;
            break;
        case float_round_up:
            inc = p.sign ? 0 : round_mask;
            overflow_norm = p.sign;
            break;
        case float_round_down:
            inc = p.sign ? round_mask : 0;
            overflow_norm = !p.sign;
            break;
        default:
            g_assert_not_reached();
        }

        exp += parm->exp_bias;
        if (likely(exp > 0)) {
            /* Biased exponent is positive: result is normal or overflows. */
            if (frac & round_mask) {
                flags |= float_flag_inexact;
                frac += inc;
                /* Rounding carried out of the implicit bit: renormalize. */
                if (frac & DECOMPOSED_OVERFLOW_BIT) {
                    frac >>= 1;
                    exp++;
                }
            }
            frac >>= frac_shift;

            if (parm->arm_althp) {
                /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
                if (unlikely(exp > exp_max)) {
                    /* Overflow.  Return the maximum normal.  */
                    /* Note: assigns rather than ORs, dropping any inexact. */
                    flags = float_flag_invalid;
                    exp = exp_max;
                    frac = -1;
                }
            } else if (unlikely(exp >= exp_max)) {
                flags |= float_flag_overflow | float_flag_inexact;
                if (overflow_norm) {
                    /* Largest finite value: all-ones fraction. */
                    exp = exp_max - 1;
                    frac = -1;
                } else {
                    p.cls = float_class_inf;
                    goto do_inf;
                }
            }
        } else if (s->flush_to_zero) {
            /* Result would be denormal and outputs are flushed to zero. */
            flags |= float_flag_output_denormal;
            p.cls = float_class_zero;
            goto do_zero;
        } else {
            /*
             * Denormal result. It counts as tiny if tininess is detected
             * before rounding, if the biased exponent is negative, or if
             * rounding would not carry into the overflow bit.
             */
            bool is_tiny = (s->float_detect_tininess
                            == float_tininess_before_rounding)
                        || (exp < 0)
                        || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);

            /*
             * Denormalize by shifting right by 1 - exp.
             * NOTE(review): shift64RightJamming is defined outside this
             * chunk; presumably it keeps shifted-out bits sticky in the
             * lsb -- confirm in softfloat-macros.h.
             */
            shift64RightJamming(frac, 1 - exp, &frac);
            if (frac & round_mask) {
                /* Need to recompute round-to-even.  */
                if (s->float_rounding_mode == float_round_nearest_even) {
                    inc = ((frac & roundeven_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                }
                flags |= float_flag_inexact;
                frac += inc;
            }

            /* Exponent field is 1 if rounding reached the implicit bit. */
            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
            frac >>= frac_shift;

            /* Underflow is only raised when the result is also inexact. */
            if (is_tiny && (flags & float_flag_inexact)) {
                flags |= float_flag_underflow;
            }
            if (exp == 0 && frac == 0) {
                p.cls = float_class_zero;
            }
        }
        break;

    case float_class_zero:
    do_zero:
        exp = 0;
        frac = 0;
        break;

    case float_class_inf:
    do_inf:
        /* arm_althp formats have no infinity encoding. */
        assert(!parm->arm_althp);
        exp = exp_max;
        frac = 0;
        break;

    case float_class_qnan:
    case float_class_snan:
        /* arm_althp formats have no NaN encoding. */
        assert(!parm->arm_althp);
        exp = exp_max;
        /* Move the NaN fraction back down to field position. */
        frac >>= parm->frac_shift;
        break;

    default:
        g_assert_not_reached();
    }

    /* Report everything accumulated above in one go. */
    float_raise(flags, s);
    p.exp = exp;
    p.frac = frac;
    return p;
}
830 
831 /* Explicit FloatFmt version */
832 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
833                                             const FloatFmt *params)
834 {
835     return sf_canonicalize(float16_unpack_raw(f), params, s);
836 }
837 
838 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
839 {
840     return float16a_unpack_canonical(f, s, &float16_params);
841 }
842 
843 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
844                                              const FloatFmt *params)
845 {
846     return float16_pack_raw(round_canonical(p, s, params));
847 }
848 
849 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
850 {
851     return float16a_round_pack_canonical(p, s, &float16_params);
852 }
853 
854 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
855 {
856     return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
857 }
858 
859 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
860 {
861     return float32_pack_raw(round_canonical(p, s, &float32_params));
862 }
863 
864 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
865 {
866     return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
867 }
868 
869 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
870 {
871     return float64_pack_raw(round_canonical(p, s, &float64_params));
872 }
873 
874 static FloatParts return_nan(FloatParts a, float_status *s)
875 {
876     switch (a.cls) {
877     case float_class_snan:
878         s->float_exception_flags |= float_flag_invalid;
879         a = parts_silence_nan(a, s);
880         /* fall through */
881     case float_class_qnan:
882         if (s->default_nan_mode) {
883             return parts_default_nan(s);
884         }
885         break;
886 
887     default:
888         g_assert_not_reached();
889     }
890     return a;
891 }
892 
893 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
894 {
895     if (is_snan(a.cls) || is_snan(b.cls)) {
896         s->float_exception_flags |= float_flag_invalid;
897     }
898 
899     if (s->default_nan_mode) {
900         return parts_default_nan(s);
901     } else {
902         if (pickNaN(a.cls, b.cls,
903                     a.frac > b.frac ||
904                     (a.frac == b.frac && a.sign < b.sign))) {
905             a = b;
906         }
907         if (is_snan(a.cls)) {
908             return parts_silence_nan(a, s);
909         }
910     }
911     return a;
912 }
913 
914 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
915                                   bool inf_zero, float_status *s)
916 {
917     int which;
918 
919     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
920         s->float_exception_flags |= float_flag_invalid;
921     }
922 
923     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
924 
925     if (s->default_nan_mode) {
926         /* Note that this check is after pickNaNMulAdd so that function
927          * has an opportunity to set the Invalid flag.
928          */
929         which = 3;
930     }
931 
932     switch (which) {
933     case 0:
934         break;
935     case 1:
936         a = b;
937         break;
938     case 2:
939         a = c;
940         break;
941     case 3:
942         return parts_default_nan(s);
943     default:
944         g_assert_not_reached();
945     }
946 
947     if (is_snan(a.cls)) {
948         return parts_silence_nan(a, s);
949     }
950     return a;
951 }
952 
953 /*
954  * Returns the result of adding or subtracting the values of the
955  * floating-point values `a' and `b'. The operation is performed
956  * according to the IEC/IEEE Standard for Binary Floating-Point
957  * Arithmetic.
958  */
959 
960 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
961                                 float_status *s)
962 {
963     bool a_sign = a.sign;
964     bool b_sign = b.sign ^ subtract;
965 
966     if (a_sign != b_sign) {
967         /* Subtraction */
968 
969         if (a.cls == float_class_normal && b.cls == float_class_normal) {
970             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
971                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
972                 a.frac = a.frac - b.frac;
973             } else {
974                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
975                 a.frac = b.frac - a.frac;
976                 a.exp = b.exp;
977                 a_sign ^= 1;
978             }
979 
980             if (a.frac == 0) {
981                 a.cls = float_class_zero;
982                 a.sign = s->float_rounding_mode == float_round_down;
983             } else {
984                 int shift = clz64(a.frac) - 1;
985                 a.frac = a.frac << shift;
986                 a.exp = a.exp - shift;
987                 a.sign = a_sign;
988             }
989             return a;
990         }
991         if (is_nan(a.cls) || is_nan(b.cls)) {
992             return pick_nan(a, b, s);
993         }
994         if (a.cls == float_class_inf) {
995             if (b.cls == float_class_inf) {
996                 float_raise(float_flag_invalid, s);
997                 return parts_default_nan(s);
998             }
999             return a;
1000         }
1001         if (a.cls == float_class_zero && b.cls == float_class_zero) {
1002             a.sign = s->float_rounding_mode == float_round_down;
1003             return a;
1004         }
1005         if (a.cls == float_class_zero || b.cls == float_class_inf) {
1006             b.sign = a_sign ^ 1;
1007             return b;
1008         }
1009         if (b.cls == float_class_zero) {
1010             return a;
1011         }
1012     } else {
1013         /* Addition */
1014         if (a.cls == float_class_normal && b.cls == float_class_normal) {
1015             if (a.exp > b.exp) {
1016                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1017             } else if (a.exp < b.exp) {
1018                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1019                 a.exp = b.exp;
1020             }
1021             a.frac += b.frac;
1022             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1023                 shift64RightJamming(a.frac, 1, &a.frac);
1024                 a.exp += 1;
1025             }
1026             return a;
1027         }
1028         if (is_nan(a.cls) || is_nan(b.cls)) {
1029             return pick_nan(a, b, s);
1030         }
1031         if (a.cls == float_class_inf || b.cls == float_class_zero) {
1032             return a;
1033         }
1034         if (b.cls == float_class_inf || a.cls == float_class_zero) {
1035             b.sign = b_sign;
1036             return b;
1037         }
1038     }
1039     g_assert_not_reached();
1040 }
1041 
1042 /*
1043  * Returns the result of adding or subtracting the floating-point
1044  * values `a' and `b'. The operation is performed according to the
1045  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1046  */
1047 
1048 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1049 {
1050     FloatParts pa = float16_unpack_canonical(a, status);
1051     FloatParts pb = float16_unpack_canonical(b, status);
1052     FloatParts pr = addsub_floats(pa, pb, false, status);
1053 
1054     return float16_round_pack_canonical(pr, status);
1055 }
1056 
1057 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1058 {
1059     FloatParts pa = float16_unpack_canonical(a, status);
1060     FloatParts pb = float16_unpack_canonical(b, status);
1061     FloatParts pr = addsub_floats(pa, pb, true, status);
1062 
1063     return float16_round_pack_canonical(pr, status);
1064 }
1065 
1066 static float32 QEMU_SOFTFLOAT_ATTR
1067 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1068 {
1069     FloatParts pa = float32_unpack_canonical(a, status);
1070     FloatParts pb = float32_unpack_canonical(b, status);
1071     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1072 
1073     return float32_round_pack_canonical(pr, status);
1074 }
1075 
1076 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1077 {
1078     return soft_f32_addsub(a, b, false, status);
1079 }
1080 
1081 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1082 {
1083     return soft_f32_addsub(a, b, true, status);
1084 }
1085 
1086 static float64 QEMU_SOFTFLOAT_ATTR
1087 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1088 {
1089     FloatParts pa = float64_unpack_canonical(a, status);
1090     FloatParts pb = float64_unpack_canonical(b, status);
1091     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1092 
1093     return float64_round_pack_canonical(pr, status);
1094 }
1095 
1096 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1097 {
1098     return soft_f64_addsub(a, b, false, status);
1099 }
1100 
1101 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1102 {
1103     return soft_f64_addsub(a, b, true, status);
1104 }
1105 
/* Host-FPU float addition. */
static float hard_f32_add(float a, float b)
{
    float sum = a + b;

    return sum;
}
1110 
/* Host-FPU float subtraction. */
static float hard_f32_sub(float a, float b)
{
    float diff = a - b;

    return diff;
}
1115 
/* Host-FPU double addition. */
static double hard_f64_add(double a, double b)
{
    double sum = a + b;

    return sum;
}
1120 
/* Host-FPU double subtraction. */
static double hard_f64_sub(double a, double b)
{
    double diff = a - b;

    return diff;
}
1125 
1126 static bool f32_addsub_post(union_float32 a, union_float32 b)
1127 {
1128     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1129         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1130     }
1131     return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1132 }
1133 
1134 static bool f64_addsub_post(union_float64 a, union_float64 b)
1135 {
1136     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1137         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1138     } else {
1139         return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1140     }
1141 }
1142 
1143 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1144                               hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1145 {
1146     return float32_gen2(a, b, s, hard, soft,
1147                         f32_is_zon2, f32_addsub_post, NULL, NULL);
1148 }
1149 
1150 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1151                               hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1152 {
1153     return float64_gen2(a, b, s, hard, soft,
1154                         f64_is_zon2, f64_addsub_post, NULL, NULL);
1155 }
1156 
1157 float32 QEMU_FLATTEN
1158 float32_add(float32 a, float32 b, float_status *s)
1159 {
1160     return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1161 }
1162 
1163 float32 QEMU_FLATTEN
1164 float32_sub(float32 a, float32 b, float_status *s)
1165 {
1166     return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1167 }
1168 
1169 float64 QEMU_FLATTEN
1170 float64_add(float64 a, float64 b, float_status *s)
1171 {
1172     return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1173 }
1174 
1175 float64 QEMU_FLATTEN
1176 float64_sub(float64 a, float64 b, float_status *s)
1177 {
1178     return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1179 }
1180 
1181 /*
1182  * Returns the result of multiplying the floating-point values `a' and
1183  * `b'. The operation is performed according to the IEC/IEEE Standard
1184  * for Binary Floating-Point Arithmetic.
1185  */
1186 
1187 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
1188 {
1189     bool sign = a.sign ^ b.sign;
1190 
1191     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1192         uint64_t hi, lo;
1193         int exp = a.exp + b.exp;
1194 
1195         mul64To128(a.frac, b.frac, &hi, &lo);
1196         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1197         if (lo & DECOMPOSED_OVERFLOW_BIT) {
1198             shift64RightJamming(lo, 1, &lo);
1199             exp += 1;
1200         }
1201 
1202         /* Re-use a */
1203         a.exp = exp;
1204         a.sign = sign;
1205         a.frac = lo;
1206         return a;
1207     }
1208     /* handle all the NaN cases */
1209     if (is_nan(a.cls) || is_nan(b.cls)) {
1210         return pick_nan(a, b, s);
1211     }
1212     /* Inf * Zero == NaN */
1213     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1214         (a.cls == float_class_zero && b.cls == float_class_inf)) {
1215         s->float_exception_flags |= float_flag_invalid;
1216         return parts_default_nan(s);
1217     }
1218     /* Multiply by 0 or Inf */
1219     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1220         a.sign = sign;
1221         return a;
1222     }
1223     if (b.cls == float_class_inf || b.cls == float_class_zero) {
1224         b.sign = sign;
1225         return b;
1226     }
1227     g_assert_not_reached();
1228 }
1229 
1230 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1231 {
1232     FloatParts pa = float16_unpack_canonical(a, status);
1233     FloatParts pb = float16_unpack_canonical(b, status);
1234     FloatParts pr = mul_floats(pa, pb, status);
1235 
1236     return float16_round_pack_canonical(pr, status);
1237 }
1238 
1239 static float32 QEMU_SOFTFLOAT_ATTR
1240 soft_f32_mul(float32 a, float32 b, float_status *status)
1241 {
1242     FloatParts pa = float32_unpack_canonical(a, status);
1243     FloatParts pb = float32_unpack_canonical(b, status);
1244     FloatParts pr = mul_floats(pa, pb, status);
1245 
1246     return float32_round_pack_canonical(pr, status);
1247 }
1248 
1249 static float64 QEMU_SOFTFLOAT_ATTR
1250 soft_f64_mul(float64 a, float64 b, float_status *status)
1251 {
1252     FloatParts pa = float64_unpack_canonical(a, status);
1253     FloatParts pb = float64_unpack_canonical(b, status);
1254     FloatParts pr = mul_floats(pa, pb, status);
1255 
1256     return float64_round_pack_canonical(pr, status);
1257 }
1258 
/* Host-FPU float multiplication. */
static float hard_f32_mul(float a, float b)
{
    float product = a * b;

    return product;
}
1263 
/* Host-FPU double multiplication. */
static double hard_f64_mul(double a, double b)
{
    double product = a * b;

    return product;
}
1268 
1269 static bool f32_mul_fast_test(union_float32 a, union_float32 b)
1270 {
1271     return float32_is_zero(a.s) || float32_is_zero(b.s);
1272 }
1273 
1274 static bool f64_mul_fast_test(union_float64 a, union_float64 b)
1275 {
1276     return float64_is_zero(a.s) || float64_is_zero(b.s);
1277 }
1278 
1279 static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s)
1280 {
1281     bool signbit = float32_is_neg(a) ^ float32_is_neg(b);
1282 
1283     return float32_set_sign(float32_zero, signbit);
1284 }
1285 
1286 static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s)
1287 {
1288     bool signbit = float64_is_neg(a) ^ float64_is_neg(b);
1289 
1290     return float64_set_sign(float64_zero, signbit);
1291 }
1292 
1293 float32 QEMU_FLATTEN
1294 float32_mul(float32 a, float32 b, float_status *s)
1295 {
1296     return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1297                         f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op);
1298 }
1299 
1300 float64 QEMU_FLATTEN
1301 float64_mul(float64 a, float64 b, float_status *s)
1302 {
1303     return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1304                         f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op);
1305 }
1306 
1307 /*
1308  * Returns the result of multiplying the floating-point values `a' and
1309  * `b' then adding 'c', with no intermediate rounding step after the
1310  * multiplication. The operation is performed according to the
1311  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1312  * The flags argument allows the caller to select negation of the
1313  * addend, the intermediate product, or the final result. (The
1314  * difference between this and having the caller do a separate
1315  * negation is that negating externally will flip the sign bit on
1316  * NaNs.)
1317  */
/*
 * In addition to the negation flags, float_muladd_halve_result makes
 * this routine decrement the exponent of a finite non-zero result by
 * one.  Invalid cases (Inf * 0, or an infinite product plus an infinity
 * of the opposite sign) raise float_flag_invalid and return the default
 * NaN.  No rounding is performed in this function.
 */
1318
1319 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
1320                                 int flags, float_status *s)
1321 {
    /*
     * inf_zero is true exactly when one of a/b is an infinity and the
     * other a zero: the two class bits OR-ed together must equal the
     * inf|zero mask.
     */
1322     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
1323                     ((1 << float_class_inf) | (1 << float_class_zero));
1324     bool p_sign;
1325     bool sign_flip = flags & float_muladd_negate_result;
1326     FloatClass p_class;
1327     uint64_t hi, lo;
1328     int p_exp;
1329
1330     /* It is implementation-defined whether the cases of (0,inf,qnan)
1331      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1332      * they return if they do), so we have to hand this information
1333      * off to the target-specific pick-a-NaN routine.
1334      */
1335     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
1336         return pick_nan_muladd(a, b, c, inf_zero, s);
1337     }
1338
    /* No NaN operand: Inf * 0 is invalid whatever the addend is. */
1339     if (inf_zero) {
1340         s->float_exception_flags |= float_flag_invalid;
1341         return parts_default_nan(s);
1342     }
1343
1344     if (flags & float_muladd_negate_c) {
1345         c.sign ^= 1;
1346     }
1347
1348     p_sign = a.sign ^ b.sign;
1349
1350     if (flags & float_muladd_negate_product) {
1351         p_sign ^= 1;
1352     }
1353
    /* Classify the product a * b from the operand classes alone. */
1354     if (a.cls == float_class_inf || b.cls == float_class_inf) {
1355         p_class = float_class_inf;
1356     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
1357         p_class = float_class_zero;
1358     } else {
1359         p_class = float_class_normal;
1360     }
1361
    /*
     * An infinite addend decides the result, unless it meets an infinite
     * product of the opposite sign (Inf - Inf), which is invalid.
     */
1362     if (c.cls == float_class_inf) {
1363         if (p_class == float_class_inf && p_sign != c.sign) {
1364             s->float_exception_flags |= float_flag_invalid;
1365             return parts_default_nan(s);
1366         } else {
1367             a.cls = float_class_inf;
1368             a.sign = c.sign ^ sign_flip;
1369             return a;
1370         }
1371     }
1372
    /* Infinite product, finite addend: infinity with the product's sign. */
1373     if (p_class == float_class_inf) {
1374         a.cls = float_class_inf;
1375         a.sign = p_sign ^ sign_flip;
1376         return a;
1377     }
1378
    /*
     * Zero product: the result is the addend.  For 0 + 0 with differing
     * signs the zero is negative only in round-down mode; a non-zero
     * addend only has the halve_result adjustment applied.
     */
1379     if (p_class == float_class_zero) {
1380         if (c.cls == float_class_zero) {
1381             if (p_sign != c.sign) {
1382                 p_sign = s->float_rounding_mode == float_round_down;
1383             }
1384             c.sign = p_sign;
1385         } else if (flags & float_muladd_halve_result) {
1386             c.exp -= 1;
1387         }
1388         c.sign ^= sign_flip;
1389         return c;
1390     }
1391
1392     /* a & b should be normals now... */
1393     assert(a.cls == float_class_normal &&
1394            b.cls == float_class_normal);
1395
1396     p_exp = a.exp + b.exp;
1397
1398     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
1399      * result.
1400      */
1401     mul64To128(a.frac, b.frac, &hi, &lo);
1402     /* binary point now at bit 124 */
1403
1404     /* check for overflow */
1405     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
1406         shift128RightJamming(hi, lo, 1, &hi, &lo);
1407         p_exp += 1;
1408     }
1409
1410     /* + add/sub */
1411     if (c.cls == float_class_zero) {
1412         /* move binary point back to 62 */
1413         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1414     } else {
1415         int exp_diff = p_exp - c.exp;
1416         if (p_sign == c.sign) {
1417             /* Addition */
1418             if (exp_diff <= 0) {
                /*
                 * Addend has the larger (or equal) exponent: one shift
                 * both aligns the product to c and moves the binary
                 * point back to 62, so c.frac can be added directly.
                 */
1419                 shift128RightJamming(hi, lo,
1420                                      DECOMPOSED_BINARY_POINT - exp_diff,
1421                                      &hi, &lo);
1422                 lo += c.frac;
1423                 p_exp = c.exp;
1424             } else {
1425                 uint64_t c_hi, c_lo;
1426                 /* shift c to the same binary point as the product (124) */
1427                 c_hi = c.frac >> 2;
1428                 c_lo = 0;
1429                 shift128RightJamming(c_hi, c_lo,
1430                                      exp_diff,
1431                                      &c_hi, &c_lo);
1432                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
1433                 /* move binary point back to 62 */
1434                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1435             }
1436
            /* A carry out of the sum needs one more normalization step. */
1437             if (lo & DECOMPOSED_OVERFLOW_BIT) {
1438                 shift64RightJamming(lo, 1, &lo);
1439                 p_exp += 1;
1440             }
1441
1442         } else {
1443             /* Subtraction */
1444             uint64_t c_hi, c_lo;
1445             /* make C binary point match product at bit 124 */
1446             c_hi = c.frac >> 2;
1447             c_lo = 0;
1448
1449             if (exp_diff <= 0) {
                /*
                 * Align the product to c.  The product is treated as the
                 * minuend only when the exponents are equal and it
                 * compares >= c; otherwise c - product is computed and
                 * the result takes the opposite sign and c's exponent.
                 */
1450                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1451                 if (exp_diff == 0
1452                     &&
1453                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1454                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1455                 } else {
1456                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1457                     p_sign ^= 1;
1458                     p_exp = c.exp;
1459                 }
1460             } else {
                /* Product has the larger exponent: align c and subtract it. */
1461                 shift128RightJamming(c_hi, c_lo,
1462                                      exp_diff,
1463                                      &c_hi, &c_lo);
1464                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1465             }
1466
1467             if (hi == 0 && lo == 0) {
                /*
                 * Exact cancellation: zero, negative only in round-down
                 * mode (before negate_result is applied).
                 */
1468                 a.cls = float_class_zero;
1469                 a.sign = s->float_rounding_mode == float_round_down;
1470                 a.sign ^= sign_flip;
1471                 return a;
1472             } else {
1473                 int shift;
1474                 if (hi != 0) {
1475                     shift = clz64(hi);
1476                 } else {
1477                     shift = clz64(lo) + 64;
1478                 }
1479                 /* Normalizing to a binary point of 124 is the
1480                    correct adjust for the exponent.  However since we're
1481                    shifting, we might as well put the binary point back
1482                    at 62 where we really want it.  Therefore shift as
1483                    if we're leaving 1 bit at the top of the word, but
1484                    adjust the exponent as if we're leaving 3 bits.  */
1485                 shift -= 1;
1486                 if (shift >= 64) {
1487                     lo = lo << (shift - 64);
1488                 } else {
                    /*
                     * lo receives the normalized high word, with its
                     * lsb set as a sticky bit if any bits remain in the
                     * shifted low word.
                     */
1489                     hi = (hi << shift) | (lo >> (64 - shift));
1490                     lo = hi | ((lo << shift) != 0);
1491                 }
1492                 p_exp -= shift - 2;
1493             }
1494         }
1495     }
1496
1497     if (flags & float_muladd_halve_result) {
1498         p_exp -= 1;
1499     }
1500
1501     /* finally prepare our result */
1502     a.cls = float_class_normal;
1503     a.sign = p_sign ^ sign_flip;
1504     a.exp = p_exp;
1505     a.frac = lo;
1506
1507     return a;
1508 }
1509 
1510 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1511                                                 int flags, float_status *status)
1512 {
1513     FloatParts pa = float16_unpack_canonical(a, status);
1514     FloatParts pb = float16_unpack_canonical(b, status);
1515     FloatParts pc = float16_unpack_canonical(c, status);
1516     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1517 
1518     return float16_round_pack_canonical(pr, status);
1519 }
1520 
1521 static float32 QEMU_SOFTFLOAT_ATTR
1522 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1523                 float_status *status)
1524 {
1525     FloatParts pa = float32_unpack_canonical(a, status);
1526     FloatParts pb = float32_unpack_canonical(b, status);
1527     FloatParts pc = float32_unpack_canonical(c, status);
1528     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1529 
1530     return float32_round_pack_canonical(pr, status);
1531 }
1532 
1533 static float64 QEMU_SOFTFLOAT_ATTR
1534 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1535                 float_status *status)
1536 {
1537     FloatParts pa = float64_unpack_canonical(a, status);
1538     FloatParts pb = float64_unpack_canonical(b, status);
1539     FloatParts pc = float64_unpack_canonical(c, status);
1540     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1541 
1542     return float64_round_pack_canonical(pr, status);
1543 }
1544 
1545 float32 QEMU_FLATTEN
1546 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
1547 {
1548     union_float32 ua, ub, uc, ur;
1549 
1550     ua.s = xa;
1551     ub.s = xb;
1552     uc.s = xc;
1553 
1554     if (unlikely(!can_use_fpu(s))) {
1555         goto soft;
1556     }
1557     if (unlikely(flags & float_muladd_halve_result)) {
1558         goto soft;
1559     }
1560 
1561     float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
1562     if (unlikely(!f32_is_zon3(ua, ub, uc))) {
1563         goto soft;
1564     }
1565     /*
1566      * When (a || b) == 0, there's no need to check for under/over flow,
1567      * since we know the addend is (normal || 0) and the product is 0.
1568      */
1569     if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
1570         union_float32 up;
1571         bool prod_sign;
1572 
1573         prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
1574         prod_sign ^= !!(flags & float_muladd_negate_product);
1575         up.s = float32_set_sign(float32_zero, prod_sign);
1576 
1577         if (flags & float_muladd_negate_c) {
1578             uc.h = -uc.h;
1579         }
1580         ur.h = up.h + uc.h;
1581     } else {
1582         if (flags & float_muladd_negate_product) {
1583             ua.h = -ua.h;
1584         }
1585         if (flags & float_muladd_negate_c) {
1586             uc.h = -uc.h;
1587         }
1588 
1589         ur.h = fmaf(ua.h, ub.h, uc.h);
1590 
1591         if (unlikely(f32_is_inf(ur))) {
1592             s->float_exception_flags |= float_flag_overflow;
1593         } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
1594             goto soft;
1595         }
1596     }
1597     if (flags & float_muladd_negate_result) {
1598         return float32_chs(ur.s);
1599     }
1600     return ur.s;
1601 
1602  soft:
1603     return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
1604 }
1605 
1606 float64 QEMU_FLATTEN
1607 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1608 {
1609     union_float64 ua, ub, uc, ur;
1610 
1611     ua.s = xa;
1612     ub.s = xb;
1613     uc.s = xc;
1614 
1615     if (unlikely(!can_use_fpu(s))) {
1616         goto soft;
1617     }
1618     if (unlikely(flags & float_muladd_halve_result)) {
1619         goto soft;
1620     }
1621 
1622     float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1623     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1624         goto soft;
1625     }
1626     /*
1627      * When (a || b) == 0, there's no need to check for under/over flow,
1628      * since we know the addend is (normal || 0) and the product is 0.
1629      */
1630     if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1631         union_float64 up;
1632         bool prod_sign;
1633 
1634         prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1635         prod_sign ^= !!(flags & float_muladd_negate_product);
1636         up.s = float64_set_sign(float64_zero, prod_sign);
1637 
1638         if (flags & float_muladd_negate_c) {
1639             uc.h = -uc.h;
1640         }
1641         ur.h = up.h + uc.h;
1642     } else {
1643         if (flags & float_muladd_negate_product) {
1644             ua.h = -ua.h;
1645         }
1646         if (flags & float_muladd_negate_c) {
1647             uc.h = -uc.h;
1648         }
1649 
1650         ur.h = fma(ua.h, ub.h, uc.h);
1651 
1652         if (unlikely(f64_is_inf(ur))) {
1653             s->float_exception_flags |= float_flag_overflow;
1654         } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1655             goto soft;
1656         }
1657     }
1658     if (flags & float_muladd_negate_result) {
1659         return float64_chs(ur.s);
1660     }
1661     return ur.s;
1662 
1663  soft:
1664     return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1665 }
1666 
1667 /*
1668  * Returns the result of dividing the floating-point value `a' by the
1669  * corresponding value `b'. The operation is performed according to
1670  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1671  */
1672 
1673 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1674 {
1675     bool sign = a.sign ^ b.sign;
1676 
1677     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1678         uint64_t n0, n1, q, r;
1679         int exp = a.exp - b.exp;
1680 
1681         /*
1682          * We want a 2*N / N-bit division to produce exactly an N-bit
1683          * result, so that we do not lose any precision and so that we
1684          * do not have to renormalize afterward.  If A.frac < B.frac,
1685          * then division would produce an (N-1)-bit result; shift A left
1686          * by one to produce the an N-bit result, and decrement the
1687          * exponent to match.
1688          *
1689          * The udiv_qrnnd algorithm that we're using requires normalization,
1690          * i.e. the msb of the denominator must be set.  Since we know that
1691          * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1692          * by one (more), and the remainder must be shifted right by one.
1693          */
1694         if (a.frac < b.frac) {
1695             exp -= 1;
1696             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
1697         } else {
1698             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1699         }
1700         q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1701 
1702         /*
1703          * Set lsb if there is a remainder, to set inexact.
1704          * As mentioned above, to find the actual value of the remainder we
1705          * would need to shift right, but (1) we are only concerned about
1706          * non-zero-ness, and (2) the remainder will always be even because
1707          * both inputs to the division primitive are even.
1708          */
1709         a.frac = q | (r != 0);
1710         a.sign = sign;
1711         a.exp = exp;
1712         return a;
1713     }
1714     /* handle all the NaN cases */
1715     if (is_nan(a.cls) || is_nan(b.cls)) {
1716         return pick_nan(a, b, s);
1717     }
1718     /* 0/0 or Inf/Inf */
1719     if (a.cls == b.cls
1720         &&
1721         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1722         s->float_exception_flags |= float_flag_invalid;
1723         return parts_default_nan(s);
1724     }
1725     /* Inf / x or 0 / x */
1726     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1727         a.sign = sign;
1728         return a;
1729     }
1730     /* Div 0 => Inf */
1731     if (b.cls == float_class_zero) {
1732         s->float_exception_flags |= float_flag_divbyzero;
1733         a.cls = float_class_inf;
1734         a.sign = sign;
1735         return a;
1736     }
1737     /* Div by Inf */
1738     if (b.cls == float_class_inf) {
1739         a.cls = float_class_zero;
1740         a.sign = sign;
1741         return a;
1742     }
1743     g_assert_not_reached();
1744 }
1745 
1746 float16 float16_div(float16 a, float16 b, float_status *status)
1747 {
1748     FloatParts pa = float16_unpack_canonical(a, status);
1749     FloatParts pb = float16_unpack_canonical(b, status);
1750     FloatParts pr = div_floats(pa, pb, status);
1751 
1752     return float16_round_pack_canonical(pr, status);
1753 }
1754 
1755 static float32 QEMU_SOFTFLOAT_ATTR
1756 soft_f32_div(float32 a, float32 b, float_status *status)
1757 {
1758     FloatParts pa = float32_unpack_canonical(a, status);
1759     FloatParts pb = float32_unpack_canonical(b, status);
1760     FloatParts pr = div_floats(pa, pb, status);
1761 
1762     return float32_round_pack_canonical(pr, status);
1763 }
1764 
1765 static float64 QEMU_SOFTFLOAT_ATTR
1766 soft_f64_div(float64 a, float64 b, float_status *status)
1767 {
1768     FloatParts pa = float64_unpack_canonical(a, status);
1769     FloatParts pb = float64_unpack_canonical(b, status);
1770     FloatParts pr = div_floats(pa, pb, status);
1771 
1772     return float64_round_pack_canonical(pr, status);
1773 }
1774 
/* Native single-precision division: the C `/' operator on two floats. */
static float hard_f32_div(float a, float b)
{
    const float quotient = a / b;

    return quotient;
}
1779 
/* Native double-precision division: the C `/' operator on two doubles. */
static double hard_f64_div(double a, double b)
{
    const double quotient = a / b;

    return quotient;
}
1784 
1785 static bool f32_div_pre(union_float32 a, union_float32 b)
1786 {
1787     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1788         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1789                fpclassify(b.h) == FP_NORMAL;
1790     }
1791     return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1792 }
1793 
1794 static bool f64_div_pre(union_float64 a, union_float64 b)
1795 {
1796     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1797         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1798                fpclassify(b.h) == FP_NORMAL;
1799     }
1800     return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1801 }
1802 
1803 static bool f32_div_post(union_float32 a, union_float32 b)
1804 {
1805     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1806         return fpclassify(a.h) != FP_ZERO;
1807     }
1808     return !float32_is_zero(a.s);
1809 }
1810 
1811 static bool f64_div_post(union_float64 a, union_float64 b)
1812 {
1813     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1814         return fpclassify(a.h) != FP_ZERO;
1815     }
1816     return !float64_is_zero(a.s);
1817 }
1818 
1819 float32 QEMU_FLATTEN
1820 float32_div(float32 a, float32 b, float_status *s)
1821 {
1822     return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1823                         f32_div_pre, f32_div_post, NULL, NULL);
1824 }
1825 
1826 float64 QEMU_FLATTEN
1827 float64_div(float64 a, float64 b, float_status *s)
1828 {
1829     return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1830                         f64_div_pre, f64_div_post, NULL, NULL);
1831 }
1832 
1833 /*
1834  * Float to Float conversions
1835  *
1836  * Returns the result of converting one float format to another. The
1837  * conversion is performed according to the IEC/IEEE Standard for
1838  * Binary Floating-Point Arithmetic.
1839  *
1840  * The float_to_float helper only needs to take care of raising
1841  * invalid exceptions and handling the conversion on NaNs.
1842  */
1843 
1844 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1845                                  float_status *s)
1846 {
1847     if (dstf->arm_althp) {
1848         switch (a.cls) {
1849         case float_class_qnan:
1850         case float_class_snan:
1851             /* There is no NaN in the destination format.  Raise Invalid
1852              * and return a zero with the sign of the input NaN.
1853              */
1854             s->float_exception_flags |= float_flag_invalid;
1855             a.cls = float_class_zero;
1856             a.frac = 0;
1857             a.exp = 0;
1858             break;
1859 
1860         case float_class_inf:
1861             /* There is no Inf in the destination format.  Raise Invalid
1862              * and return the maximum normal with the correct sign.
1863              */
1864             s->float_exception_flags |= float_flag_invalid;
1865             a.cls = float_class_normal;
1866             a.exp = dstf->exp_max;
1867             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1868             break;
1869 
1870         default:
1871             break;
1872         }
1873     } else if (is_nan(a.cls)) {
1874         if (is_snan(a.cls)) {
1875             s->float_exception_flags |= float_flag_invalid;
1876             a = parts_silence_nan(a, s);
1877         }
1878         if (s->default_nan_mode) {
1879             return parts_default_nan(s);
1880         }
1881     }
1882     return a;
1883 }
1884 
1885 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1886 {
1887     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1888     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1889     FloatParts pr = float_to_float(p, &float32_params, s);
1890     return float32_round_pack_canonical(pr, s);
1891 }
1892 
1893 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1894 {
1895     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1896     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1897     FloatParts pr = float_to_float(p, &float64_params, s);
1898     return float64_round_pack_canonical(pr, s);
1899 }
1900 
1901 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1902 {
1903     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1904     FloatParts p = float32_unpack_canonical(a, s);
1905     FloatParts pr = float_to_float(p, fmt16, s);
1906     return float16a_round_pack_canonical(pr, s, fmt16);
1907 }
1908 
1909 float64 float32_to_float64(float32 a, float_status *s)
1910 {
1911     FloatParts p = float32_unpack_canonical(a, s);
1912     FloatParts pr = float_to_float(p, &float64_params, s);
1913     return float64_round_pack_canonical(pr, s);
1914 }
1915 
1916 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1917 {
1918     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1919     FloatParts p = float64_unpack_canonical(a, s);
1920     FloatParts pr = float_to_float(p, fmt16, s);
1921     return float16a_round_pack_canonical(pr, s, fmt16);
1922 }
1923 
1924 float32 float64_to_float32(float64 a, float_status *s)
1925 {
1926     FloatParts p = float64_unpack_canonical(a, s);
1927     FloatParts pr = float_to_float(p, &float32_params, s);
1928     return float32_round_pack_canonical(pr, s);
1929 }
1930 
1931 /*
1932  * Rounds the floating-point value `a' to an integer, and returns the
1933  * result as a floating-point value. The operation is performed
1934  * according to the IEC/IEEE Standard for Binary Floating-Point
1935  * Arithmetic.
1936  */
1937 
1938 static FloatParts round_to_int(FloatParts a, int rmode,
1939                                int scale, float_status *s)
1940 {
1941     switch (a.cls) {
1942     case float_class_qnan:
1943     case float_class_snan:
1944         return return_nan(a, s);
1945 
1946     case float_class_zero:
1947     case float_class_inf:
1948         /* already "integral" */
1949         break;
1950 
1951     case float_class_normal:
1952         scale = MIN(MAX(scale, -0x10000), 0x10000);
1953         a.exp += scale;
1954 
1955         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1956             /* already integral */
1957             break;
1958         }
1959         if (a.exp < 0) {
1960             bool one;
1961             /* all fractional */
1962             s->float_exception_flags |= float_flag_inexact;
1963             switch (rmode) {
1964             case float_round_nearest_even:
1965                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1966                 break;
1967             case float_round_ties_away:
1968                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1969                 break;
1970             case float_round_to_zero:
1971                 one = false;
1972                 break;
1973             case float_round_up:
1974                 one = !a.sign;
1975                 break;
1976             case float_round_down:
1977                 one = a.sign;
1978                 break;
1979             default:
1980                 g_assert_not_reached();
1981             }
1982 
1983             if (one) {
1984                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1985                 a.exp = 0;
1986             } else {
1987                 a.cls = float_class_zero;
1988             }
1989         } else {
1990             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1991             uint64_t frac_lsbm1 = frac_lsb >> 1;
1992             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1993             uint64_t rnd_mask = rnd_even_mask >> 1;
1994             uint64_t inc;
1995 
1996             switch (rmode) {
1997             case float_round_nearest_even:
1998                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1999                 break;
2000             case float_round_ties_away:
2001                 inc = frac_lsbm1;
2002                 break;
2003             case float_round_to_zero:
2004                 inc = 0;
2005                 break;
2006             case float_round_up:
2007                 inc = a.sign ? 0 : rnd_mask;
2008                 break;
2009             case float_round_down:
2010                 inc = a.sign ? rnd_mask : 0;
2011                 break;
2012             default:
2013                 g_assert_not_reached();
2014             }
2015 
2016             if (a.frac & rnd_mask) {
2017                 s->float_exception_flags |= float_flag_inexact;
2018                 a.frac += inc;
2019                 a.frac &= ~rnd_mask;
2020                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
2021                     a.frac >>= 1;
2022                     a.exp++;
2023                 }
2024             }
2025         }
2026         break;
2027     default:
2028         g_assert_not_reached();
2029     }
2030     return a;
2031 }
2032 
2033 float16 float16_round_to_int(float16 a, float_status *s)
2034 {
2035     FloatParts pa = float16_unpack_canonical(a, s);
2036     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2037     return float16_round_pack_canonical(pr, s);
2038 }
2039 
2040 float32 float32_round_to_int(float32 a, float_status *s)
2041 {
2042     FloatParts pa = float32_unpack_canonical(a, s);
2043     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2044     return float32_round_pack_canonical(pr, s);
2045 }
2046 
2047 float64 float64_round_to_int(float64 a, float_status *s)
2048 {
2049     FloatParts pa = float64_unpack_canonical(a, s);
2050     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2051     return float64_round_pack_canonical(pr, s);
2052 }
2053 
2054 /*
2055  * Returns the result of converting the floating-point value `a' to
2056  * the two's complement integer format. The conversion is performed
2057  * according to the IEC/IEEE Standard for Binary Floating-Point
2058  * Arithmetic---which means in particular that the conversion is
2059  * rounded according to the current rounding mode. If `a' is a NaN,
2060  * the largest positive integer is returned. Otherwise, if the
2061  * conversion overflows, the largest integer with the same sign as `a'
2062  * is returned.
2063 */
2064 
2065 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
2066                                      int64_t min, int64_t max,
2067                                      float_status *s)
2068 {
2069     uint64_t r;
2070     int orig_flags = get_float_exception_flags(s);
2071     FloatParts p = round_to_int(in, rmode, scale, s);
2072 
2073     switch (p.cls) {
2074     case float_class_snan:
2075     case float_class_qnan:
2076         s->float_exception_flags = orig_flags | float_flag_invalid;
2077         return max;
2078     case float_class_inf:
2079         s->float_exception_flags = orig_flags | float_flag_invalid;
2080         return p.sign ? min : max;
2081     case float_class_zero:
2082         return 0;
2083     case float_class_normal:
2084         if (p.exp < DECOMPOSED_BINARY_POINT) {
2085             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2086         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2087             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2088         } else {
2089             r = UINT64_MAX;
2090         }
2091         if (p.sign) {
2092             if (r <= -(uint64_t) min) {
2093                 return -r;
2094             } else {
2095                 s->float_exception_flags = orig_flags | float_flag_invalid;
2096                 return min;
2097             }
2098         } else {
2099             if (r <= max) {
2100                 return r;
2101             } else {
2102                 s->float_exception_flags = orig_flags | float_flag_invalid;
2103                 return max;
2104             }
2105         }
2106     default:
2107         g_assert_not_reached();
2108     }
2109 }
2110 
2111 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
2112                                 float_status *s)
2113 {
2114     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2115                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2116 }
2117 
2118 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
2119                                 float_status *s)
2120 {
2121     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2122                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2123 }
2124 
2125 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
2126                                 float_status *s)
2127 {
2128     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2129                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2130 }
2131 
2132 int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale,
2133                                 float_status *s)
2134 {
2135     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2136                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2137 }
2138 
2139 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale,
2140                                 float_status *s)
2141 {
2142     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2143                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2144 }
2145 
2146 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale,
2147                                 float_status *s)
2148 {
2149     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2150                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2151 }
2152 
2153 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale,
2154                                 float_status *s)
2155 {
2156     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2157                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2158 }
2159 
2160 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale,
2161                                 float_status *s)
2162 {
2163     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2164                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2165 }
2166 
2167 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale,
2168                                 float_status *s)
2169 {
2170     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2171                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2172 }
2173 
2174 int16_t float16_to_int16(float16 a, float_status *s)
2175 {
2176     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2177 }
2178 
2179 int32_t float16_to_int32(float16 a, float_status *s)
2180 {
2181     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2182 }
2183 
2184 int64_t float16_to_int64(float16 a, float_status *s)
2185 {
2186     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2187 }
2188 
2189 int16_t float32_to_int16(float32 a, float_status *s)
2190 {
2191     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2192 }
2193 
2194 int32_t float32_to_int32(float32 a, float_status *s)
2195 {
2196     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2197 }
2198 
2199 int64_t float32_to_int64(float32 a, float_status *s)
2200 {
2201     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2202 }
2203 
2204 int16_t float64_to_int16(float64 a, float_status *s)
2205 {
2206     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2207 }
2208 
2209 int32_t float64_to_int32(float64 a, float_status *s)
2210 {
2211     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2212 }
2213 
2214 int64_t float64_to_int64(float64 a, float_status *s)
2215 {
2216     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2217 }
2218 
2219 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2220 {
2221     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2222 }
2223 
2224 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2225 {
2226     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2227 }
2228 
2229 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2230 {
2231     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2232 }
2233 
2234 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2235 {
2236     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2237 }
2238 
2239 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2240 {
2241     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2242 }
2243 
2244 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2245 {
2246     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2247 }
2248 
2249 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2250 {
2251     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2252 }
2253 
2254 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2255 {
2256     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2257 }
2258 
2259 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2260 {
2261     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2262 }
2263 
2264 /*
2265  *  Returns the result of converting the floating-point value `a' to
2266  *  the unsigned integer format. The conversion is performed according
2267  *  to the IEC/IEEE Standard for Binary Floating-Point
2268  *  Arithmetic---which means in particular that the conversion is
2269  *  rounded according to the current rounding mode. If `a' is a NaN,
2270  *  the largest unsigned integer is returned. Otherwise, if the
2271  *  conversion overflows, the largest unsigned integer is returned. If
2272  *  `a' is negative, the result is rounded and zero is returned;
2273  *  values that do not round to zero will raise the invalid exception
2274  *  flag.
2275  */
2276 
2277 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale,
2278                                        uint64_t max, float_status *s)
2279 {
2280     int orig_flags = get_float_exception_flags(s);
2281     FloatParts p = round_to_int(in, rmode, scale, s);
2282     uint64_t r;
2283 
2284     switch (p.cls) {
2285     case float_class_snan:
2286     case float_class_qnan:
2287         s->float_exception_flags = orig_flags | float_flag_invalid;
2288         return max;
2289     case float_class_inf:
2290         s->float_exception_flags = orig_flags | float_flag_invalid;
2291         return p.sign ? 0 : max;
2292     case float_class_zero:
2293         return 0;
2294     case float_class_normal:
2295         if (p.sign) {
2296             s->float_exception_flags = orig_flags | float_flag_invalid;
2297             return 0;
2298         }
2299 
2300         if (p.exp < DECOMPOSED_BINARY_POINT) {
2301             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2302         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2303             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2304         } else {
2305             s->float_exception_flags = orig_flags | float_flag_invalid;
2306             return max;
2307         }
2308 
2309         /* For uint64 this will never trip, but if p.exp is too large
2310          * to shift a decomposed fraction we shall have exited via the
2311          * 3rd leg above.
2312          */
2313         if (r > max) {
2314             s->float_exception_flags = orig_flags | float_flag_invalid;
2315             return max;
2316         }
2317         return r;
2318     default:
2319         g_assert_not_reached();
2320     }
2321 }
2322 
2323 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale,
2324                                   float_status *s)
2325 {
2326     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2327                                   rmode, scale, UINT16_MAX, s);
2328 }
2329 
2330 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale,
2331                                   float_status *s)
2332 {
2333     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2334                                   rmode, scale, UINT32_MAX, s);
2335 }
2336 
2337 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale,
2338                                   float_status *s)
2339 {
2340     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2341                                   rmode, scale, UINT64_MAX, s);
2342 }
2343 
2344 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale,
2345                                   float_status *s)
2346 {
2347     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2348                                   rmode, scale, UINT16_MAX, s);
2349 }
2350 
2351 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale,
2352                                   float_status *s)
2353 {
2354     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2355                                   rmode, scale, UINT32_MAX, s);
2356 }
2357 
2358 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale,
2359                                   float_status *s)
2360 {
2361     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2362                                   rmode, scale, UINT64_MAX, s);
2363 }
2364 
2365 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale,
2366                                   float_status *s)
2367 {
2368     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2369                                   rmode, scale, UINT16_MAX, s);
2370 }
2371 
2372 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale,
2373                                   float_status *s)
2374 {
2375     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2376                                   rmode, scale, UINT32_MAX, s);
2377 }
2378 
2379 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale,
2380                                   float_status *s)
2381 {
2382     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2383                                   rmode, scale, UINT64_MAX, s);
2384 }
2385 
2386 uint16_t float16_to_uint16(float16 a, float_status *s)
2387 {
2388     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2389 }
2390 
2391 uint32_t float16_to_uint32(float16 a, float_status *s)
2392 {
2393     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2394 }
2395 
2396 uint64_t float16_to_uint64(float16 a, float_status *s)
2397 {
2398     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2399 }
2400 
2401 uint16_t float32_to_uint16(float32 a, float_status *s)
2402 {
2403     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2404 }
2405 
2406 uint32_t float32_to_uint32(float32 a, float_status *s)
2407 {
2408     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2409 }
2410 
2411 uint64_t float32_to_uint64(float32 a, float_status *s)
2412 {
2413     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2414 }
2415 
2416 uint16_t float64_to_uint16(float64 a, float_status *s)
2417 {
2418     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2419 }
2420 
2421 uint32_t float64_to_uint32(float64 a, float_status *s)
2422 {
2423     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2424 }
2425 
2426 uint64_t float64_to_uint64(float64 a, float_status *s)
2427 {
2428     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2429 }
2430 
2431 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2432 {
2433     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2434 }
2435 
2436 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2437 {
2438     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2439 }
2440 
2441 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2442 {
2443     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2444 }
2445 
2446 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2447 {
2448     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2449 }
2450 
2451 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2452 {
2453     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2454 }
2455 
2456 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2457 {
2458     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2459 }
2460 
2461 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2462 {
2463     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2464 }
2465 
2466 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2467 {
2468     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2469 }
2470 
2471 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2472 {
2473     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2474 }
2475 
2476 /*
2477  * Integer to float conversions
2478  *
2479  * Returns the result of converting the two's complement integer `a'
2480  * to the floating-point format. The conversion is performed according
2481  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2482  */
2483 
2484 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
2485 {
2486     FloatParts r = { .sign = false };
2487 
2488     if (a == 0) {
2489         r.cls = float_class_zero;
2490     } else {
2491         uint64_t f = a;
2492         int shift;
2493 
2494         r.cls = float_class_normal;
2495         if (a < 0) {
2496             f = -f;
2497             r.sign = true;
2498         }
2499         shift = clz64(f) - 1;
2500         scale = MIN(MAX(scale, -0x10000), 0x10000);
2501 
2502         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2503         r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
2504     }
2505 
2506     return r;
2507 }
2508 
2509 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2510 {
2511     FloatParts pa = int_to_float(a, scale, status);
2512     return float16_round_pack_canonical(pa, status);
2513 }
2514 
2515 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2516 {
2517     return int64_to_float16_scalbn(a, scale, status);
2518 }
2519 
2520 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2521 {
2522     return int64_to_float16_scalbn(a, scale, status);
2523 }
2524 
2525 float16 int64_to_float16(int64_t a, float_status *status)
2526 {
2527     return int64_to_float16_scalbn(a, 0, status);
2528 }
2529 
2530 float16 int32_to_float16(int32_t a, float_status *status)
2531 {
2532     return int64_to_float16_scalbn(a, 0, status);
2533 }
2534 
2535 float16 int16_to_float16(int16_t a, float_status *status)
2536 {
2537     return int64_to_float16_scalbn(a, 0, status);
2538 }
2539 
2540 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2541 {
2542     FloatParts pa = int_to_float(a, scale, status);
2543     return float32_round_pack_canonical(pa, status);
2544 }
2545 
2546 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2547 {
2548     return int64_to_float32_scalbn(a, scale, status);
2549 }
2550 
2551 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2552 {
2553     return int64_to_float32_scalbn(a, scale, status);
2554 }
2555 
2556 float32 int64_to_float32(int64_t a, float_status *status)
2557 {
2558     return int64_to_float32_scalbn(a, 0, status);
2559 }
2560 
2561 float32 int32_to_float32(int32_t a, float_status *status)
2562 {
2563     return int64_to_float32_scalbn(a, 0, status);
2564 }
2565 
2566 float32 int16_to_float32(int16_t a, float_status *status)
2567 {
2568     return int64_to_float32_scalbn(a, 0, status);
2569 }
2570 
2571 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2572 {
2573     FloatParts pa = int_to_float(a, scale, status);
2574     return float64_round_pack_canonical(pa, status);
2575 }
2576 
2577 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2578 {
2579     return int64_to_float64_scalbn(a, scale, status);
2580 }
2581 
2582 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2583 {
2584     return int64_to_float64_scalbn(a, scale, status);
2585 }
2586 
2587 float64 int64_to_float64(int64_t a, float_status *status)
2588 {
2589     return int64_to_float64_scalbn(a, 0, status);
2590 }
2591 
2592 float64 int32_to_float64(int32_t a, float_status *status)
2593 {
2594     return int64_to_float64_scalbn(a, 0, status);
2595 }
2596 
2597 float64 int16_to_float64(int16_t a, float_status *status)
2598 {
2599     return int64_to_float64_scalbn(a, 0, status);
2600 }
2601 
2602 
2603 /*
2604  * Unsigned Integer to float conversions
2605  *
2606  * Returns the result of converting the unsigned integer `a' to the
2607  * floating-point format. The conversion is performed according to the
2608  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2609  */
2610 
2611 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
2612 {
2613     FloatParts r = { .sign = false };
2614 
2615     if (a == 0) {
2616         r.cls = float_class_zero;
2617     } else {
2618         scale = MIN(MAX(scale, -0x10000), 0x10000);
2619         r.cls = float_class_normal;
2620         if ((int64_t)a < 0) {
2621             r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
2622             shift64RightJamming(a, 1, &a);
2623             r.frac = a;
2624         } else {
2625             int shift = clz64(a) - 1;
2626             r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2627             r.frac = a << shift;
2628         }
2629     }
2630 
2631     return r;
2632 }
2633 
2634 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2635 {
2636     FloatParts pa = uint_to_float(a, scale, status);
2637     return float16_round_pack_canonical(pa, status);
2638 }
2639 
2640 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2641 {
2642     return uint64_to_float16_scalbn(a, scale, status);
2643 }
2644 
2645 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2646 {
2647     return uint64_to_float16_scalbn(a, scale, status);
2648 }
2649 
2650 float16 uint64_to_float16(uint64_t a, float_status *status)
2651 {
2652     return uint64_to_float16_scalbn(a, 0, status);
2653 }
2654 
2655 float16 uint32_to_float16(uint32_t a, float_status *status)
2656 {
2657     return uint64_to_float16_scalbn(a, 0, status);
2658 }
2659 
2660 float16 uint16_to_float16(uint16_t a, float_status *status)
2661 {
2662     return uint64_to_float16_scalbn(a, 0, status);
2663 }
2664 
2665 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2666 {
2667     FloatParts pa = uint_to_float(a, scale, status);
2668     return float32_round_pack_canonical(pa, status);
2669 }
2670 
2671 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2672 {
2673     return uint64_to_float32_scalbn(a, scale, status);
2674 }
2675 
2676 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2677 {
2678     return uint64_to_float32_scalbn(a, scale, status);
2679 }
2680 
2681 float32 uint64_to_float32(uint64_t a, float_status *status)
2682 {
2683     return uint64_to_float32_scalbn(a, 0, status);
2684 }
2685 
2686 float32 uint32_to_float32(uint32_t a, float_status *status)
2687 {
2688     return uint64_to_float32_scalbn(a, 0, status);
2689 }
2690 
2691 float32 uint16_to_float32(uint16_t a, float_status *status)
2692 {
2693     return uint64_to_float32_scalbn(a, 0, status);
2694 }
2695 
2696 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2697 {
2698     FloatParts pa = uint_to_float(a, scale, status);
2699     return float64_round_pack_canonical(pa, status);
2700 }
2701 
2702 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2703 {
2704     return uint64_to_float64_scalbn(a, scale, status);
2705 }
2706 
2707 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2708 {
2709     return uint64_to_float64_scalbn(a, scale, status);
2710 }
2711 
2712 float64 uint64_to_float64(uint64_t a, float_status *status)
2713 {
2714     return uint64_to_float64_scalbn(a, 0, status);
2715 }
2716 
2717 float64 uint32_to_float64(uint32_t a, float_status *status)
2718 {
2719     return uint64_to_float64_scalbn(a, 0, status);
2720 }
2721 
2722 float64 uint16_to_float64(uint16_t a, float_status *status)
2723 {
2724     return uint64_to_float64_scalbn(a, 0, status);
2725 }
2726 
2727 /* Float Min/Max */
2728 /* min() and max() functions. These can't be implemented as
2729  * 'compare and pick one input' because that would mishandle
2730  * NaNs and +0 vs -0.
2731  *
2732  * minnum() and maxnum() functions. These are similar to the min()
2733  * and max() functions but if one of the arguments is a QNaN and
2734  * the other is numerical then the numerical argument is returned.
2735  * SNaNs will get quietened before being returned.
2736  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2737  * and maxNum() operations. min() and max() are the typical min/max
2738  * semantics provided by many CPUs which predate that specification.
2739  *
2740  * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from IEEE 754-2008.
2742  */
2743 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2744                                 bool ieee, bool ismag, float_status *s)
2745 {
2746     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2747         if (ieee) {
2748             /* Takes two floating-point values `a' and `b', one of
2749              * which is a NaN, and returns the appropriate NaN
2750              * result. If either `a' or `b' is a signaling NaN,
2751              * the invalid exception is raised.
2752              */
2753             if (is_snan(a.cls) || is_snan(b.cls)) {
2754                 return pick_nan(a, b, s);
2755             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2756                 return b;
2757             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2758                 return a;
2759             }
2760         }
2761         return pick_nan(a, b, s);
2762     } else {
2763         int a_exp, b_exp;
2764 
2765         switch (a.cls) {
2766         case float_class_normal:
2767             a_exp = a.exp;
2768             break;
2769         case float_class_inf:
2770             a_exp = INT_MAX;
2771             break;
2772         case float_class_zero:
2773             a_exp = INT_MIN;
2774             break;
2775         default:
2776             g_assert_not_reached();
2777             break;
2778         }
2779         switch (b.cls) {
2780         case float_class_normal:
2781             b_exp = b.exp;
2782             break;
2783         case float_class_inf:
2784             b_exp = INT_MAX;
2785             break;
2786         case float_class_zero:
2787             b_exp = INT_MIN;
2788             break;
2789         default:
2790             g_assert_not_reached();
2791             break;
2792         }
2793 
2794         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2795             bool a_less = a_exp < b_exp;
2796             if (a_exp == b_exp) {
2797                 a_less = a.frac < b.frac;
2798             }
2799             return a_less ^ ismin ? b : a;
2800         }
2801 
2802         if (a.sign == b.sign) {
2803             bool a_less = a_exp < b_exp;
2804             if (a_exp == b_exp) {
2805                 a_less = a.frac < b.frac;
2806             }
2807             return a.sign ^ a_less ^ ismin ? b : a;
2808         } else {
2809             return a.sign ^ ismin ? b : a;
2810         }
2811     }
2812 }
2813 
2814 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
2815 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
2816                                      float_status *s)                   \
2817 {                                                                       \
2818     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2819     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2820     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
2821                                                                         \
2822     return float ## sz ## _round_pack_canonical(pr, s);                 \
2823 }
2824 
2825 MINMAX(16, min, true, false, false)
2826 MINMAX(16, minnum, true, true, false)
2827 MINMAX(16, minnummag, true, true, true)
2828 MINMAX(16, max, false, false, false)
2829 MINMAX(16, maxnum, false, true, false)
2830 MINMAX(16, maxnummag, false, true, true)
2831 
2832 MINMAX(32, min, true, false, false)
2833 MINMAX(32, minnum, true, true, false)
2834 MINMAX(32, minnummag, true, true, true)
2835 MINMAX(32, max, false, false, false)
2836 MINMAX(32, maxnum, false, true, false)
2837 MINMAX(32, maxnummag, false, true, true)
2838 
2839 MINMAX(64, min, true, false, false)
2840 MINMAX(64, minnum, true, true, false)
2841 MINMAX(64, minnummag, true, true, true)
2842 MINMAX(64, max, false, false, false)
2843 MINMAX(64, maxnum, false, true, false)
2844 MINMAX(64, maxnummag, false, true, true)
2845 
2846 #undef MINMAX
2847 
2848 /* Floating point compare */
2849 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2850                           float_status *s)
2851 {
2852     if (is_nan(a.cls) || is_nan(b.cls)) {
2853         if (!is_quiet ||
2854             a.cls == float_class_snan ||
2855             b.cls == float_class_snan) {
2856             s->float_exception_flags |= float_flag_invalid;
2857         }
2858         return float_relation_unordered;
2859     }
2860 
2861     if (a.cls == float_class_zero) {
2862         if (b.cls == float_class_zero) {
2863             return float_relation_equal;
2864         }
2865         return b.sign ? float_relation_greater : float_relation_less;
2866     } else if (b.cls == float_class_zero) {
2867         return a.sign ? float_relation_less : float_relation_greater;
2868     }
2869 
2870     /* The only really important thing about infinity is its sign. If
2871      * both are infinities the sign marks the smallest of the two.
2872      */
2873     if (a.cls == float_class_inf) {
2874         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2875             return float_relation_equal;
2876         }
2877         return a.sign ? float_relation_less : float_relation_greater;
2878     } else if (b.cls == float_class_inf) {
2879         return b.sign ? float_relation_greater : float_relation_less;
2880     }
2881 
2882     if (a.sign != b.sign) {
2883         return a.sign ? float_relation_less : float_relation_greater;
2884     }
2885 
2886     if (a.exp == b.exp) {
2887         if (a.frac == b.frac) {
2888             return float_relation_equal;
2889         }
2890         if (a.sign) {
2891             return a.frac > b.frac ?
2892                 float_relation_less : float_relation_greater;
2893         } else {
2894             return a.frac > b.frac ?
2895                 float_relation_greater : float_relation_less;
2896         }
2897     } else {
2898         if (a.sign) {
2899             return a.exp > b.exp ? float_relation_less : float_relation_greater;
2900         } else {
2901             return a.exp > b.exp ? float_relation_greater : float_relation_less;
2902         }
2903     }
2904 }
2905 
2906 #define COMPARE(sz)                                                     \
2907 int float ## sz ## _compare(float ## sz a, float ## sz b,               \
2908                             float_status *s)                            \
2909 {                                                                       \
2910     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2911     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2912     return compare_floats(pa, pb, false, s);                            \
2913 }                                                                       \
2914 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b,         \
2915                                   float_status *s)                      \
2916 {                                                                       \
2917     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2918     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2919     return compare_floats(pa, pb, true, s);                             \
2920 }
2921 
2922 COMPARE(16)
2923 COMPARE(32)
2924 COMPARE(64)
2925 
2926 #undef COMPARE
2927 
2928 /* Multiply A by 2 raised to the power N.  */
2929 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
2930 {
2931     if (unlikely(is_nan(a.cls))) {
2932         return return_nan(a, s);
2933     }
2934     if (a.cls == float_class_normal) {
2935         /* The largest float type (even though not supported by FloatParts)
2936          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
2937          * still allows rounding to infinity, without allowing overflow
2938          * within the int32_t that backs FloatParts.exp.
2939          */
2940         n = MIN(MAX(n, -0x10000), 0x10000);
2941         a.exp += n;
2942     }
2943     return a;
2944 }
2945 
2946 float16 float16_scalbn(float16 a, int n, float_status *status)
2947 {
2948     FloatParts pa = float16_unpack_canonical(a, status);
2949     FloatParts pr = scalbn_decomposed(pa, n, status);
2950     return float16_round_pack_canonical(pr, status);
2951 }
2952 
2953 float32 float32_scalbn(float32 a, int n, float_status *status)
2954 {
2955     FloatParts pa = float32_unpack_canonical(a, status);
2956     FloatParts pr = scalbn_decomposed(pa, n, status);
2957     return float32_round_pack_canonical(pr, status);
2958 }
2959 
2960 float64 float64_scalbn(float64 a, int n, float_status *status)
2961 {
2962     FloatParts pa = float64_unpack_canonical(a, status);
2963     FloatParts pr = scalbn_decomposed(pa, n, status);
2964     return float64_round_pack_canonical(pr, status);
2965 }
2966 
2967 /*
2968  * Square Root
2969  *
2970  * The old softfloat code did an approximation step before zeroing in
 * on the final result. However for simplicity we just compute the
2972  * square root by iterating down from the implicit bit to enough extra
2973  * bits to ensure we get a correctly rounded result.
2974  *
2975  * This does mean however the calculation is slower than before,
2976  * especially for 64 bit floats.
2977  */
2978 
2979 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
2980 {
2981     uint64_t a_frac, r_frac, s_frac;
2982     int bit, last_bit;
2983 
2984     if (is_nan(a.cls)) {
2985         return return_nan(a, s);
2986     }
2987     if (a.cls == float_class_zero) {
2988         return a;  /* sqrt(+-0) = +-0 */
2989     }
2990     if (a.sign) {
2991         s->float_exception_flags |= float_flag_invalid;
2992         return parts_default_nan(s);
2993     }
2994     if (a.cls == float_class_inf) {
2995         return a;  /* sqrt(+inf) = +inf */
2996     }
2997 
2998     assert(a.cls == float_class_normal);
2999 
3000     /* We need two overflow bits at the top. Adding room for that is a
3001      * right shift. If the exponent is odd, we can discard the low bit
3002      * by multiplying the fraction by 2; that's a left shift. Combine
3003      * those and we shift right if the exponent is even.
3004      */
3005     a_frac = a.frac;
3006     if (!(a.exp & 1)) {
3007         a_frac >>= 1;
3008     }
3009     a.exp >>= 1;
3010 
3011     /* Bit-by-bit computation of sqrt.  */
3012     r_frac = 0;
3013     s_frac = 0;
3014 
3015     /* Iterate from implicit bit down to the 3 extra bits to compute a
3016      * properly rounded result. Remember we've inserted one more bit
3017      * at the top, so these positions are one less.
3018      */
3019     bit = DECOMPOSED_BINARY_POINT - 1;
3020     last_bit = MAX(p->frac_shift - 4, 0);
3021     do {
3022         uint64_t q = 1ULL << bit;
3023         uint64_t t_frac = s_frac + q;
3024         if (t_frac <= a_frac) {
3025             s_frac = t_frac + q;
3026             a_frac -= t_frac;
3027             r_frac += q;
3028         }
3029         a_frac <<= 1;
3030     } while (--bit >= last_bit);
3031 
3032     /* Undo the right shift done above. If there is any remaining
3033      * fraction, the result is inexact. Set the sticky bit.
3034      */
3035     a.frac = (r_frac << 1) + (a_frac != 0);
3036 
3037     return a;
3038 }
3039 
3040 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3041 {
3042     FloatParts pa = float16_unpack_canonical(a, status);
3043     FloatParts pr = sqrt_float(pa, status, &float16_params);
3044     return float16_round_pack_canonical(pr, status);
3045 }
3046 
3047 static float32 QEMU_SOFTFLOAT_ATTR
3048 soft_f32_sqrt(float32 a, float_status *status)
3049 {
3050     FloatParts pa = float32_unpack_canonical(a, status);
3051     FloatParts pr = sqrt_float(pa, status, &float32_params);
3052     return float32_round_pack_canonical(pr, status);
3053 }
3054 
3055 static float64 QEMU_SOFTFLOAT_ATTR
3056 soft_f64_sqrt(float64 a, float_status *status)
3057 {
3058     FloatParts pa = float64_unpack_canonical(a, status);
3059     FloatParts pr = sqrt_float(pa, status, &float64_params);
3060     return float64_round_pack_canonical(pr, status);
3061 }
3062 
3063 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3064 {
3065     union_float32 ua, ur;
3066 
3067     ua.s = xa;
3068     if (unlikely(!can_use_fpu(s))) {
3069         goto soft;
3070     }
3071 
3072     float32_input_flush1(&ua.s, s);
3073     if (QEMU_HARDFLOAT_1F32_USE_FP) {
3074         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3075                        fpclassify(ua.h) == FP_ZERO) ||
3076                      signbit(ua.h))) {
3077             goto soft;
3078         }
3079     } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3080                         float32_is_neg(ua.s))) {
3081         goto soft;
3082     }
3083     ur.h = sqrtf(ua.h);
3084     return ur.s;
3085 
3086  soft:
3087     return soft_f32_sqrt(ua.s, s);
3088 }
3089 
3090 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3091 {
3092     union_float64 ua, ur;
3093 
3094     ua.s = xa;
3095     if (unlikely(!can_use_fpu(s))) {
3096         goto soft;
3097     }
3098 
3099     float64_input_flush1(&ua.s, s);
3100     if (QEMU_HARDFLOAT_1F64_USE_FP) {
3101         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3102                        fpclassify(ua.h) == FP_ZERO) ||
3103                      signbit(ua.h))) {
3104             goto soft;
3105         }
3106     } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3107                         float64_is_neg(ua.s))) {
3108         goto soft;
3109     }
3110     ur.h = sqrt(ua.h);
3111     return ur.s;
3112 
3113  soft:
3114     return soft_f64_sqrt(ua.s, s);
3115 }
3116 
3117 /*----------------------------------------------------------------------------
3118 | The pattern for a default generated NaN.
3119 *----------------------------------------------------------------------------*/
3120 
3121 float16 float16_default_nan(float_status *status)
3122 {
3123     FloatParts p = parts_default_nan(status);
3124     p.frac >>= float16_params.frac_shift;
3125     return float16_pack_raw(p);
3126 }
3127 
3128 float32 float32_default_nan(float_status *status)
3129 {
3130     FloatParts p = parts_default_nan(status);
3131     p.frac >>= float32_params.frac_shift;
3132     return float32_pack_raw(p);
3133 }
3134 
3135 float64 float64_default_nan(float_status *status)
3136 {
3137     FloatParts p = parts_default_nan(status);
3138     p.frac >>= float64_params.frac_shift;
3139     return float64_pack_raw(p);
3140 }
3141 
3142 float128 float128_default_nan(float_status *status)
3143 {
3144     FloatParts p = parts_default_nan(status);
3145     float128 r;
3146 
3147     /* Extrapolate from the choices made by parts_default_nan to fill
3148      * in the quad-floating format.  If the low bit is set, assume we
3149      * want to set all non-snan bits.
3150      */
3151     r.low = -(p.frac & 1);
3152     r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
3153     r.high |= LIT64(0x7FFF000000000000);
3154     r.high |= (uint64_t)p.sign << 63;
3155 
3156     return r;
3157 }
3158 
3159 /*----------------------------------------------------------------------------
3160 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3161 *----------------------------------------------------------------------------*/
3162 
3163 float16 float16_silence_nan(float16 a, float_status *status)
3164 {
3165     FloatParts p = float16_unpack_raw(a);
3166     p.frac <<= float16_params.frac_shift;
3167     p = parts_silence_nan(p, status);
3168     p.frac >>= float16_params.frac_shift;
3169     return float16_pack_raw(p);
3170 }
3171 
3172 float32 float32_silence_nan(float32 a, float_status *status)
3173 {
3174     FloatParts p = float32_unpack_raw(a);
3175     p.frac <<= float32_params.frac_shift;
3176     p = parts_silence_nan(p, status);
3177     p.frac >>= float32_params.frac_shift;
3178     return float32_pack_raw(p);
3179 }
3180 
3181 float64 float64_silence_nan(float64 a, float_status *status)
3182 {
3183     FloatParts p = float64_unpack_raw(a);
3184     p.frac <<= float64_params.frac_shift;
3185     p = parts_silence_nan(p, status);
3186     p.frac >>= float64_params.frac_shift;
3187     return float64_pack_raw(p);
3188 }
3189 
3190 /*----------------------------------------------------------------------------
3191 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3192 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3193 | input.  If `zSign' is 1, the input is negated before being converted to an
3194 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
3195 | is simply rounded to an integer, with the inexact exception raised if the
3196 | input cannot be represented exactly as an integer.  However, if the fixed-
3197 | point input is too large, the invalid exception is raised and the largest
3198 | positive or negative integer is returned.
3199 *----------------------------------------------------------------------------*/
3200 
3201 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
3202 {
3203     int8_t roundingMode;
3204     flag roundNearestEven;
3205     int8_t roundIncrement, roundBits;
3206     int32_t z;
3207 
3208     roundingMode = status->float_rounding_mode;
3209     roundNearestEven = ( roundingMode == float_round_nearest_even );
3210     switch (roundingMode) {
3211     case float_round_nearest_even:
3212     case float_round_ties_away:
3213         roundIncrement = 0x40;
3214         break;
3215     case float_round_to_zero:
3216         roundIncrement = 0;
3217         break;
3218     case float_round_up:
3219         roundIncrement = zSign ? 0 : 0x7f;
3220         break;
3221     case float_round_down:
3222         roundIncrement = zSign ? 0x7f : 0;
3223         break;
3224     default:
3225         abort();
3226     }
3227     roundBits = absZ & 0x7F;
3228     absZ = ( absZ + roundIncrement )>>7;
3229     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3230     z = absZ;
3231     if ( zSign ) z = - z;
3232     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
3233         float_raise(float_flag_invalid, status);
3234         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3235     }
3236     if (roundBits) {
3237         status->float_exception_flags |= float_flag_inexact;
3238     }
3239     return z;
3240 
3241 }
3242 
3243 /*----------------------------------------------------------------------------
3244 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3245 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3246 | and returns the properly rounded 64-bit integer corresponding to the input.
3247 | If `zSign' is 1, the input is negated before being converted to an integer.
3248 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3249 | the inexact exception raised if the input cannot be represented exactly as
3250 | an integer.  However, if the fixed-point input is too large, the invalid
3251 | exception is raised and the largest positive or negative integer is
3252 | returned.
3253 *----------------------------------------------------------------------------*/
3254 
3255 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
3256                                float_status *status)
3257 {
3258     int8_t roundingMode;
3259     flag roundNearestEven, increment;
3260     int64_t z;
3261 
3262     roundingMode = status->float_rounding_mode;
3263     roundNearestEven = ( roundingMode == float_round_nearest_even );
3264     switch (roundingMode) {
3265     case float_round_nearest_even:
3266     case float_round_ties_away:
3267         increment = ((int64_t) absZ1 < 0);
3268         break;
3269     case float_round_to_zero:
3270         increment = 0;
3271         break;
3272     case float_round_up:
3273         increment = !zSign && absZ1;
3274         break;
3275     case float_round_down:
3276         increment = zSign && absZ1;
3277         break;
3278     default:
3279         abort();
3280     }
3281     if ( increment ) {
3282         ++absZ0;
3283         if ( absZ0 == 0 ) goto overflow;
3284         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
3285     }
3286     z = absZ0;
3287     if ( zSign ) z = - z;
3288     if ( z && ( ( z < 0 ) ^ zSign ) ) {
3289  overflow:
3290         float_raise(float_flag_invalid, status);
3291         return
3292               zSign ? (int64_t) LIT64( 0x8000000000000000 )
3293             : LIT64( 0x7FFFFFFFFFFFFFFF );
3294     }
3295     if (absZ1) {
3296         status->float_exception_flags |= float_flag_inexact;
3297     }
3298     return z;
3299 
3300 }
3301 
3302 /*----------------------------------------------------------------------------
3303 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3304 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3305 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3306 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
3307 | with the inexact exception raised if the input cannot be represented exactly
3308 | as an integer.  However, if the fixed-point input is too large, the invalid
3309 | exception is raised and the largest unsigned integer is returned.
3310 *----------------------------------------------------------------------------*/
3311 
3312 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
3313                                 uint64_t absZ1, float_status *status)
3314 {
3315     int8_t roundingMode;
3316     flag roundNearestEven, increment;
3317 
3318     roundingMode = status->float_rounding_mode;
3319     roundNearestEven = (roundingMode == float_round_nearest_even);
3320     switch (roundingMode) {
3321     case float_round_nearest_even:
3322     case float_round_ties_away:
3323         increment = ((int64_t)absZ1 < 0);
3324         break;
3325     case float_round_to_zero:
3326         increment = 0;
3327         break;
3328     case float_round_up:
3329         increment = !zSign && absZ1;
3330         break;
3331     case float_round_down:
3332         increment = zSign && absZ1;
3333         break;
3334     default:
3335         abort();
3336     }
3337     if (increment) {
3338         ++absZ0;
3339         if (absZ0 == 0) {
3340             float_raise(float_flag_invalid, status);
3341             return LIT64(0xFFFFFFFFFFFFFFFF);
3342         }
3343         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
3344     }
3345 
3346     if (zSign && absZ0) {
3347         float_raise(float_flag_invalid, status);
3348         return 0;
3349     }
3350 
3351     if (absZ1) {
3352         status->float_exception_flags |= float_flag_inexact;
3353     }
3354     return absZ0;
3355 }
3356 
3357 /*----------------------------------------------------------------------------
3358 | If `a' is denormal and we are in flush-to-zero mode then set the
3359 | input-denormal exception and return zero. Otherwise just return the value.
3360 *----------------------------------------------------------------------------*/
3361 float32 float32_squash_input_denormal(float32 a, float_status *status)
3362 {
3363     if (status->flush_inputs_to_zero) {
3364         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
3365             float_raise(float_flag_input_denormal, status);
3366             return make_float32(float32_val(a) & 0x80000000);
3367         }
3368     }
3369     return a;
3370 }
3371 
3372 /*----------------------------------------------------------------------------
3373 | Normalizes the subnormal single-precision floating-point value represented
3374 | by the denormalized significand `aSig'.  The normalized exponent and
3375 | significand are stored at the locations pointed to by `zExpPtr' and
3376 | `zSigPtr', respectively.
3377 *----------------------------------------------------------------------------*/
3378 
static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /* Left shift that moves the leading one of aSig up to bit 23. */
    int8_t norm_shift = clz32(aSig) - 8;

    *zSigPtr = aSig << norm_shift;
    *zExpPtr = 1 - norm_shift;
}
3389 
3390 /*----------------------------------------------------------------------------
3391 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3392 | and significand `zSig', and returns the proper single-precision floating-
3393 | point value corresponding to the abstract input.  Ordinarily, the abstract
3394 | value is simply rounded and packed into the single-precision format, with
3395 | the inexact exception raised if the abstract input cannot be represented
3396 | exactly.  However, if the abstract value is too large, the overflow and
3397 | inexact exceptions are raised and an infinity or maximal finite value is
3398 | returned.  If the abstract value is too small, the input value is rounded to
3399 | a subnormal number, and the underflow and inexact exceptions are raised if
3400 | the abstract input cannot be represented exactly as a subnormal single-
3401 | precision floating-point number.
3402 |     The input significand `zSig' has its binary point between bits 30
3403 | and 29, which is 7 bits to the left of the usual location.  This shifted
3404 | significand must be normalized or smaller.  If `zSig' is not normalized,
3405 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3406 | and it must not require rounding.  In the usual case that `zSig' is
3407 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3408 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3409 | Binary Floating-Point Arithmetic.
3410 *----------------------------------------------------------------------------*/
3411 
3412 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3413                                    float_status *status)
3414 {
3415     int8_t roundingMode;
3416     flag roundNearestEven;
3417     int8_t roundIncrement, roundBits;
3418     flag isTiny;
3419 
3420     roundingMode = status->float_rounding_mode;
3421     roundNearestEven = ( roundingMode == float_round_nearest_even );
3422     switch (roundingMode) {
3423     case float_round_nearest_even:
3424     case float_round_ties_away:
3425         roundIncrement = 0x40;
3426         break;
3427     case float_round_to_zero:
3428         roundIncrement = 0;
3429         break;
3430     case float_round_up:
3431         roundIncrement = zSign ? 0 : 0x7f;
3432         break;
3433     case float_round_down:
3434         roundIncrement = zSign ? 0x7f : 0;
3435         break;
3436     default:
3437         abort();
3438         break;
3439     }
3440     roundBits = zSig & 0x7F;
3441     if ( 0xFD <= (uint16_t) zExp ) {
3442         if (    ( 0xFD < zExp )
3443              || (    ( zExp == 0xFD )
3444                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
3445            ) {
3446             float_raise(float_flag_overflow | float_flag_inexact, status);
3447             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
3448         }
3449         if ( zExp < 0 ) {
3450             if (status->flush_to_zero) {
3451                 float_raise(float_flag_output_denormal, status);
3452                 return packFloat32(zSign, 0, 0);
3453             }
3454             isTiny =
3455                 (status->float_detect_tininess
3456                  == float_tininess_before_rounding)
3457                 || ( zExp < -1 )
3458                 || ( zSig + roundIncrement < 0x80000000 );
3459             shift32RightJamming( zSig, - zExp, &zSig );
3460             zExp = 0;
3461             roundBits = zSig & 0x7F;
3462             if (isTiny && roundBits) {
3463                 float_raise(float_flag_underflow, status);
3464             }
3465         }
3466     }
3467     if (roundBits) {
3468         status->float_exception_flags |= float_flag_inexact;
3469     }
3470     zSig = ( zSig + roundIncrement )>>7;
3471     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3472     if ( zSig == 0 ) zExp = 0;
3473     return packFloat32( zSign, zExp, zSig );
3474 
3475 }
3476 
3477 /*----------------------------------------------------------------------------
3478 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3479 | and significand `zSig', and returns the proper single-precision floating-
3480 | point value corresponding to the abstract input.  This routine is just like
3481 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3482 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3483 | floating-point exponent.
3484 *----------------------------------------------------------------------------*/
3485 
3486 static float32
3487  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3488                               float_status *status)
3489 {
3490     int8_t shiftCount;
3491 
3492     shiftCount = clz32(zSig) - 1;
3493     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
3494                                status);
3495 
3496 }
3497 
3498 /*----------------------------------------------------------------------------
3499 | If `a' is denormal and we are in flush-to-zero mode then set the
3500 | input-denormal exception and return zero. Otherwise just return the value.
3501 *----------------------------------------------------------------------------*/
3502 float64 float64_squash_input_denormal(float64 a, float_status *status)
3503 {
3504     if (status->flush_inputs_to_zero) {
3505         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
3506             float_raise(float_flag_input_denormal, status);
3507             return make_float64(float64_val(a) & (1ULL << 63));
3508         }
3509     }
3510     return a;
3511 }
3512 
/*----------------------------------------------------------------------------
| Normalizes the denormalized significand `aSig' of a subnormal double-
| precision value.  The normalized significand is written through `zSigPtr'
| and the corresponding exponent through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    /* Left shift needed to normalize; the exponent drops by the same count. */
    int8_t normShift = clz64(aSig) - 11;

    *zExpPtr = 1 - normShift;
    *zSigPtr = aSig << normShift;
}
3530 
3531 /*----------------------------------------------------------------------------
3532 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3533 | double-precision floating-point value, returning the result.  After being
3534 | shifted into the proper positions, the three fields are simply added
3535 | together to form the result.  This means that any integer portion of `zSig'
3536 | will be added into the exponent.  Since a properly normalized significand
3537 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3538 | than the desired result exponent whenever `zSig' is a complete, normalized
3539 | significand.
3540 *----------------------------------------------------------------------------*/
3541 
3542 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
3543 {
3544 
3545     return make_float64(
3546         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
3547 
3548 }
3549 
3550 /*----------------------------------------------------------------------------
3551 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3552 | and significand `zSig', and returns the proper double-precision floating-
3553 | point value corresponding to the abstract input.  Ordinarily, the abstract
3554 | value is simply rounded and packed into the double-precision format, with
3555 | the inexact exception raised if the abstract input cannot be represented
3556 | exactly.  However, if the abstract value is too large, the overflow and
3557 | inexact exceptions are raised and an infinity or maximal finite value is
3558 | returned.  If the abstract value is too small, the input value is rounded to
3559 | a subnormal number, and the underflow and inexact exceptions are raised if
3560 | the abstract input cannot be represented exactly as a subnormal double-
3561 | precision floating-point number.
3562 |     The input significand `zSig' has its binary point between bits 62
3563 | and 61, which is 10 bits to the left of the usual location.  This shifted
3564 | significand must be normalized or smaller.  If `zSig' is not normalized,
3565 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3566 | and it must not require rounding.  In the usual case that `zSig' is
3567 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3568 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3569 | Binary Floating-Point Arithmetic.
3570 *----------------------------------------------------------------------------*/
3571 
3572 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3573                                    float_status *status)
3574 {
3575     int8_t roundingMode;
3576     flag roundNearestEven;
3577     int roundIncrement, roundBits;
3578     flag isTiny;
3579 
3580     roundingMode = status->float_rounding_mode;
3581     roundNearestEven = ( roundingMode == float_round_nearest_even );
3582     switch (roundingMode) {
3583     case float_round_nearest_even:
3584     case float_round_ties_away:
3585         roundIncrement = 0x200;
3586         break;
3587     case float_round_to_zero:
3588         roundIncrement = 0;
3589         break;
3590     case float_round_up:
3591         roundIncrement = zSign ? 0 : 0x3ff;
3592         break;
3593     case float_round_down:
3594         roundIncrement = zSign ? 0x3ff : 0;
3595         break;
3596     case float_round_to_odd:
3597         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3598         break;
3599     default:
3600         abort();
3601     }
3602     roundBits = zSig & 0x3FF;
3603     if ( 0x7FD <= (uint16_t) zExp ) {
3604         if (    ( 0x7FD < zExp )
3605              || (    ( zExp == 0x7FD )
3606                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
3607            ) {
3608             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3609                                    roundIncrement != 0;
3610             float_raise(float_flag_overflow | float_flag_inexact, status);
3611             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
3612         }
3613         if ( zExp < 0 ) {
3614             if (status->flush_to_zero) {
3615                 float_raise(float_flag_output_denormal, status);
3616                 return packFloat64(zSign, 0, 0);
3617             }
3618             isTiny =
3619                    (status->float_detect_tininess
3620                     == float_tininess_before_rounding)
3621                 || ( zExp < -1 )
3622                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
3623             shift64RightJamming( zSig, - zExp, &zSig );
3624             zExp = 0;
3625             roundBits = zSig & 0x3FF;
3626             if (isTiny && roundBits) {
3627                 float_raise(float_flag_underflow, status);
3628             }
3629             if (roundingMode == float_round_to_odd) {
3630                 /*
3631                  * For round-to-odd case, the roundIncrement depends on
3632                  * zSig which just changed.
3633                  */
3634                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3635             }
3636         }
3637     }
3638     if (roundBits) {
3639         status->float_exception_flags |= float_flag_inexact;
3640     }
3641     zSig = ( zSig + roundIncrement )>>10;
3642     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
3643     if ( zSig == 0 ) zExp = 0;
3644     return packFloat64( zSign, zExp, zSig );
3645 
3646 }
3647 
3648 /*----------------------------------------------------------------------------
3649 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3650 | and significand `zSig', and returns the proper double-precision floating-
3651 | point value corresponding to the abstract input.  This routine is just like
3652 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3653 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3654 | floating-point exponent.
3655 *----------------------------------------------------------------------------*/
3656 
3657 static float64
3658  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3659                               float_status *status)
3660 {
3661     int8_t shiftCount;
3662 
3663     shiftCount = clz64(zSig) - 1;
3664     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
3665                                status);
3666 
3667 }
3668 
/*----------------------------------------------------------------------------
| Normalizes the denormalized significand `aSig' of a subnormal extended
| double-precision value.  The normalized significand is written through
| `zSigPtr' and the corresponding exponent through `zExpPtr'.
*----------------------------------------------------------------------------*/

void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    /* Left shift needed to normalize; the exponent drops by the same count. */
    int8_t normShift = clz64(aSig);

    *zExpPtr = 1 - normShift;
    *zSigPtr = aSig << normShift;
}
3685 
3686 /*----------------------------------------------------------------------------
3687 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3688 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3689 | and returns the proper extended double-precision floating-point value
3690 | corresponding to the abstract input.  Ordinarily, the abstract value is
3691 | rounded and packed into the extended double-precision format, with the
3692 | inexact exception raised if the abstract input cannot be represented
3693 | exactly.  However, if the abstract value is too large, the overflow and
3694 | inexact exceptions are raised and an infinity or maximal finite value is
3695 | returned.  If the abstract value is too small, the input value is rounded to
3696 | a subnormal number, and the underflow and inexact exceptions are raised if
3697 | the abstract input cannot be represented exactly as a subnormal extended
3698 | double-precision floating-point number.
3699 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
3700 | number of bits as single or double precision, respectively.  Otherwise, the
3701 | result is rounded to the full precision of the extended double-precision
3702 | format.
3703 |     The input significand must be normalized or smaller.  If the input
3704 | significand is not normalized, `zExp' must be 0; in that case, the result
3705 | returned is a subnormal number, and it must not require rounding.  The
3706 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3707 | Floating-Point Arithmetic.
3708 *----------------------------------------------------------------------------*/
3709 
3710 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
3711                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
3712                               float_status *status)
3713 {
3714     int8_t roundingMode;
3715     flag roundNearestEven, increment, isTiny;
3716     int64_t roundIncrement, roundMask, roundBits;
3717 
3718     roundingMode = status->float_rounding_mode;
3719     roundNearestEven = ( roundingMode == float_round_nearest_even );
3720     if ( roundingPrecision == 80 ) goto precision80;
3721     if ( roundingPrecision == 64 ) {
3722         roundIncrement = LIT64( 0x0000000000000400 );
3723         roundMask = LIT64( 0x00000000000007FF );
3724     }
3725     else if ( roundingPrecision == 32 ) {
3726         roundIncrement = LIT64( 0x0000008000000000 );
3727         roundMask = LIT64( 0x000000FFFFFFFFFF );
3728     }
3729     else {
3730         goto precision80;
3731     }
3732     zSig0 |= ( zSig1 != 0 );
3733     switch (roundingMode) {
3734     case float_round_nearest_even:
3735     case float_round_ties_away:
3736         break;
3737     case float_round_to_zero:
3738         roundIncrement = 0;
3739         break;
3740     case float_round_up:
3741         roundIncrement = zSign ? 0 : roundMask;
3742         break;
3743     case float_round_down:
3744         roundIncrement = zSign ? roundMask : 0;
3745         break;
3746     default:
3747         abort();
3748     }
3749     roundBits = zSig0 & roundMask;
3750     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3751         if (    ( 0x7FFE < zExp )
3752              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
3753            ) {
3754             goto overflow;
3755         }
3756         if ( zExp <= 0 ) {
3757             if (status->flush_to_zero) {
3758                 float_raise(float_flag_output_denormal, status);
3759                 return packFloatx80(zSign, 0, 0);
3760             }
3761             isTiny =
3762                    (status->float_detect_tininess
3763                     == float_tininess_before_rounding)
3764                 || ( zExp < 0 )
3765                 || ( zSig0 <= zSig0 + roundIncrement );
3766             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
3767             zExp = 0;
3768             roundBits = zSig0 & roundMask;
3769             if (isTiny && roundBits) {
3770                 float_raise(float_flag_underflow, status);
3771             }
3772             if (roundBits) {
3773                 status->float_exception_flags |= float_flag_inexact;
3774             }
3775             zSig0 += roundIncrement;
3776             if ( (int64_t) zSig0 < 0 ) zExp = 1;
3777             roundIncrement = roundMask + 1;
3778             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3779                 roundMask |= roundIncrement;
3780             }
3781             zSig0 &= ~ roundMask;
3782             return packFloatx80( zSign, zExp, zSig0 );
3783         }
3784     }
3785     if (roundBits) {
3786         status->float_exception_flags |= float_flag_inexact;
3787     }
3788     zSig0 += roundIncrement;
3789     if ( zSig0 < roundIncrement ) {
3790         ++zExp;
3791         zSig0 = LIT64( 0x8000000000000000 );
3792     }
3793     roundIncrement = roundMask + 1;
3794     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3795         roundMask |= roundIncrement;
3796     }
3797     zSig0 &= ~ roundMask;
3798     if ( zSig0 == 0 ) zExp = 0;
3799     return packFloatx80( zSign, zExp, zSig0 );
3800  precision80:
3801     switch (roundingMode) {
3802     case float_round_nearest_even:
3803     case float_round_ties_away:
3804         increment = ((int64_t)zSig1 < 0);
3805         break;
3806     case float_round_to_zero:
3807         increment = 0;
3808         break;
3809     case float_round_up:
3810         increment = !zSign && zSig1;
3811         break;
3812     case float_round_down:
3813         increment = zSign && zSig1;
3814         break;
3815     default:
3816         abort();
3817     }
3818     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3819         if (    ( 0x7FFE < zExp )
3820              || (    ( zExp == 0x7FFE )
3821                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
3822                   && increment
3823                 )
3824            ) {
3825             roundMask = 0;
3826  overflow:
3827             float_raise(float_flag_overflow | float_flag_inexact, status);
3828             if (    ( roundingMode == float_round_to_zero )
3829                  || ( zSign && ( roundingMode == float_round_up ) )
3830                  || ( ! zSign && ( roundingMode == float_round_down ) )
3831                ) {
3832                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
3833             }
3834             return packFloatx80(zSign,
3835                                 floatx80_infinity_high,
3836                                 floatx80_infinity_low);
3837         }
3838         if ( zExp <= 0 ) {
3839             isTiny =
3840                    (status->float_detect_tininess
3841                     == float_tininess_before_rounding)
3842                 || ( zExp < 0 )
3843                 || ! increment
3844                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
3845             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
3846             zExp = 0;
3847             if (isTiny && zSig1) {
3848                 float_raise(float_flag_underflow, status);
3849             }
3850             if (zSig1) {
3851                 status->float_exception_flags |= float_flag_inexact;
3852             }
3853             switch (roundingMode) {
3854             case float_round_nearest_even:
3855             case float_round_ties_away:
3856                 increment = ((int64_t)zSig1 < 0);
3857                 break;
3858             case float_round_to_zero:
3859                 increment = 0;
3860                 break;
3861             case float_round_up:
3862                 increment = !zSign && zSig1;
3863                 break;
3864             case float_round_down:
3865                 increment = zSign && zSig1;
3866                 break;
3867             default:
3868                 abort();
3869             }
3870             if ( increment ) {
3871                 ++zSig0;
3872                 zSig0 &=
3873                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3874                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
3875             }
3876             return packFloatx80( zSign, zExp, zSig0 );
3877         }
3878     }
3879     if (zSig1) {
3880         status->float_exception_flags |= float_flag_inexact;
3881     }
3882     if ( increment ) {
3883         ++zSig0;
3884         if ( zSig0 == 0 ) {
3885             ++zExp;
3886             zSig0 = LIT64( 0x8000000000000000 );
3887         }
3888         else {
3889             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3890         }
3891     }
3892     else {
3893         if ( zSig0 == 0 ) zExp = 0;
3894     }
3895     return packFloatx80( zSign, zExp, zSig0 );
3896 
3897 }
3898 
3899 /*----------------------------------------------------------------------------
3900 | Takes an abstract floating-point value having sign `zSign', exponent
3901 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
3902 | and returns the proper extended double-precision floating-point value
3903 | corresponding to the abstract input.  This routine is just like
3904 | `roundAndPackFloatx80' except that the input significand does not have to be
3905 | normalized.
3906 *----------------------------------------------------------------------------*/
3907 
3908 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
3909                                        flag zSign, int32_t zExp,
3910                                        uint64_t zSig0, uint64_t zSig1,
3911                                        float_status *status)
3912 {
3913     int8_t shiftCount;
3914 
3915     if ( zSig0 == 0 ) {
3916         zSig0 = zSig1;
3917         zSig1 = 0;
3918         zExp -= 64;
3919     }
3920     shiftCount = clz64(zSig0);
3921     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3922     zExp -= shiftCount;
3923     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
3924                                 zSig0, zSig1, status);
3925 
3926 }
3927 
3928 /*----------------------------------------------------------------------------
3929 | Returns the least-significant 64 fraction bits of the quadruple-precision
3930 | floating-point value `a'.
3931 *----------------------------------------------------------------------------*/
3932 
3933 static inline uint64_t extractFloat128Frac1( float128 a )
3934 {
3935 
3936     return a.low;
3937 
3938 }
3939 
3940 /*----------------------------------------------------------------------------
3941 | Returns the most-significant 48 fraction bits of the quadruple-precision
3942 | floating-point value `a'.
3943 *----------------------------------------------------------------------------*/
3944 
3945 static inline uint64_t extractFloat128Frac0( float128 a )
3946 {
3947 
3948     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
3949 
3950 }
3951 
3952 /*----------------------------------------------------------------------------
3953 | Returns the exponent bits of the quadruple-precision floating-point value
3954 | `a'.
3955 *----------------------------------------------------------------------------*/
3956 
3957 static inline int32_t extractFloat128Exp( float128 a )
3958 {
3959 
3960     return ( a.high>>48 ) & 0x7FFF;
3961 
3962 }
3963 
3964 /*----------------------------------------------------------------------------
3965 | Returns the sign bit of the quadruple-precision floating-point value `a'.
3966 *----------------------------------------------------------------------------*/
3967 
3968 static inline flag extractFloat128Sign( float128 a )
3969 {
3970 
3971     return a.high>>63;
3972 
3973 }
3974 
/*----------------------------------------------------------------------------
| Normalizes the denormalized significand, formed by concatenating `aSig0'
| and `aSig1', of a subnormal quadruple-precision value.  The normalized
| exponent is written through `zExpPtr'.  The most significant 49 bits of the
| normalized significand are written through `zSig0Ptr' and the least
| significant 64 bits through `zSig1Ptr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t normShift;

    if (aSig0 != 0) {
        /* Leading bit is in the high word: a plain 128-bit left shift. */
        normShift = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, normShift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - normShift;
        return;
    }

    /* High word empty: the low word is spread over both output words. */
    normShift = clz64(aSig1) - 15;
    if (normShift >= 0) {
        *zSig0Ptr = aSig1 << normShift;
        *zSig1Ptr = 0;
    } else {
        /* Top bits go to the high word, the remainder stays low. */
        *zSig0Ptr = aSig1 >> (-normShift);
        *zSig1Ptr = aSig1 << (normShift & 63);
    }
    *zExpPtr = -normShift - 63;
}
4015 
4016 /*----------------------------------------------------------------------------
4017 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4018 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4019 | floating-point value, returning the result.  After being shifted into the
4020 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4021 | added together to form the most significant 32 bits of the result.  This
4022 | means that any integer portion of `zSig0' will be added into the exponent.
4023 | Since a properly normalized significand will have an integer portion equal
4024 | to 1, the `zExp' input should be 1 less than the desired result exponent
4025 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4026 | significand.
4027 *----------------------------------------------------------------------------*/
4028 
4029 static inline float128
4030  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
4031 {
4032     float128 z;
4033 
4034     z.low = zSig1;
4035     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
4036     return z;
4037 
4038 }
4039 
4040 /*----------------------------------------------------------------------------
4041 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4042 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4043 | and `zSig2', and returns the proper quadruple-precision floating-point value
4044 | corresponding to the abstract input.  Ordinarily, the abstract value is
4045 | simply rounded and packed into the quadruple-precision format, with the
4046 | inexact exception raised if the abstract input cannot be represented
4047 | exactly.  However, if the abstract value is too large, the overflow and
4048 | inexact exceptions are raised and an infinity or maximal finite value is
4049 | returned.  If the abstract value is too small, the input value is rounded to
4050 | a subnormal number, and the underflow and inexact exceptions are raised if
4051 | the abstract input cannot be represented exactly as a subnormal quadruple-
4052 | precision floating-point number.
4053 |     The input significand must be normalized or smaller.  If the input
4054 | significand is not normalized, `zExp' must be 0; in that case, the result
4055 | returned is a subnormal number, and it must not require rounding.  In the
4056 | usual case that the input significand is normalized, `zExp' must be 1 less
4057 | than the ``true'' floating-point exponent.  The handling of underflow and
4058 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4059 *----------------------------------------------------------------------------*/
4060 
4061 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
4062                                      uint64_t zSig0, uint64_t zSig1,
4063                                      uint64_t zSig2, float_status *status)
4064 {
4065     int8_t roundingMode;
4066     flag roundNearestEven, increment, isTiny;
4067 
4068     roundingMode = status->float_rounding_mode;
4069     roundNearestEven = ( roundingMode == float_round_nearest_even );
4070     switch (roundingMode) {
4071     case float_round_nearest_even:
4072     case float_round_ties_away:
4073         increment = ((int64_t)zSig2 < 0);
4074         break;
4075     case float_round_to_zero:
4076         increment = 0;
4077         break;
4078     case float_round_up:
4079         increment = !zSign && zSig2;
4080         break;
4081     case float_round_down:
4082         increment = zSign && zSig2;
4083         break;
4084     case float_round_to_odd:
4085         increment = !(zSig1 & 0x1) && zSig2;
4086         break;
4087     default:
4088         abort();
4089     }
4090     if ( 0x7FFD <= (uint32_t) zExp ) {
4091         if (    ( 0x7FFD < zExp )
4092              || (    ( zExp == 0x7FFD )
4093                   && eq128(
4094                          LIT64( 0x0001FFFFFFFFFFFF ),
4095                          LIT64( 0xFFFFFFFFFFFFFFFF ),
4096                          zSig0,
4097                          zSig1
4098                      )
4099                   && increment
4100                 )
4101            ) {
4102             float_raise(float_flag_overflow | float_flag_inexact, status);
4103             if (    ( roundingMode == float_round_to_zero )
4104                  || ( zSign && ( roundingMode == float_round_up ) )
4105                  || ( ! zSign && ( roundingMode == float_round_down ) )
4106                  || (roundingMode == float_round_to_odd)
4107                ) {
4108                 return
4109                     packFloat128(
4110                         zSign,
4111                         0x7FFE,
4112                         LIT64( 0x0000FFFFFFFFFFFF ),
4113                         LIT64( 0xFFFFFFFFFFFFFFFF )
4114                     );
4115             }
4116             return packFloat128( zSign, 0x7FFF, 0, 0 );
4117         }
4118         if ( zExp < 0 ) {
4119             if (status->flush_to_zero) {
4120                 float_raise(float_flag_output_denormal, status);
4121                 return packFloat128(zSign, 0, 0, 0);
4122             }
4123             isTiny =
4124                    (status->float_detect_tininess
4125                     == float_tininess_before_rounding)
4126                 || ( zExp < -1 )
4127                 || ! increment
4128                 || lt128(
4129                        zSig0,
4130                        zSig1,
4131                        LIT64( 0x0001FFFFFFFFFFFF ),
4132                        LIT64( 0xFFFFFFFFFFFFFFFF )
4133                    );
4134             shift128ExtraRightJamming(
4135                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4136             zExp = 0;
4137             if (isTiny && zSig2) {
4138                 float_raise(float_flag_underflow, status);
4139             }
4140             switch (roundingMode) {
4141             case float_round_nearest_even:
4142             case float_round_ties_away:
4143                 increment = ((int64_t)zSig2 < 0);
4144                 break;
4145             case float_round_to_zero:
4146                 increment = 0;
4147                 break;
4148             case float_round_up:
4149                 increment = !zSign && zSig2;
4150                 break;
4151             case float_round_down:
4152                 increment = zSign && zSig2;
4153                 break;
4154             case float_round_to_odd:
4155                 increment = !(zSig1 & 0x1) && zSig2;
4156                 break;
4157             default:
4158                 abort();
4159             }
4160         }
4161     }
4162     if (zSig2) {
4163         status->float_exception_flags |= float_flag_inexact;
4164     }
4165     if ( increment ) {
4166         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4167         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
4168     }
4169     else {
4170         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4171     }
4172     return packFloat128( zSign, zExp, zSig0, zSig1 );
4173 
4174 }
4175 
4176 /*----------------------------------------------------------------------------
4177 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4178 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4179 | returns the proper quadruple-precision floating-point value corresponding
4180 | to the abstract input.  This routine is just like `roundAndPackFloat128'
4181 | except that the input significand has fewer bits and does not have to be
4182 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
4183 | point exponent.
4184 *----------------------------------------------------------------------------*/
4185 
4186 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
4187                                               uint64_t zSig0, uint64_t zSig1,
4188                                               float_status *status)
4189 {
4190     int8_t shiftCount;
4191     uint64_t zSig2;
4192 
4193     if ( zSig0 == 0 ) {
4194         zSig0 = zSig1;
4195         zSig1 = 0;
4196         zExp -= 64;
4197     }
4198     shiftCount = clz64(zSig0) - 15;
4199     if ( 0 <= shiftCount ) {
4200         zSig2 = 0;
4201         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4202     }
4203     else {
4204         shift128ExtraRightJamming(
4205             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4206     }
4207     zExp -= shiftCount;
4208     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4209 
4210 }
4211 
4212 
4213 /*----------------------------------------------------------------------------
4214 | Returns the result of converting the 32-bit two's complement integer `a'
4215 | to the extended double-precision floating-point format.  The conversion
4216 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4217 | Arithmetic.
4218 *----------------------------------------------------------------------------*/
4219 
4220 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4221 {
4222     flag zSign;
4223     uint32_t absA;
4224     int8_t shiftCount;
4225     uint64_t zSig;
4226 
4227     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4228     zSign = ( a < 0 );
4229     absA = zSign ? - a : a;
4230     shiftCount = clz32(absA) + 32;
4231     zSig = absA;
4232     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4233 
4234 }
4235 
4236 /*----------------------------------------------------------------------------
4237 | Returns the result of converting the 32-bit two's complement integer `a' to
4238 | the quadruple-precision floating-point format.  The conversion is performed
4239 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4240 *----------------------------------------------------------------------------*/
4241 
4242 float128 int32_to_float128(int32_t a, float_status *status)
4243 {
4244     flag zSign;
4245     uint32_t absA;
4246     int8_t shiftCount;
4247     uint64_t zSig0;
4248 
4249     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4250     zSign = ( a < 0 );
4251     absA = zSign ? - a : a;
4252     shiftCount = clz32(absA) + 17;
4253     zSig0 = absA;
4254     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4255 
4256 }
4257 
4258 /*----------------------------------------------------------------------------
4259 | Returns the result of converting the 64-bit two's complement integer `a'
4260 | to the extended double-precision floating-point format.  The conversion
4261 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4262 | Arithmetic.
4263 *----------------------------------------------------------------------------*/
4264 
4265 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4266 {
4267     flag zSign;
4268     uint64_t absA;
4269     int8_t shiftCount;
4270 
4271     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4272     zSign = ( a < 0 );
4273     absA = zSign ? - a : a;
4274     shiftCount = clz64(absA);
4275     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4276 
4277 }
4278 
4279 /*----------------------------------------------------------------------------
4280 | Returns the result of converting the 64-bit two's complement integer `a' to
4281 | the quadruple-precision floating-point format.  The conversion is performed
4282 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4283 *----------------------------------------------------------------------------*/
4284 
4285 float128 int64_to_float128(int64_t a, float_status *status)
4286 {
4287     flag zSign;
4288     uint64_t absA;
4289     int8_t shiftCount;
4290     int32_t zExp;
4291     uint64_t zSig0, zSig1;
4292 
4293     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4294     zSign = ( a < 0 );
4295     absA = zSign ? - a : a;
4296     shiftCount = clz64(absA) + 49;
4297     zExp = 0x406E - shiftCount;
4298     if ( 64 <= shiftCount ) {
4299         zSig1 = 0;
4300         zSig0 = absA;
4301         shiftCount -= 64;
4302     }
4303     else {
4304         zSig1 = absA;
4305         zSig0 = 0;
4306     }
4307     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4308     return packFloat128( zSign, zExp, zSig0, zSig1 );
4309 
4310 }
4311 
4312 /*----------------------------------------------------------------------------
4313 | Returns the result of converting the 64-bit unsigned integer `a'
4314 | to the quadruple-precision floating-point format.  The conversion is performed
4315 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4316 *----------------------------------------------------------------------------*/
4317 
4318 float128 uint64_to_float128(uint64_t a, float_status *status)
4319 {
4320     if (a == 0) {
4321         return float128_zero;
4322     }
4323     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4324 }
4325 
4326 /*----------------------------------------------------------------------------
4327 | Returns the result of converting the single-precision floating-point value
4328 | `a' to the extended double-precision floating-point format.  The conversion
4329 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4330 | Arithmetic.
4331 *----------------------------------------------------------------------------*/
4332 
4333 floatx80 float32_to_floatx80(float32 a, float_status *status)
4334 {
4335     flag aSign;
4336     int aExp;
4337     uint32_t aSig;
4338 
4339     a = float32_squash_input_denormal(a, status);
4340     aSig = extractFloat32Frac( a );
4341     aExp = extractFloat32Exp( a );
4342     aSign = extractFloat32Sign( a );
4343     if ( aExp == 0xFF ) {
4344         if (aSig) {
4345             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
4346         }
4347         return packFloatx80(aSign,
4348                             floatx80_infinity_high,
4349                             floatx80_infinity_low);
4350     }
4351     if ( aExp == 0 ) {
4352         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4353         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4354     }
4355     aSig |= 0x00800000;
4356     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4357 
4358 }
4359 
4360 /*----------------------------------------------------------------------------
4361 | Returns the result of converting the single-precision floating-point value
4362 | `a' to the quadruple-precision floating-point format.  The conversion is
4363 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4364 | Arithmetic.
4365 *----------------------------------------------------------------------------*/
4366 
4367 float128 float32_to_float128(float32 a, float_status *status)
4368 {
4369     flag aSign;
4370     int aExp;
4371     uint32_t aSig;
4372 
4373     a = float32_squash_input_denormal(a, status);
4374     aSig = extractFloat32Frac( a );
4375     aExp = extractFloat32Exp( a );
4376     aSign = extractFloat32Sign( a );
4377     if ( aExp == 0xFF ) {
4378         if (aSig) {
4379             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4380         }
4381         return packFloat128( aSign, 0x7FFF, 0, 0 );
4382     }
4383     if ( aExp == 0 ) {
4384         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4385         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4386         --aExp;
4387     }
4388     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4389 
4390 }
4391 
4392 /*----------------------------------------------------------------------------
4393 | Returns the remainder of the single-precision floating-point value `a'
4394 | with respect to the corresponding value `b'.  The operation is performed
4395 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4396 *----------------------------------------------------------------------------*/
4397 
4398 float32 float32_rem(float32 a, float32 b, float_status *status)
4399 {
         /*
          * Computes the remainder of `a' with respect to `b' on the raw
          * significands.  A partial quotient is accumulated in q, and the
          * final step picks whichever of the last two candidate remainders
          * is smaller in magnitude (ties go to the even quotient).
          */
4400     flag aSign, zSign;
4401     int aExp, bExp, expDiff;
4402     uint32_t aSig, bSig;
4403     uint32_t q;
4404     uint64_t aSig64, bSig64, q64;
4405     uint32_t alternateASig;
4406     int32_t sigMean;
4407     a = float32_squash_input_denormal(a, status);
4408     b = float32_squash_input_denormal(b, status);
4409 
4410     aSig = extractFloat32Frac( a );
4411     aExp = extractFloat32Exp( a );
4412     aSign = extractFloat32Sign( a );
4413     bSig = extractFloat32Frac( b );
4414     bExp = extractFloat32Exp( b );
4415     if ( aExp == 0xFF ) {  /* a is infinity or NaN */
4416         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {  /* either operand is a NaN */
4417             return propagateFloat32NaN(a, b, status);
4418         }
             /* a is infinite and b is not a NaN: invalid operation */
4419         float_raise(float_flag_invalid, status);
4420         return float32_default_nan(status);
4421     }
4422     if ( bExp == 0xFF ) {  /* b is infinity or NaN, a is finite */
4423         if (bSig) {
4424             return propagateFloat32NaN(a, b, status);
4425         }
4426         return a;  /* b is infinite: a is returned unchanged */
4427     }
4428     if ( bExp == 0 ) {  /* b is zero or subnormal */
4429         if ( bSig == 0 ) {  /* b is zero: invalid operation */
4430             float_raise(float_flag_invalid, status);
4431             return float32_default_nan(status);
4432         }
4433         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
4434     }
4435     if ( aExp == 0 ) {  /* a is zero or subnormal */
4436         if ( aSig == 0 ) return a;  /* a is zero: returned unchanged */
4437         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4438     }
4439     expDiff = aExp - bExp;
4440     aSig |= 0x00800000;  /* set bit 23, the implicit leading one */
4441     bSig |= 0x00800000;
4442     if ( expDiff < 32 ) {
             /* Small exponent gap: a single 64-by-32 division is enough. */
4443         aSig <<= 8;
4444         bSig <<= 8;
4445         if ( expDiff < 0 ) {
4446             if ( expDiff < -1 ) return a;  /* a is returned as its own remainder */
4447             aSig >>= 1;  /* expDiff == -1: align a one bit lower */
4448         }
4449         q = ( bSig <= aSig );  /* leading quotient bit */
4450         if ( q ) aSig -= bSig;
4451         if ( 0 < expDiff ) {
4452             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
4453             q >>= 32 - expDiff;  /* keep only the top expDiff quotient bits */
4454             bSig >>= 2;
4455             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4456         }
4457         else {
4458             aSig >>= 2;
4459             bSig >>= 2;
4460         }
4461     }
4462     else {
             /* Large exponent gap: reduce the exponent difference by 62 per pass. */
4463         if ( bSig <= aSig ) aSig -= bSig;
4464         aSig64 = ( (uint64_t) aSig )<<40;
4465         bSig64 = ( (uint64_t) bSig )<<40;
4466         expDiff -= 64;
4467         while ( 0 < expDiff ) {
4468             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
                 /*
                  * Lower the estimate by 2, clamped at 0.
                  * NOTE(review): presumably this keeps the estimate from
                  * exceeding the exact quotient -- confirm against
                  * estimateDiv128To64's contract.
                  */
4469             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4470             aSig64 = - ( ( bSig * q64 )<<38 );
4471             expDiff -= 62;
4472         }
4473         expDiff += 64;
4474         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4475         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4476         q = q64>>( 64 - expDiff );
4477         bSig <<= 6;
4478         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
4479     }
         /*
          * Keep subtracting bSig until aSig goes negative (as int32_t).
          * alternateASig holds the value before the final subtraction and
          * q counts every subtraction made.
          */
4480     do {
4481         alternateASig = aSig;
4482         ++q;
4483         aSig -= bSig;
4484     } while ( 0 <= (int32_t) aSig );
         /*
          * sigMean < 0 means the negative candidate has the larger
          * magnitude, so the non-negative one is kept; on an exact tie the
          * non-negative one is kept when q is odd.
          */
4485     sigMean = aSig + alternateASig;
4486     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4487         aSig = alternateASig;
4488     }
4489     zSign = ( (int32_t) aSig < 0 );
4490     if ( zSign ) aSig = - aSig;  /* take the magnitude; sign is carried by zSign */
4491     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
4492 }
4493 
4494 
4495 
4496 /*----------------------------------------------------------------------------
4497 | Returns the binary exponential of the single-precision floating-point value
4498 | `a'. The operation is performed according to the IEC/IEEE Standard for
4499 | Binary Floating-Point Arithmetic.
4500 |
4501 | Uses the following identities:
4502 |
4503 | 1. -------------------------------------------------------------------------
4504 |      x    x*ln(2)
4505 |     2  = e
4506 |
4507 | 2. -------------------------------------------------------------------------
4508 |                      2     3     4     5           n
4509 |      x        x     x     x     x     x           x
4510 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
4511 |               1!    2!    3!    4!    5!          n!
4512 *----------------------------------------------------------------------------*/
4513 
4514 static const float64 float32_exp2_coefficients[15] =
4515 {
4516     const_float64( 0x3ff0000000000000ll ), /*  1 */
4517     const_float64( 0x3fe0000000000000ll ), /*  2 */
4518     const_float64( 0x3fc5555555555555ll ), /*  3 */
4519     const_float64( 0x3fa5555555555555ll ), /*  4 */
4520     const_float64( 0x3f81111111111111ll ), /*  5 */
4521     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
4522     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
4523     const_float64( 0x3efa01a01a01a01all ), /*  8 */
4524     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
4525     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
4526     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
4527     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
4528     const_float64( 0x3de6124613a86d09ll ), /* 13 */
4529     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
4530     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
4531 };
4532 
4533 float32 float32_exp2(float32 a, float_status *status)
4534 {
4535     flag aSign;
4536     int aExp;
4537     uint32_t aSig;
4538     float64 r, x, xn;
4539     int i;
4540     a = float32_squash_input_denormal(a, status);
4541 
4542     aSig = extractFloat32Frac( a );
4543     aExp = extractFloat32Exp( a );
4544     aSign = extractFloat32Sign( a );
4545 
4546     if ( aExp == 0xFF) {
4547         if (aSig) {
4548             return propagateFloat32NaN(a, float32_zero, status);
4549         }
4550         return (aSign) ? float32_zero : a;
4551     }
4552     if (aExp == 0) {
4553         if (aSig == 0) return float32_one;
4554     }
4555 
4556     float_raise(float_flag_inexact, status);
4557 
4558     /* ******************************* */
4559     /* using float64 for approximation */
4560     /* ******************************* */
4561     x = float32_to_float64(a, status);
4562     x = float64_mul(x, float64_ln2, status);
4563 
4564     xn = x;
4565     r = float64_one;
4566     for (i = 0 ; i < 15 ; i++) {
4567         float64 f;
4568 
4569         f = float64_mul(xn, float32_exp2_coefficients[i], status);
4570         r = float64_add(r, f, status);
4571 
4572         xn = float64_mul(xn, x, status);
4573     }
4574 
4575     return float64_to_float32(r, status);
4576 }
4577 
4578 /*----------------------------------------------------------------------------
4579 | Returns the binary log of the single-precision floating-point value `a'.
4580 | The operation is performed according to the IEC/IEEE Standard for Binary
4581 | Floating-Point Arithmetic.
4582 *----------------------------------------------------------------------------*/
4583 float32 float32_log2(float32 a, float_status *status)
4584 {
4585     flag aSign, zSign;
4586     int aExp;
4587     uint32_t aSig, zSig, i;
4588 
4589     a = float32_squash_input_denormal(a, status);
4590     aSig = extractFloat32Frac( a );
4591     aExp = extractFloat32Exp( a );
4592     aSign = extractFloat32Sign( a );
4593 
4594     if ( aExp == 0 ) {
4595         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
4596         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4597     }
4598     if ( aSign ) {
4599         float_raise(float_flag_invalid, status);
4600         return float32_default_nan(status);
4601     }
4602     if ( aExp == 0xFF ) {
4603         if (aSig) {
4604             return propagateFloat32NaN(a, float32_zero, status);
4605         }
4606         return a;
4607     }
4608 
4609     aExp -= 0x7F;
4610     aSig |= 0x00800000;
4611     zSign = aExp < 0;
4612     zSig = aExp << 23;
4613 
4614     for (i = 1 << 22; i > 0; i >>= 1) {
4615         aSig = ( (uint64_t)aSig * aSig ) >> 23;
4616         if ( aSig & 0x01000000 ) {
4617             aSig >>= 1;
4618             zSig |= i;
4619         }
4620     }
4621 
4622     if ( zSign )
4623         zSig = -zSig;
4624 
4625     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
4626 }
4627 
4628 /*----------------------------------------------------------------------------
4629 | Returns 1 if the single-precision floating-point value `a' is equal to
4630 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4631 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4632 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4633 *----------------------------------------------------------------------------*/
4634 
4635 int float32_eq(float32 a, float32 b, float_status *status)
4636 {
4637     uint32_t av, bv;
4638     a = float32_squash_input_denormal(a, status);
4639     b = float32_squash_input_denormal(b, status);
4640 
4641     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4642          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4643        ) {
4644         float_raise(float_flag_invalid, status);
4645         return 0;
4646     }
4647     av = float32_val(a);
4648     bv = float32_val(b);
4649     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4650 }
4651 
4652 /*----------------------------------------------------------------------------
4653 | Returns 1 if the single-precision floating-point value `a' is less than
4654 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
4655 | exception is raised if either operand is a NaN.  The comparison is performed
4656 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4657 *----------------------------------------------------------------------------*/
4658 
4659 int float32_le(float32 a, float32 b, float_status *status)
4660 {
4661     flag aSign, bSign;
4662     uint32_t av, bv;
4663     a = float32_squash_input_denormal(a, status);
4664     b = float32_squash_input_denormal(b, status);
4665 
4666     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4667          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4668        ) {
4669         float_raise(float_flag_invalid, status);
4670         return 0;
4671     }
4672     aSign = extractFloat32Sign( a );
4673     bSign = extractFloat32Sign( b );
4674     av = float32_val(a);
4675     bv = float32_val(b);
4676     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4677     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4678 
4679 }
4680 
4681 /*----------------------------------------------------------------------------
4682 | Returns 1 if the single-precision floating-point value `a' is less than
4683 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4684 | raised if either operand is a NaN.  The comparison is performed according
4685 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4686 *----------------------------------------------------------------------------*/
4687 
4688 int float32_lt(float32 a, float32 b, float_status *status)
4689 {
4690     flag aSign, bSign;
4691     uint32_t av, bv;
4692     a = float32_squash_input_denormal(a, status);
4693     b = float32_squash_input_denormal(b, status);
4694 
4695     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4696          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4697        ) {
4698         float_raise(float_flag_invalid, status);
4699         return 0;
4700     }
4701     aSign = extractFloat32Sign( a );
4702     bSign = extractFloat32Sign( b );
4703     av = float32_val(a);
4704     bv = float32_val(b);
4705     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4706     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4707 
4708 }
4709 
4710 /*----------------------------------------------------------------------------
4711 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4712 | be compared, and 0 otherwise.  The invalid exception is raised if either
4713 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4714 | Standard for Binary Floating-Point Arithmetic.
4715 *----------------------------------------------------------------------------*/
4716 
4717 int float32_unordered(float32 a, float32 b, float_status *status)
4718 {
4719     a = float32_squash_input_denormal(a, status);
4720     b = float32_squash_input_denormal(b, status);
4721 
4722     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4723          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4724        ) {
4725         float_raise(float_flag_invalid, status);
4726         return 1;
4727     }
4728     return 0;
4729 }
4730 
4731 /*----------------------------------------------------------------------------
4732 | Returns 1 if the single-precision floating-point value `a' is equal to
4733 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4734 | exception.  The comparison is performed according to the IEC/IEEE Standard
4735 | for Binary Floating-Point Arithmetic.
4736 *----------------------------------------------------------------------------*/
4737 
4738 int float32_eq_quiet(float32 a, float32 b, float_status *status)
4739 {
4740     a = float32_squash_input_denormal(a, status);
4741     b = float32_squash_input_denormal(b, status);
4742 
4743     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4744          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4745        ) {
4746         if (float32_is_signaling_nan(a, status)
4747          || float32_is_signaling_nan(b, status)) {
4748             float_raise(float_flag_invalid, status);
4749         }
4750         return 0;
4751     }
4752     return ( float32_val(a) == float32_val(b) ) ||
4753             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
4754 }
4755 
4756 /*----------------------------------------------------------------------------
4757 | Returns 1 if the single-precision floating-point value `a' is less than or
4758 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4759 | cause an exception.  Otherwise, the comparison is performed according to the
4760 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4761 *----------------------------------------------------------------------------*/
4762 
4763 int float32_le_quiet(float32 a, float32 b, float_status *status)
4764 {
4765     flag aSign, bSign;
4766     uint32_t av, bv;
4767     a = float32_squash_input_denormal(a, status);
4768     b = float32_squash_input_denormal(b, status);
4769 
4770     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4771          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4772        ) {
4773         if (float32_is_signaling_nan(a, status)
4774          || float32_is_signaling_nan(b, status)) {
4775             float_raise(float_flag_invalid, status);
4776         }
4777         return 0;
4778     }
4779     aSign = extractFloat32Sign( a );
4780     bSign = extractFloat32Sign( b );
4781     av = float32_val(a);
4782     bv = float32_val(b);
4783     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4784     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4785 
4786 }
4787 
4788 /*----------------------------------------------------------------------------
4789 | Returns 1 if the single-precision floating-point value `a' is less than
4790 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4791 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4792 | Standard for Binary Floating-Point Arithmetic.
4793 *----------------------------------------------------------------------------*/
4794 
4795 int float32_lt_quiet(float32 a, float32 b, float_status *status)
4796 {
4797     flag aSign, bSign;
4798     uint32_t av, bv;
4799     a = float32_squash_input_denormal(a, status);
4800     b = float32_squash_input_denormal(b, status);
4801 
4802     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4803          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4804        ) {
4805         if (float32_is_signaling_nan(a, status)
4806          || float32_is_signaling_nan(b, status)) {
4807             float_raise(float_flag_invalid, status);
4808         }
4809         return 0;
4810     }
4811     aSign = extractFloat32Sign( a );
4812     bSign = extractFloat32Sign( b );
4813     av = float32_val(a);
4814     bv = float32_val(b);
4815     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4816     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4817 
4818 }
4819 
4820 /*----------------------------------------------------------------------------
4821 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4822 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4823 | comparison is performed according to the IEC/IEEE Standard for Binary
4824 | Floating-Point Arithmetic.
4825 *----------------------------------------------------------------------------*/
4826 
4827 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
4828 {
4829     a = float32_squash_input_denormal(a, status);
4830     b = float32_squash_input_denormal(b, status);
4831 
4832     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4833          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4834        ) {
4835         if (float32_is_signaling_nan(a, status)
4836          || float32_is_signaling_nan(b, status)) {
4837             float_raise(float_flag_invalid, status);
4838         }
4839         return 1;
4840     }
4841     return 0;
4842 }
4843 
4844 /*----------------------------------------------------------------------------
4845 | If `a' is denormal and we are in flush-to-zero mode then set the
4846 | input-denormal exception and return zero. Otherwise just return the value.
4847 *----------------------------------------------------------------------------*/
4848 float16 float16_squash_input_denormal(float16 a, float_status *status)
4849 {
4850     if (status->flush_inputs_to_zero) {
4851         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
4852             float_raise(float_flag_input_denormal, status);
4853             return make_float16(float16_val(a) & 0x8000);
4854         }
4855     }
4856     return a;
4857 }
4858 
4859 /*----------------------------------------------------------------------------
4860 | Returns the result of converting the double-precision floating-point value
4861 | `a' to the extended double-precision floating-point format.  The conversion
4862 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4863 | Arithmetic.
4864 *----------------------------------------------------------------------------*/
4865 
4866 floatx80 float64_to_floatx80(float64 a, float_status *status)
4867 {
4868     flag aSign;
4869     int aExp;
4870     uint64_t aSig;
4871 
4872     a = float64_squash_input_denormal(a, status);
4873     aSig = extractFloat64Frac( a );
4874     aExp = extractFloat64Exp( a );
4875     aSign = extractFloat64Sign( a );
4876     if ( aExp == 0x7FF ) {
4877         if (aSig) {
4878             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4879         }
4880         return packFloatx80(aSign,
4881                             floatx80_infinity_high,
4882                             floatx80_infinity_low);
4883     }
4884     if ( aExp == 0 ) {
4885         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4886         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4887     }
4888     return
4889         packFloatx80(
4890             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4891 
4892 }
4893 
4894 /*----------------------------------------------------------------------------
4895 | Returns the result of converting the double-precision floating-point value
4896 | `a' to the quadruple-precision floating-point format.  The conversion is
4897 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4898 | Arithmetic.
4899 *----------------------------------------------------------------------------*/
4900 
4901 float128 float64_to_float128(float64 a, float_status *status)
4902 {
4903     flag aSign;
4904     int aExp;
4905     uint64_t aSig, zSig0, zSig1;
4906 
4907     a = float64_squash_input_denormal(a, status);
4908     aSig = extractFloat64Frac( a );
4909     aExp = extractFloat64Exp( a );
4910     aSign = extractFloat64Sign( a );
4911     if ( aExp == 0x7FF ) {
4912         if (aSig) {
4913             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4914         }
4915         return packFloat128( aSign, 0x7FFF, 0, 0 );
4916     }
4917     if ( aExp == 0 ) {
4918         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4919         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4920         --aExp;
4921     }
4922     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4923     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4924 
4925 }
4926 
4927 
4928 /*----------------------------------------------------------------------------
4929 | Returns the remainder of the double-precision floating-point value `a'
4930 | with respect to the corresponding value `b'.  The operation is performed
4931 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4932 *----------------------------------------------------------------------------*/
4933 
4934 float64 float64_rem(float64 a, float64 b, float_status *status)
4935 {
         /*
          * Computes the remainder of `a' with respect to `b' on the raw
          * significands.  A partial quotient is accumulated in q, and the
          * final step picks whichever of the last two candidate remainders
          * is smaller in magnitude (ties go to the even quotient).
          */
4936     flag aSign, zSign;
4937     int aExp, bExp, expDiff;
4938     uint64_t aSig, bSig;
4939     uint64_t q, alternateASig;
4940     int64_t sigMean;
4941 
4942     a = float64_squash_input_denormal(a, status);
4943     b = float64_squash_input_denormal(b, status);
4944     aSig = extractFloat64Frac( a );
4945     aExp = extractFloat64Exp( a );
4946     aSign = extractFloat64Sign( a );
4947     bSig = extractFloat64Frac( b );
4948     bExp = extractFloat64Exp( b );
4949     if ( aExp == 0x7FF ) {  /* a is infinity or NaN */
4950         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {  /* either operand is a NaN */
4951             return propagateFloat64NaN(a, b, status);
4952         }
             /* a is infinite and b is not a NaN: invalid operation */
4953         float_raise(float_flag_invalid, status);
4954         return float64_default_nan(status);
4955     }
4956     if ( bExp == 0x7FF ) {  /* b is infinity or NaN, a is finite */
4957         if (bSig) {
4958             return propagateFloat64NaN(a, b, status);
4959         }
4960         return a;  /* b is infinite: a is returned unchanged */
4961     }
4962     if ( bExp == 0 ) {  /* b is zero or subnormal */
4963         if ( bSig == 0 ) {  /* b is zero: invalid operation */
4964             float_raise(float_flag_invalid, status);
4965             return float64_default_nan(status);
4966         }
4967         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4968     }
4969     if ( aExp == 0 ) {  /* a is zero or subnormal */
4970         if ( aSig == 0 ) return a;  /* a is zero: returned unchanged */
4971         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4972     }
4973     expDiff = aExp - bExp;
         /* Set bit 52 (the implicit leading one) and move it up to bit 63. */
4974     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4975     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4976     if ( expDiff < 0 ) {
4977         if ( expDiff < -1 ) return a;  /* a is returned as its own remainder */
4978         aSig >>= 1;  /* expDiff == -1: align a one bit lower */
4979     }
4980     q = ( bSig <= aSig );  /* leading quotient bit */
4981     if ( q ) aSig -= bSig;
4982     expDiff -= 64;
         /* Reduce the exponent difference by 62 per pass. */
4983     while ( 0 < expDiff ) {
4984         q = estimateDiv128To64( aSig, 0, bSig );
             /*
              * Lower the estimate by 2, clamped at 0.
              * NOTE(review): presumably this keeps the estimate from
              * exceeding the exact quotient -- confirm against
              * estimateDiv128To64's contract.
              */
4985         q = ( 2 < q ) ? q - 2 : 0;
4986         aSig = - ( ( bSig>>2 ) * q );
4987         expDiff -= 62;
4988     }
4989     expDiff += 64;
4990     if ( 0 < expDiff ) {
4991         q = estimateDiv128To64( aSig, 0, bSig );
4992         q = ( 2 < q ) ? q - 2 : 0;
4993         q >>= 64 - expDiff;  /* keep only the top expDiff quotient bits */
4994         bSig >>= 2;
4995         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4996     }
4997     else {
4998         aSig >>= 2;
4999         bSig >>= 2;
5000     }
         /*
          * Keep subtracting bSig until aSig goes negative (as int64_t).
          * alternateASig holds the value before the final subtraction and
          * q counts every subtraction made.
          */
5001     do {
5002         alternateASig = aSig;
5003         ++q;
5004         aSig -= bSig;
5005     } while ( 0 <= (int64_t) aSig );
         /*
          * sigMean < 0 means the negative candidate has the larger
          * magnitude, so the non-negative one is kept; on an exact tie the
          * non-negative one is kept when q is odd.
          */
5006     sigMean = aSig + alternateASig;
5007     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5008         aSig = alternateASig;
5009     }
5010     zSign = ( (int64_t) aSig < 0 );
5011     if ( zSign ) aSig = - aSig;  /* take the magnitude; sign is carried by zSign */
5012     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
5013 
5014 }
5015 
5016 /*----------------------------------------------------------------------------
5017 | Returns the binary log of the double-precision floating-point value `a'.
5018 | The operation is performed according to the IEC/IEEE Standard for Binary
5019 | Floating-Point Arithmetic.
5020 *----------------------------------------------------------------------------*/
5021 float64 float64_log2(float64 a, float_status *status)
5022 {
5023     flag aSign, zSign;
5024     int aExp;
5025     uint64_t aSig, aSig0, aSig1, zSig, i;
5026     a = float64_squash_input_denormal(a, status);
5027 
5028     aSig = extractFloat64Frac( a );
5029     aExp = extractFloat64Exp( a );
5030     aSign = extractFloat64Sign( a );
5031 
5032     if ( aExp == 0 ) {
5033         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
5034         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5035     }
5036     if ( aSign ) {
5037         float_raise(float_flag_invalid, status);
5038         return float64_default_nan(status);
5039     }
5040     if ( aExp == 0x7FF ) {
5041         if (aSig) {
5042             return propagateFloat64NaN(a, float64_zero, status);
5043         }
5044         return a;
5045     }
5046 
5047     aExp -= 0x3FF;
5048     aSig |= LIT64( 0x0010000000000000 );
5049     zSign = aExp < 0;
5050     zSig = (uint64_t)aExp << 52;
5051     for (i = 1LL << 51; i > 0; i >>= 1) {
5052         mul64To128( aSig, aSig, &aSig0, &aSig1 );
5053         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
5054         if ( aSig & LIT64( 0x0020000000000000 ) ) {
5055             aSig >>= 1;
5056             zSig |= i;
5057         }
5058     }
5059 
5060     if ( zSign )
5061         zSig = -zSig;
5062     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5063 }
5064 
5065 /*----------------------------------------------------------------------------
5066 | Returns 1 if the double-precision floating-point value `a' is equal to the
5067 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
5068 | if either operand is a NaN.  Otherwise, the comparison is performed
5069 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5070 *----------------------------------------------------------------------------*/
5071 
5072 int float64_eq(float64 a, float64 b, float_status *status)
5073 {
5074     uint64_t av, bv;
5075     a = float64_squash_input_denormal(a, status);
5076     b = float64_squash_input_denormal(b, status);
5077 
5078     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5079          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5080        ) {
5081         float_raise(float_flag_invalid, status);
5082         return 0;
5083     }
5084     av = float64_val(a);
5085     bv = float64_val(b);
5086     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5087 
5088 }
5089 
5090 /*----------------------------------------------------------------------------
5091 | Returns 1 if the double-precision floating-point value `a' is less than or
5092 | equal to the corresponding value `b', and 0 otherwise.  The invalid
5093 | exception is raised if either operand is a NaN.  The comparison is performed
5094 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5095 *----------------------------------------------------------------------------*/
5096 
5097 int float64_le(float64 a, float64 b, float_status *status)
5098 {
5099     flag aSign, bSign;
5100     uint64_t av, bv;
5101     a = float64_squash_input_denormal(a, status);
5102     b = float64_squash_input_denormal(b, status);
5103 
5104     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5105          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5106        ) {
5107         float_raise(float_flag_invalid, status);
5108         return 0;
5109     }
5110     aSign = extractFloat64Sign( a );
5111     bSign = extractFloat64Sign( b );
5112     av = float64_val(a);
5113     bv = float64_val(b);
5114     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5115     return ( av == bv ) || ( aSign ^ ( av < bv ) );
5116 
5117 }
5118 
5119 /*----------------------------------------------------------------------------
5120 | Returns 1 if the double-precision floating-point value `a' is less than
5121 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5122 | raised if either operand is a NaN.  The comparison is performed according
5123 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5124 *----------------------------------------------------------------------------*/
5125 
5126 int float64_lt(float64 a, float64 b, float_status *status)
5127 {
5128     flag aSign, bSign;
5129     uint64_t av, bv;
5130 
5131     a = float64_squash_input_denormal(a, status);
5132     b = float64_squash_input_denormal(b, status);
5133     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5134          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5135        ) {
5136         float_raise(float_flag_invalid, status);
5137         return 0;
5138     }
5139     aSign = extractFloat64Sign( a );
5140     bSign = extractFloat64Sign( b );
5141     av = float64_val(a);
5142     bv = float64_val(b);
5143     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5144     return ( av != bv ) && ( aSign ^ ( av < bv ) );
5145 
5146 }
5147 
5148 /*----------------------------------------------------------------------------
5149 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5150 | be compared, and 0 otherwise.  The invalid exception is raised if either
5151 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
5152 | Standard for Binary Floating-Point Arithmetic.
5153 *----------------------------------------------------------------------------*/
5154 
5155 int float64_unordered(float64 a, float64 b, float_status *status)
5156 {
5157     a = float64_squash_input_denormal(a, status);
5158     b = float64_squash_input_denormal(b, status);
5159 
5160     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5161          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5162        ) {
5163         float_raise(float_flag_invalid, status);
5164         return 1;
5165     }
5166     return 0;
5167 }
5168 
5169 /*----------------------------------------------------------------------------
5170 | Returns 1 if the double-precision floating-point value `a' is equal to the
5171 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5172 | exception.The comparison is performed according to the IEC/IEEE Standard
5173 | for Binary Floating-Point Arithmetic.
5174 *----------------------------------------------------------------------------*/
5175 
5176 int float64_eq_quiet(float64 a, float64 b, float_status *status)
5177 {
5178     uint64_t av, bv;
5179     a = float64_squash_input_denormal(a, status);
5180     b = float64_squash_input_denormal(b, status);
5181 
5182     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5183          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5184        ) {
5185         if (float64_is_signaling_nan(a, status)
5186          || float64_is_signaling_nan(b, status)) {
5187             float_raise(float_flag_invalid, status);
5188         }
5189         return 0;
5190     }
5191     av = float64_val(a);
5192     bv = float64_val(b);
5193     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5194 
5195 }
5196 
5197 /*----------------------------------------------------------------------------
5198 | Returns 1 if the double-precision floating-point value `a' is less than or
5199 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5200 | cause an exception.  Otherwise, the comparison is performed according to the
5201 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5202 *----------------------------------------------------------------------------*/
5203 
5204 int float64_le_quiet(float64 a, float64 b, float_status *status)
5205 {
5206     flag aSign, bSign;
5207     uint64_t av, bv;
5208     a = float64_squash_input_denormal(a, status);
5209     b = float64_squash_input_denormal(b, status);
5210 
5211     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5212          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5213        ) {
5214         if (float64_is_signaling_nan(a, status)
5215          || float64_is_signaling_nan(b, status)) {
5216             float_raise(float_flag_invalid, status);
5217         }
5218         return 0;
5219     }
5220     aSign = extractFloat64Sign( a );
5221     bSign = extractFloat64Sign( b );
5222     av = float64_val(a);
5223     bv = float64_val(b);
5224     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5225     return ( av == bv ) || ( aSign ^ ( av < bv ) );
5226 
5227 }
5228 
5229 /*----------------------------------------------------------------------------
5230 | Returns 1 if the double-precision floating-point value `a' is less than
5231 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5232 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5233 | Standard for Binary Floating-Point Arithmetic.
5234 *----------------------------------------------------------------------------*/
5235 
5236 int float64_lt_quiet(float64 a, float64 b, float_status *status)
5237 {
5238     flag aSign, bSign;
5239     uint64_t av, bv;
5240     a = float64_squash_input_denormal(a, status);
5241     b = float64_squash_input_denormal(b, status);
5242 
5243     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5244          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5245        ) {
5246         if (float64_is_signaling_nan(a, status)
5247          || float64_is_signaling_nan(b, status)) {
5248             float_raise(float_flag_invalid, status);
5249         }
5250         return 0;
5251     }
5252     aSign = extractFloat64Sign( a );
5253     bSign = extractFloat64Sign( b );
5254     av = float64_val(a);
5255     bv = float64_val(b);
5256     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5257     return ( av != bv ) && ( aSign ^ ( av < bv ) );
5258 
5259 }
5260 
5261 /*----------------------------------------------------------------------------
5262 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5263 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
5264 | comparison is performed according to the IEC/IEEE Standard for Binary
5265 | Floating-Point Arithmetic.
5266 *----------------------------------------------------------------------------*/
5267 
5268 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
5269 {
5270     a = float64_squash_input_denormal(a, status);
5271     b = float64_squash_input_denormal(b, status);
5272 
5273     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5274          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5275        ) {
5276         if (float64_is_signaling_nan(a, status)
5277          || float64_is_signaling_nan(b, status)) {
5278             float_raise(float_flag_invalid, status);
5279         }
5280         return 1;
5281     }
5282     return 0;
5283 }
5284 
5285 /*----------------------------------------------------------------------------
5286 | Returns the result of converting the extended double-precision floating-
5287 | point value `a' to the 32-bit two's complement integer format.  The
5288 | conversion is performed according to the IEC/IEEE Standard for Binary
5289 | Floating-Point Arithmetic---which means in particular that the conversion
5290 | is rounded according to the current rounding mode.  If `a' is a NaN, the
5291 | largest positive integer is returned.  Otherwise, if the conversion
5292 | overflows, the largest integer with the same sign as `a' is returned.
5293 *----------------------------------------------------------------------------*/
5294 
5295 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5296 {
5297     flag aSign;
5298     int32_t aExp, shiftCount;
5299     uint64_t aSig;
5300 
5301     if (floatx80_invalid_encoding(a)) {
5302         float_raise(float_flag_invalid, status);
5303         return 1 << 31;
5304     }
5305     aSig = extractFloatx80Frac( a );
5306     aExp = extractFloatx80Exp( a );
5307     aSign = extractFloatx80Sign( a );
5308     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5309     shiftCount = 0x4037 - aExp;
5310     if ( shiftCount <= 0 ) shiftCount = 1;
5311     shift64RightJamming( aSig, shiftCount, &aSig );
5312     return roundAndPackInt32(aSign, aSig, status);
5313 
5314 }
5315 
5316 /*----------------------------------------------------------------------------
5317 | Returns the result of converting the extended double-precision floating-
5318 | point value `a' to the 32-bit two's complement integer format.  The
5319 | conversion is performed according to the IEC/IEEE Standard for Binary
5320 | Floating-Point Arithmetic, except that the conversion is always rounded
5321 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5322 | Otherwise, if the conversion overflows, the largest integer with the same
5323 | sign as `a' is returned.
5324 *----------------------------------------------------------------------------*/
5325 
5326 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5327 {
5328     flag aSign;
5329     int32_t aExp, shiftCount;
5330     uint64_t aSig, savedASig;
5331     int32_t z;
5332 
5333     if (floatx80_invalid_encoding(a)) {
5334         float_raise(float_flag_invalid, status);
5335         return 1 << 31;
5336     }
5337     aSig = extractFloatx80Frac( a );
5338     aExp = extractFloatx80Exp( a );
5339     aSign = extractFloatx80Sign( a );
5340     if ( 0x401E < aExp ) {
5341         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5342         goto invalid;
5343     }
5344     else if ( aExp < 0x3FFF ) {
5345         if (aExp || aSig) {
5346             status->float_exception_flags |= float_flag_inexact;
5347         }
5348         return 0;
5349     }
5350     shiftCount = 0x403E - aExp;
5351     savedASig = aSig;
5352     aSig >>= shiftCount;
5353     z = aSig;
5354     if ( aSign ) z = - z;
5355     if ( ( z < 0 ) ^ aSign ) {
5356  invalid:
5357         float_raise(float_flag_invalid, status);
5358         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5359     }
5360     if ( ( aSig<<shiftCount ) != savedASig ) {
5361         status->float_exception_flags |= float_flag_inexact;
5362     }
5363     return z;
5364 
5365 }
5366 
5367 /*----------------------------------------------------------------------------
5368 | Returns the result of converting the extended double-precision floating-
5369 | point value `a' to the 64-bit two's complement integer format.  The
5370 | conversion is performed according to the IEC/IEEE Standard for Binary
5371 | Floating-Point Arithmetic---which means in particular that the conversion
5372 | is rounded according to the current rounding mode.  If `a' is a NaN,
5373 | the largest positive integer is returned.  Otherwise, if the conversion
5374 | overflows, the largest integer with the same sign as `a' is returned.
5375 *----------------------------------------------------------------------------*/
5376 
5377 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5378 {
5379     flag aSign;
5380     int32_t aExp, shiftCount;
5381     uint64_t aSig, aSigExtra;
5382 
5383     if (floatx80_invalid_encoding(a)) {
5384         float_raise(float_flag_invalid, status);
5385         return 1ULL << 63;
5386     }
5387     aSig = extractFloatx80Frac( a );
5388     aExp = extractFloatx80Exp( a );
5389     aSign = extractFloatx80Sign( a );
5390     shiftCount = 0x403E - aExp;
5391     if ( shiftCount <= 0 ) {
5392         if ( shiftCount ) {
5393             float_raise(float_flag_invalid, status);
5394             if (!aSign || floatx80_is_any_nan(a)) {
5395                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5396             }
5397             return (int64_t) LIT64( 0x8000000000000000 );
5398         }
5399         aSigExtra = 0;
5400     }
5401     else {
5402         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5403     }
5404     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5405 
5406 }
5407 
5408 /*----------------------------------------------------------------------------
5409 | Returns the result of converting the extended double-precision floating-
5410 | point value `a' to the 64-bit two's complement integer format.  The
5411 | conversion is performed according to the IEC/IEEE Standard for Binary
5412 | Floating-Point Arithmetic, except that the conversion is always rounded
5413 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5414 | Otherwise, if the conversion overflows, the largest integer with the same
5415 | sign as `a' is returned.
5416 *----------------------------------------------------------------------------*/
5417 
5418 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5419 {
5420     flag aSign;
5421     int32_t aExp, shiftCount;
5422     uint64_t aSig;
5423     int64_t z;
5424 
5425     if (floatx80_invalid_encoding(a)) {
5426         float_raise(float_flag_invalid, status);
5427         return 1ULL << 63;
5428     }
5429     aSig = extractFloatx80Frac( a );
5430     aExp = extractFloatx80Exp( a );
5431     aSign = extractFloatx80Sign( a );
5432     shiftCount = aExp - 0x403E;
5433     if ( 0 <= shiftCount ) {
5434         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
5435         if ( ( a.high != 0xC03E ) || aSig ) {
5436             float_raise(float_flag_invalid, status);
5437             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5438                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5439             }
5440         }
5441         return (int64_t) LIT64( 0x8000000000000000 );
5442     }
5443     else if ( aExp < 0x3FFF ) {
5444         if (aExp | aSig) {
5445             status->float_exception_flags |= float_flag_inexact;
5446         }
5447         return 0;
5448     }
5449     z = aSig>>( - shiftCount );
5450     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5451         status->float_exception_flags |= float_flag_inexact;
5452     }
5453     if ( aSign ) z = - z;
5454     return z;
5455 
5456 }
5457 
5458 /*----------------------------------------------------------------------------
5459 | Returns the result of converting the extended double-precision floating-
5460 | point value `a' to the single-precision floating-point format.  The
5461 | conversion is performed according to the IEC/IEEE Standard for Binary
5462 | Floating-Point Arithmetic.
5463 *----------------------------------------------------------------------------*/
5464 
5465 float32 floatx80_to_float32(floatx80 a, float_status *status)
5466 {
5467     flag aSign;
5468     int32_t aExp;
5469     uint64_t aSig;
5470 
5471     if (floatx80_invalid_encoding(a)) {
5472         float_raise(float_flag_invalid, status);
5473         return float32_default_nan(status);
5474     }
5475     aSig = extractFloatx80Frac( a );
5476     aExp = extractFloatx80Exp( a );
5477     aSign = extractFloatx80Sign( a );
5478     if ( aExp == 0x7FFF ) {
5479         if ( (uint64_t) ( aSig<<1 ) ) {
5480             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5481         }
5482         return packFloat32( aSign, 0xFF, 0 );
5483     }
5484     shift64RightJamming( aSig, 33, &aSig );
5485     if ( aExp || aSig ) aExp -= 0x3F81;
5486     return roundAndPackFloat32(aSign, aExp, aSig, status);
5487 
5488 }
5489 
5490 /*----------------------------------------------------------------------------
5491 | Returns the result of converting the extended double-precision floating-
5492 | point value `a' to the double-precision floating-point format.  The
5493 | conversion is performed according to the IEC/IEEE Standard for Binary
5494 | Floating-Point Arithmetic.
5495 *----------------------------------------------------------------------------*/
5496 
5497 float64 floatx80_to_float64(floatx80 a, float_status *status)
5498 {
5499     flag aSign;
5500     int32_t aExp;
5501     uint64_t aSig, zSig;
5502 
5503     if (floatx80_invalid_encoding(a)) {
5504         float_raise(float_flag_invalid, status);
5505         return float64_default_nan(status);
5506     }
5507     aSig = extractFloatx80Frac( a );
5508     aExp = extractFloatx80Exp( a );
5509     aSign = extractFloatx80Sign( a );
5510     if ( aExp == 0x7FFF ) {
5511         if ( (uint64_t) ( aSig<<1 ) ) {
5512             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5513         }
5514         return packFloat64( aSign, 0x7FF, 0 );
5515     }
5516     shift64RightJamming( aSig, 1, &zSig );
5517     if ( aExp || aSig ) aExp -= 0x3C01;
5518     return roundAndPackFloat64(aSign, aExp, zSig, status);
5519 
5520 }
5521 
5522 /*----------------------------------------------------------------------------
5523 | Returns the result of converting the extended double-precision floating-
5524 | point value `a' to the quadruple-precision floating-point format.  The
5525 | conversion is performed according to the IEC/IEEE Standard for Binary
5526 | Floating-Point Arithmetic.
5527 *----------------------------------------------------------------------------*/
5528 
5529 float128 floatx80_to_float128(floatx80 a, float_status *status)
5530 {
5531     flag aSign;
5532     int aExp;
5533     uint64_t aSig, zSig0, zSig1;
5534 
5535     if (floatx80_invalid_encoding(a)) {
5536         float_raise(float_flag_invalid, status);
5537         return float128_default_nan(status);
5538     }
5539     aSig = extractFloatx80Frac( a );
5540     aExp = extractFloatx80Exp( a );
5541     aSign = extractFloatx80Sign( a );
5542     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5543         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5544     }
5545     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5546     return packFloat128( aSign, aExp, zSig0, zSig1 );
5547 
5548 }
5549 
5550 /*----------------------------------------------------------------------------
5551 | Rounds the extended double-precision floating-point value `a'
5552 | to the precision provided by floatx80_rounding_precision and returns the
5553 | result as an extended double-precision floating-point value.
5554 | The operation is performed according to the IEC/IEEE Standard for Binary
5555 | Floating-Point Arithmetic.
5556 *----------------------------------------------------------------------------*/
5557 
5558 floatx80 floatx80_round(floatx80 a, float_status *status)
5559 {
5560     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5561                                 extractFloatx80Sign(a),
5562                                 extractFloatx80Exp(a),
5563                                 extractFloatx80Frac(a), 0, status);
5564 }
5565 
5566 /*----------------------------------------------------------------------------
5567 | Rounds the extended double-precision floating-point value `a' to an integer,
5568 | and returns the result as an extended quadruple-precision floating-point
5569 | value.  The operation is performed according to the IEC/IEEE Standard for
5570 | Binary Floating-Point Arithmetic.
5571 *----------------------------------------------------------------------------*/
5572 
5573 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5574 {
5575     flag aSign;
5576     int32_t aExp;
5577     uint64_t lastBitMask, roundBitsMask;
5578     floatx80 z;
5579 
5580     if (floatx80_invalid_encoding(a)) {
5581         float_raise(float_flag_invalid, status);
5582         return floatx80_default_nan(status);
5583     }
5584     aExp = extractFloatx80Exp( a );
5585     if ( 0x403E <= aExp ) {
5586         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5587             return propagateFloatx80NaN(a, a, status);
5588         }
5589         return a;
5590     }
5591     if ( aExp < 0x3FFF ) {
5592         if (    ( aExp == 0 )
5593              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5594             return a;
5595         }
5596         status->float_exception_flags |= float_flag_inexact;
5597         aSign = extractFloatx80Sign( a );
5598         switch (status->float_rounding_mode) {
5599          case float_round_nearest_even:
5600             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5601                ) {
5602                 return
5603                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5604             }
5605             break;
5606         case float_round_ties_away:
5607             if (aExp == 0x3FFE) {
5608                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5609             }
5610             break;
5611          case float_round_down:
5612             return
5613                   aSign ?
5614                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5615                 : packFloatx80( 0, 0, 0 );
5616          case float_round_up:
5617             return
5618                   aSign ? packFloatx80( 1, 0, 0 )
5619                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5620         }
5621         return packFloatx80( aSign, 0, 0 );
5622     }
5623     lastBitMask = 1;
5624     lastBitMask <<= 0x403E - aExp;
5625     roundBitsMask = lastBitMask - 1;
5626     z = a;
5627     switch (status->float_rounding_mode) {
5628     case float_round_nearest_even:
5629         z.low += lastBitMask>>1;
5630         if ((z.low & roundBitsMask) == 0) {
5631             z.low &= ~lastBitMask;
5632         }
5633         break;
5634     case float_round_ties_away:
5635         z.low += lastBitMask >> 1;
5636         break;
5637     case float_round_to_zero:
5638         break;
5639     case float_round_up:
5640         if (!extractFloatx80Sign(z)) {
5641             z.low += roundBitsMask;
5642         }
5643         break;
5644     case float_round_down:
5645         if (extractFloatx80Sign(z)) {
5646             z.low += roundBitsMask;
5647         }
5648         break;
5649     default:
5650         abort();
5651     }
5652     z.low &= ~ roundBitsMask;
5653     if ( z.low == 0 ) {
5654         ++z.high;
5655         z.low = LIT64( 0x8000000000000000 );
5656     }
5657     if (z.low != a.low) {
5658         status->float_exception_flags |= float_flag_inexact;
5659     }
5660     return z;
5661 
5662 }
5663 
5664 /*----------------------------------------------------------------------------
5665 | Returns the result of adding the absolute values of the extended double-
5666 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5667 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5668 | The addition is performed according to the IEC/IEEE Standard for Binary
5669 | Floating-Point Arithmetic.
5670 *----------------------------------------------------------------------------*/
5671 
5672 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5673                                 float_status *status)
5674 {
5675     int32_t aExp, bExp, zExp;
5676     uint64_t aSig, bSig, zSig0, zSig1;
5677     int32_t expDiff;
5678 
5679     aSig = extractFloatx80Frac( a );
5680     aExp = extractFloatx80Exp( a );
5681     bSig = extractFloatx80Frac( b );
5682     bExp = extractFloatx80Exp( b );
5683     expDiff = aExp - bExp;
5684     if ( 0 < expDiff ) {
5685         if ( aExp == 0x7FFF ) {
5686             if ((uint64_t)(aSig << 1)) {
5687                 return propagateFloatx80NaN(a, b, status);
5688             }
5689             return a;
5690         }
5691         if ( bExp == 0 ) --expDiff;
5692         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5693         zExp = aExp;
5694     }
5695     else if ( expDiff < 0 ) {
5696         if ( bExp == 0x7FFF ) {
5697             if ((uint64_t)(bSig << 1)) {
5698                 return propagateFloatx80NaN(a, b, status);
5699             }
5700             return packFloatx80(zSign,
5701                                 floatx80_infinity_high,
5702                                 floatx80_infinity_low);
5703         }
5704         if ( aExp == 0 ) ++expDiff;
5705         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5706         zExp = bExp;
5707     }
5708     else {
5709         if ( aExp == 0x7FFF ) {
5710             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5711                 return propagateFloatx80NaN(a, b, status);
5712             }
5713             return a;
5714         }
5715         zSig1 = 0;
5716         zSig0 = aSig + bSig;
5717         if ( aExp == 0 ) {
5718             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5719             goto roundAndPack;
5720         }
5721         zExp = aExp;
5722         goto shiftRight1;
5723     }
5724     zSig0 = aSig + bSig;
5725     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5726  shiftRight1:
5727     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5728     zSig0 |= LIT64( 0x8000000000000000 );
5729     ++zExp;
5730  roundAndPack:
5731     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5732                                 zSign, zExp, zSig0, zSig1, status);
5733 }
5734 
5735 /*----------------------------------------------------------------------------
5736 | Returns the result of subtracting the absolute values of the extended
5737 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5738 | difference is negated before being returned.  `zSign' is ignored if the
5739 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5740 | Standard for Binary Floating-Point Arithmetic.
5741 *----------------------------------------------------------------------------*/
5742 
5743 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5744                                 float_status *status)
5745 {
5746     int32_t aExp, bExp, zExp;
5747     uint64_t aSig, bSig, zSig0, zSig1;
5748     int32_t expDiff;
5749 
5750     aSig = extractFloatx80Frac( a );
5751     aExp = extractFloatx80Exp( a );
5752     bSig = extractFloatx80Frac( b );
5753     bExp = extractFloatx80Exp( b );
5754     expDiff = aExp - bExp;
5755     if ( 0 < expDiff ) goto aExpBigger;
5756     if ( expDiff < 0 ) goto bExpBigger;
5757     if ( aExp == 0x7FFF ) {
5758         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5759             return propagateFloatx80NaN(a, b, status);
5760         }
5761         float_raise(float_flag_invalid, status);
5762         return floatx80_default_nan(status);
5763     }
5764     if ( aExp == 0 ) {
5765         aExp = 1;
5766         bExp = 1;
5767     }
5768     zSig1 = 0;
5769     if ( bSig < aSig ) goto aBigger;
5770     if ( aSig < bSig ) goto bBigger;
5771     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5772  bExpBigger:
5773     if ( bExp == 0x7FFF ) {
5774         if ((uint64_t)(bSig << 1)) {
5775             return propagateFloatx80NaN(a, b, status);
5776         }
5777         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5778                             floatx80_infinity_low);
5779     }
5780     if ( aExp == 0 ) ++expDiff;
5781     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5782  bBigger:
5783     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5784     zExp = bExp;
5785     zSign ^= 1;
5786     goto normalizeRoundAndPack;
5787  aExpBigger:
5788     if ( aExp == 0x7FFF ) {
5789         if ((uint64_t)(aSig << 1)) {
5790             return propagateFloatx80NaN(a, b, status);
5791         }
5792         return a;
5793     }
5794     if ( bExp == 0 ) --expDiff;
5795     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5796  aBigger:
5797     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5798     zExp = aExp;
5799  normalizeRoundAndPack:
5800     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5801                                          zSign, zExp, zSig0, zSig1, status);
5802 }
5803 
5804 /*----------------------------------------------------------------------------
5805 | Returns the result of adding the extended double-precision floating-point
5806 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5807 | Standard for Binary Floating-Point Arithmetic.
5808 *----------------------------------------------------------------------------*/
5809 
5810 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5811 {
5812     flag aSign, bSign;
5813 
5814     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5815         float_raise(float_flag_invalid, status);
5816         return floatx80_default_nan(status);
5817     }
5818     aSign = extractFloatx80Sign( a );
5819     bSign = extractFloatx80Sign( b );
5820     if ( aSign == bSign ) {
5821         return addFloatx80Sigs(a, b, aSign, status);
5822     }
5823     else {
5824         return subFloatx80Sigs(a, b, aSign, status);
5825     }
5826 
5827 }
5828 
5829 /*----------------------------------------------------------------------------
5830 | Returns the result of subtracting the extended double-precision floating-
5831 | point values `a' and `b'.  The operation is performed according to the
5832 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5833 *----------------------------------------------------------------------------*/
5834 
5835 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5836 {
5837     flag aSign, bSign;
5838 
5839     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5840         float_raise(float_flag_invalid, status);
5841         return floatx80_default_nan(status);
5842     }
5843     aSign = extractFloatx80Sign( a );
5844     bSign = extractFloatx80Sign( b );
5845     if ( aSign == bSign ) {
5846         return subFloatx80Sigs(a, b, aSign, status);
5847     }
5848     else {
5849         return addFloatx80Sigs(a, b, aSign, status);
5850     }
5851 
5852 }
5853 
5854 /*----------------------------------------------------------------------------
5855 | Returns the result of multiplying the extended double-precision floating-
5856 | point values `a' and `b'.  The operation is performed according to the
5857 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5858 *----------------------------------------------------------------------------*/
5859 
5860 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5861 {
5862     flag aSign, bSign, zSign;
5863     int32_t aExp, bExp, zExp;
5864     uint64_t aSig, bSig, zSig0, zSig1;
5865 
5866     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5867         float_raise(float_flag_invalid, status);
5868         return floatx80_default_nan(status);
5869     }
5870     aSig = extractFloatx80Frac( a );
5871     aExp = extractFloatx80Exp( a );
5872     aSign = extractFloatx80Sign( a );
5873     bSig = extractFloatx80Frac( b );
5874     bExp = extractFloatx80Exp( b );
5875     bSign = extractFloatx80Sign( b );
5876     zSign = aSign ^ bSign;
5877     if ( aExp == 0x7FFF ) {
5878         if (    (uint64_t) ( aSig<<1 )
5879              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5880             return propagateFloatx80NaN(a, b, status);
5881         }
5882         if ( ( bExp | bSig ) == 0 ) goto invalid;
5883         return packFloatx80(zSign, floatx80_infinity_high,
5884                                    floatx80_infinity_low);
5885     }
5886     if ( bExp == 0x7FFF ) {
5887         if ((uint64_t)(bSig << 1)) {
5888             return propagateFloatx80NaN(a, b, status);
5889         }
5890         if ( ( aExp | aSig ) == 0 ) {
5891  invalid:
5892             float_raise(float_flag_invalid, status);
5893             return floatx80_default_nan(status);
5894         }
5895         return packFloatx80(zSign, floatx80_infinity_high,
5896                                    floatx80_infinity_low);
5897     }
5898     if ( aExp == 0 ) {
5899         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5900         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5901     }
5902     if ( bExp == 0 ) {
5903         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5904         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5905     }
5906     zExp = aExp + bExp - 0x3FFE;
5907     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5908     if ( 0 < (int64_t) zSig0 ) {
5909         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5910         --zExp;
5911     }
5912     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5913                                 zSign, zExp, zSig0, zSig1, status);
5914 }
5915 
5916 /*----------------------------------------------------------------------------
5917 | Returns the result of dividing the extended double-precision floating-point
5918 | value `a' by the corresponding value `b'.  The operation is performed
5919 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5920 *----------------------------------------------------------------------------*/
5921 
5922 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5923 {
5924     flag aSign, bSign, zSign;
5925     int32_t aExp, bExp, zExp;
5926     uint64_t aSig, bSig, zSig0, zSig1;
5927     uint64_t rem0, rem1, rem2, term0, term1, term2;
5928 
5929     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5930         float_raise(float_flag_invalid, status);
5931         return floatx80_default_nan(status);
5932     }
5933     aSig = extractFloatx80Frac( a );
5934     aExp = extractFloatx80Exp( a );
5935     aSign = extractFloatx80Sign( a );
5936     bSig = extractFloatx80Frac( b );
5937     bExp = extractFloatx80Exp( b );
5938     bSign = extractFloatx80Sign( b );
5939     zSign = aSign ^ bSign;
5940     if ( aExp == 0x7FFF ) {
5941         if ((uint64_t)(aSig << 1)) {
5942             return propagateFloatx80NaN(a, b, status);
5943         }
5944         if ( bExp == 0x7FFF ) {
5945             if ((uint64_t)(bSig << 1)) {
5946                 return propagateFloatx80NaN(a, b, status);
5947             }
5948             goto invalid;
5949         }
5950         return packFloatx80(zSign, floatx80_infinity_high,
5951                                    floatx80_infinity_low);
5952     }
5953     if ( bExp == 0x7FFF ) {
5954         if ((uint64_t)(bSig << 1)) {
5955             return propagateFloatx80NaN(a, b, status);
5956         }
5957         return packFloatx80( zSign, 0, 0 );
5958     }
5959     if ( bExp == 0 ) {
5960         if ( bSig == 0 ) {
5961             if ( ( aExp | aSig ) == 0 ) {
5962  invalid:
5963                 float_raise(float_flag_invalid, status);
5964                 return floatx80_default_nan(status);
5965             }
5966             float_raise(float_flag_divbyzero, status);
5967             return packFloatx80(zSign, floatx80_infinity_high,
5968                                        floatx80_infinity_low);
5969         }
5970         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5971     }
5972     if ( aExp == 0 ) {
5973         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5974         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5975     }
5976     zExp = aExp - bExp + 0x3FFE;
5977     rem1 = 0;
5978     if ( bSig <= aSig ) {
5979         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5980         ++zExp;
5981     }
5982     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5983     mul64To128( bSig, zSig0, &term0, &term1 );
5984     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5985     while ( (int64_t) rem0 < 0 ) {
5986         --zSig0;
5987         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5988     }
5989     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5990     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5991         mul64To128( bSig, zSig1, &term1, &term2 );
5992         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5993         while ( (int64_t) rem1 < 0 ) {
5994             --zSig1;
5995             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5996         }
5997         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5998     }
5999     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6000                                 zSign, zExp, zSig0, zSig1, status);
6001 }
6002 
6003 /*----------------------------------------------------------------------------
6004 | Returns the remainder of the extended double-precision floating-point value
6005 | `a' with respect to the corresponding value `b'.  The operation is performed
6006 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6007 *----------------------------------------------------------------------------*/
6008 
6009 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6010 {
6011     flag aSign, zSign;
6012     int32_t aExp, bExp, expDiff;
6013     uint64_t aSig0, aSig1, bSig;
6014     uint64_t q, term0, term1, alternateASig0, alternateASig1;
6015 
6016     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6017         float_raise(float_flag_invalid, status);
6018         return floatx80_default_nan(status);
6019     }
6020     aSig0 = extractFloatx80Frac( a );
6021     aExp = extractFloatx80Exp( a );
6022     aSign = extractFloatx80Sign( a );
6023     bSig = extractFloatx80Frac( b );
6024     bExp = extractFloatx80Exp( b );
6025     if ( aExp == 0x7FFF ) {
6026         if (    (uint64_t) ( aSig0<<1 )
6027              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6028             return propagateFloatx80NaN(a, b, status);
6029         }
6030         goto invalid;
6031     }
6032     if ( bExp == 0x7FFF ) {
6033         if ((uint64_t)(bSig << 1)) {
6034             return propagateFloatx80NaN(a, b, status);
6035         }
6036         return a;
6037     }
6038     if ( bExp == 0 ) {
6039         if ( bSig == 0 ) {
6040  invalid:
6041             float_raise(float_flag_invalid, status);
6042             return floatx80_default_nan(status);
6043         }
6044         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6045     }
6046     if ( aExp == 0 ) {
6047         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
6048         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6049     }
6050     bSig |= LIT64( 0x8000000000000000 );
6051     zSign = aSign;
6052     expDiff = aExp - bExp;
6053     aSig1 = 0;
6054     if ( expDiff < 0 ) {
6055         if ( expDiff < -1 ) return a;
6056         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
6057         expDiff = 0;
6058     }
6059     q = ( bSig <= aSig0 );
6060     if ( q ) aSig0 -= bSig;
6061     expDiff -= 64;
6062     while ( 0 < expDiff ) {
6063         q = estimateDiv128To64( aSig0, aSig1, bSig );
6064         q = ( 2 < q ) ? q - 2 : 0;
6065         mul64To128( bSig, q, &term0, &term1 );
6066         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6067         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
6068         expDiff -= 62;
6069     }
6070     expDiff += 64;
6071     if ( 0 < expDiff ) {
6072         q = estimateDiv128To64( aSig0, aSig1, bSig );
6073         q = ( 2 < q ) ? q - 2 : 0;
6074         q >>= 64 - expDiff;
6075         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
6076         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6077         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
6078         while ( le128( term0, term1, aSig0, aSig1 ) ) {
6079             ++q;
6080             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6081         }
6082     }
6083     else {
6084         term1 = 0;
6085         term0 = bSig;
6086     }
6087     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
6088     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
6089          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
6090               && ( q & 1 ) )
6091        ) {
6092         aSig0 = alternateASig0;
6093         aSig1 = alternateASig1;
6094         zSign = ! zSign;
6095     }
6096     return
6097         normalizeRoundAndPackFloatx80(
6098             80, zSign, bExp + expDiff, aSig0, aSig1, status);
6099 
6100 }
6101 
6102 /*----------------------------------------------------------------------------
6103 | Returns the square root of the extended double-precision floating-point
6104 | value `a'.  The operation is performed according to the IEC/IEEE Standard
6105 | for Binary Floating-Point Arithmetic.
6106 *----------------------------------------------------------------------------*/
6107 
6108 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
6109 {
6110     flag aSign;
6111     int32_t aExp, zExp;
6112     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
6113     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6114 
6115     if (floatx80_invalid_encoding(a)) {
6116         float_raise(float_flag_invalid, status);
6117         return floatx80_default_nan(status);
6118     }
6119     aSig0 = extractFloatx80Frac( a );
6120     aExp = extractFloatx80Exp( a );
6121     aSign = extractFloatx80Sign( a );
6122     if ( aExp == 0x7FFF ) {
6123         if ((uint64_t)(aSig0 << 1)) {
6124             return propagateFloatx80NaN(a, a, status);
6125         }
6126         if ( ! aSign ) return a;
6127         goto invalid;
6128     }
6129     if ( aSign ) {
6130         if ( ( aExp | aSig0 ) == 0 ) return a;
6131  invalid:
6132         float_raise(float_flag_invalid, status);
6133         return floatx80_default_nan(status);
6134     }
6135     if ( aExp == 0 ) {
6136         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
6137         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6138     }
6139     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
6140     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
6141     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
6142     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6143     doubleZSig0 = zSig0<<1;
6144     mul64To128( zSig0, zSig0, &term0, &term1 );
6145     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6146     while ( (int64_t) rem0 < 0 ) {
6147         --zSig0;
6148         doubleZSig0 -= 2;
6149         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6150     }
6151     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6152     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
6153         if ( zSig1 == 0 ) zSig1 = 1;
6154         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6155         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6156         mul64To128( zSig1, zSig1, &term2, &term3 );
6157         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6158         while ( (int64_t) rem1 < 0 ) {
6159             --zSig1;
6160             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6161             term3 |= 1;
6162             term2 |= doubleZSig0;
6163             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6164         }
6165         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6166     }
6167     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
6168     zSig0 |= doubleZSig0;
6169     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6170                                 0, zExp, zSig0, zSig1, status);
6171 }
6172 
6173 /*----------------------------------------------------------------------------
6174 | Returns 1 if the extended double-precision floating-point value `a' is equal
6175 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
6176 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6177 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6178 *----------------------------------------------------------------------------*/
6179 
6180 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
6181 {
6182 
6183     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6184         || (extractFloatx80Exp(a) == 0x7FFF
6185             && (uint64_t) (extractFloatx80Frac(a) << 1))
6186         || (extractFloatx80Exp(b) == 0x7FFF
6187             && (uint64_t) (extractFloatx80Frac(b) << 1))
6188        ) {
6189         float_raise(float_flag_invalid, status);
6190         return 0;
6191     }
6192     return
6193            ( a.low == b.low )
6194         && (    ( a.high == b.high )
6195              || (    ( a.low == 0 )
6196                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6197            );
6198 
6199 }
6200 
6201 /*----------------------------------------------------------------------------
6202 | Returns 1 if the extended double-precision floating-point value `a' is
6203 | less than or equal to the corresponding value `b', and 0 otherwise.  The
6204 | invalid exception is raised if either operand is a NaN.  The comparison is
6205 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6206 | Arithmetic.
6207 *----------------------------------------------------------------------------*/
6208 
6209 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
6210 {
6211     flag aSign, bSign;
6212 
6213     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6214         || (extractFloatx80Exp(a) == 0x7FFF
6215             && (uint64_t) (extractFloatx80Frac(a) << 1))
6216         || (extractFloatx80Exp(b) == 0x7FFF
6217             && (uint64_t) (extractFloatx80Frac(b) << 1))
6218        ) {
6219         float_raise(float_flag_invalid, status);
6220         return 0;
6221     }
6222     aSign = extractFloatx80Sign( a );
6223     bSign = extractFloatx80Sign( b );
6224     if ( aSign != bSign ) {
6225         return
6226                aSign
6227             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6228                  == 0 );
6229     }
6230     return
6231           aSign ? le128( b.high, b.low, a.high, a.low )
6232         : le128( a.high, a.low, b.high, b.low );
6233 
6234 }
6235 
6236 /*----------------------------------------------------------------------------
6237 | Returns 1 if the extended double-precision floating-point value `a' is
6238 | less than the corresponding value `b', and 0 otherwise.  The invalid
6239 | exception is raised if either operand is a NaN.  The comparison is performed
6240 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6241 *----------------------------------------------------------------------------*/
6242 
6243 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
6244 {
6245     flag aSign, bSign;
6246 
6247     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6248         || (extractFloatx80Exp(a) == 0x7FFF
6249             && (uint64_t) (extractFloatx80Frac(a) << 1))
6250         || (extractFloatx80Exp(b) == 0x7FFF
6251             && (uint64_t) (extractFloatx80Frac(b) << 1))
6252        ) {
6253         float_raise(float_flag_invalid, status);
6254         return 0;
6255     }
6256     aSign = extractFloatx80Sign( a );
6257     bSign = extractFloatx80Sign( b );
6258     if ( aSign != bSign ) {
6259         return
6260                aSign
6261             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6262                  != 0 );
6263     }
6264     return
6265           aSign ? lt128( b.high, b.low, a.high, a.low )
6266         : lt128( a.high, a.low, b.high, b.low );
6267 
6268 }
6269 
6270 /*----------------------------------------------------------------------------
6271 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6272 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
6273 | either operand is a NaN.   The comparison is performed according to the
6274 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6275 *----------------------------------------------------------------------------*/
6276 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
6277 {
6278     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6279         || (extractFloatx80Exp(a) == 0x7FFF
6280             && (uint64_t) (extractFloatx80Frac(a) << 1))
6281         || (extractFloatx80Exp(b) == 0x7FFF
6282             && (uint64_t) (extractFloatx80Frac(b) << 1))
6283        ) {
6284         float_raise(float_flag_invalid, status);
6285         return 1;
6286     }
6287     return 0;
6288 }
6289 
6290 /*----------------------------------------------------------------------------
6291 | Returns 1 if the extended double-precision floating-point value `a' is
6292 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6293 | cause an exception.  The comparison is performed according to the IEC/IEEE
6294 | Standard for Binary Floating-Point Arithmetic.
6295 *----------------------------------------------------------------------------*/
6296 
6297 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
6298 {
6299 
6300     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6301         float_raise(float_flag_invalid, status);
6302         return 0;
6303     }
6304     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6305               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6306          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6307               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6308        ) {
6309         if (floatx80_is_signaling_nan(a, status)
6310          || floatx80_is_signaling_nan(b, status)) {
6311             float_raise(float_flag_invalid, status);
6312         }
6313         return 0;
6314     }
6315     return
6316            ( a.low == b.low )
6317         && (    ( a.high == b.high )
6318              || (    ( a.low == 0 )
6319                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6320            );
6321 
6322 }
6323 
6324 /*----------------------------------------------------------------------------
6325 | Returns 1 if the extended double-precision floating-point value `a' is less
6326 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
6327 | do not cause an exception.  Otherwise, the comparison is performed according
6328 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6329 *----------------------------------------------------------------------------*/
6330 
6331 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
6332 {
6333     flag aSign, bSign;
6334 
6335     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6336         float_raise(float_flag_invalid, status);
6337         return 0;
6338     }
6339     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6340               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6341          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6342               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6343        ) {
6344         if (floatx80_is_signaling_nan(a, status)
6345          || floatx80_is_signaling_nan(b, status)) {
6346             float_raise(float_flag_invalid, status);
6347         }
6348         return 0;
6349     }
6350     aSign = extractFloatx80Sign( a );
6351     bSign = extractFloatx80Sign( b );
6352     if ( aSign != bSign ) {
6353         return
6354                aSign
6355             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6356                  == 0 );
6357     }
6358     return
6359           aSign ? le128( b.high, b.low, a.high, a.low )
6360         : le128( a.high, a.low, b.high, b.low );
6361 
6362 }
6363 
6364 /*----------------------------------------------------------------------------
6365 | Returns 1 if the extended double-precision floating-point value `a' is less
6366 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
6367 | an exception.  Otherwise, the comparison is performed according to the
6368 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6369 *----------------------------------------------------------------------------*/
6370 
6371 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
6372 {
6373     flag aSign, bSign;
6374 
6375     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6376         float_raise(float_flag_invalid, status);
6377         return 0;
6378     }
6379     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6380               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6381          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6382               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6383        ) {
6384         if (floatx80_is_signaling_nan(a, status)
6385          || floatx80_is_signaling_nan(b, status)) {
6386             float_raise(float_flag_invalid, status);
6387         }
6388         return 0;
6389     }
6390     aSign = extractFloatx80Sign( a );
6391     bSign = extractFloatx80Sign( b );
6392     if ( aSign != bSign ) {
6393         return
6394                aSign
6395             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6396                  != 0 );
6397     }
6398     return
6399           aSign ? lt128( b.high, b.low, a.high, a.low )
6400         : lt128( a.high, a.low, b.high, b.low );
6401 
6402 }
6403 
6404 /*----------------------------------------------------------------------------
6405 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6406 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
6407 | The comparison is performed according to the IEC/IEEE Standard for Binary
6408 | Floating-Point Arithmetic.
6409 *----------------------------------------------------------------------------*/
6410 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
6411 {
6412     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6413         float_raise(float_flag_invalid, status);
6414         return 1;
6415     }
6416     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6417               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6418          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6419               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6420        ) {
6421         if (floatx80_is_signaling_nan(a, status)
6422          || floatx80_is_signaling_nan(b, status)) {
6423             float_raise(float_flag_invalid, status);
6424         }
6425         return 1;
6426     }
6427     return 0;
6428 }
6429 
6430 /*----------------------------------------------------------------------------
6431 | Returns the result of converting the quadruple-precision floating-point
6432 | value `a' to the 32-bit two's complement integer format.  The conversion
6433 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6434 | Arithmetic---which means in particular that the conversion is rounded
6435 | according to the current rounding mode.  If `a' is a NaN, the largest
6436 | positive integer is returned.  Otherwise, if the conversion overflows, the
6437 | largest integer with the same sign as `a' is returned.
6438 *----------------------------------------------------------------------------*/
6439 
6440 int32_t float128_to_int32(float128 a, float_status *status)
6441 {
6442     flag aSign;
6443     int32_t aExp, shiftCount;
6444     uint64_t aSig0, aSig1;
6445 
6446     aSig1 = extractFloat128Frac1( a );
6447     aSig0 = extractFloat128Frac0( a );
6448     aExp = extractFloat128Exp( a );
6449     aSign = extractFloat128Sign( a );
6450     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6451     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6452     aSig0 |= ( aSig1 != 0 );
6453     shiftCount = 0x4028 - aExp;
6454     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6455     return roundAndPackInt32(aSign, aSig0, status);
6456 
6457 }
6458 
6459 /*----------------------------------------------------------------------------
6460 | Returns the result of converting the quadruple-precision floating-point
6461 | value `a' to the 32-bit two's complement integer format.  The conversion
6462 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6463 | Arithmetic, except that the conversion is always rounded toward zero.  If
6464 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6465 | conversion overflows, the largest integer with the same sign as `a' is
6466 | returned.
6467 *----------------------------------------------------------------------------*/
6468 
6469 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6470 {
6471     flag aSign;
6472     int32_t aExp, shiftCount;
6473     uint64_t aSig0, aSig1, savedASig;
6474     int32_t z;
6475 
6476     aSig1 = extractFloat128Frac1( a );
6477     aSig0 = extractFloat128Frac0( a );
6478     aExp = extractFloat128Exp( a );
6479     aSign = extractFloat128Sign( a );
6480     aSig0 |= ( aSig1 != 0 );
6481     if ( 0x401E < aExp ) {
6482         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6483         goto invalid;
6484     }
6485     else if ( aExp < 0x3FFF ) {
6486         if (aExp || aSig0) {
6487             status->float_exception_flags |= float_flag_inexact;
6488         }
6489         return 0;
6490     }
6491     aSig0 |= LIT64( 0x0001000000000000 );
6492     shiftCount = 0x402F - aExp;
6493     savedASig = aSig0;
6494     aSig0 >>= shiftCount;
6495     z = aSig0;
6496     if ( aSign ) z = - z;
6497     if ( ( z < 0 ) ^ aSign ) {
6498  invalid:
6499         float_raise(float_flag_invalid, status);
6500         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
6501     }
6502     if ( ( aSig0<<shiftCount ) != savedASig ) {
6503         status->float_exception_flags |= float_flag_inexact;
6504     }
6505     return z;
6506 
6507 }
6508 
6509 /*----------------------------------------------------------------------------
6510 | Returns the result of converting the quadruple-precision floating-point
6511 | value `a' to the 64-bit two's complement integer format.  The conversion
6512 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6513 | Arithmetic---which means in particular that the conversion is rounded
6514 | according to the current rounding mode.  If `a' is a NaN, the largest
6515 | positive integer is returned.  Otherwise, if the conversion overflows, the
6516 | largest integer with the same sign as `a' is returned.
6517 *----------------------------------------------------------------------------*/
6518 
6519 int64_t float128_to_int64(float128 a, float_status *status)
6520 {
6521     flag aSign;
6522     int32_t aExp, shiftCount;
6523     uint64_t aSig0, aSig1;
6524 
6525     aSig1 = extractFloat128Frac1( a );
6526     aSig0 = extractFloat128Frac0( a );
6527     aExp = extractFloat128Exp( a );
6528     aSign = extractFloat128Sign( a );
6529     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6530     shiftCount = 0x402F - aExp;
6531     if ( shiftCount <= 0 ) {
6532         if ( 0x403E < aExp ) {
6533             float_raise(float_flag_invalid, status);
6534             if (    ! aSign
6535                  || (    ( aExp == 0x7FFF )
6536                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
6537                     )
6538                ) {
6539                 return LIT64( 0x7FFFFFFFFFFFFFFF );
6540             }
6541             return (int64_t) LIT64( 0x8000000000000000 );
6542         }
6543         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6544     }
6545     else {
6546         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6547     }
6548     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6549 
6550 }
6551 
6552 /*----------------------------------------------------------------------------
6553 | Returns the result of converting the quadruple-precision floating-point
6554 | value `a' to the 64-bit two's complement integer format.  The conversion
6555 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6556 | Arithmetic, except that the conversion is always rounded toward zero.
6557 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6558 | the conversion overflows, the largest integer with the same sign as `a' is
6559 | returned.
6560 *----------------------------------------------------------------------------*/
6561 
6562 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6563 {
6564     flag aSign;
6565     int32_t aExp, shiftCount;
6566     uint64_t aSig0, aSig1;
6567     int64_t z;
6568 
6569     aSig1 = extractFloat128Frac1( a );
6570     aSig0 = extractFloat128Frac0( a );
6571     aExp = extractFloat128Exp( a );
6572     aSign = extractFloat128Sign( a );
6573     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6574     shiftCount = aExp - 0x402F;
6575     if ( 0 < shiftCount ) {
6576         if ( 0x403E <= aExp ) {
6577             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
6578             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
6579                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
6580                 if (aSig1) {
6581                     status->float_exception_flags |= float_flag_inexact;
6582                 }
6583             }
6584             else {
6585                 float_raise(float_flag_invalid, status);
6586                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6587                     return LIT64( 0x7FFFFFFFFFFFFFFF );
6588                 }
6589             }
6590             return (int64_t) LIT64( 0x8000000000000000 );
6591         }
6592         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6593         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6594             status->float_exception_flags |= float_flag_inexact;
6595         }
6596     }
6597     else {
6598         if ( aExp < 0x3FFF ) {
6599             if ( aExp | aSig0 | aSig1 ) {
6600                 status->float_exception_flags |= float_flag_inexact;
6601             }
6602             return 0;
6603         }
6604         z = aSig0>>( - shiftCount );
6605         if (    aSig1
6606              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6607             status->float_exception_flags |= float_flag_inexact;
6608         }
6609     }
6610     if ( aSign ) z = - z;
6611     return z;
6612 
6613 }
6614 
/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point value
| `a' to the 64-bit unsigned integer format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic---which means in particular that the conversion is rounded
| according to the current rounding mode.  If `a' is a NaN, the largest
| unsigned integer is returned.  If the conversion overflows, the
| largest unsigned integer is returned.  If `a' is negative with a magnitude
| of at least 1, the invalid exception is raised and zero is returned;
| negative values of smaller magnitude are rounded by roundAndPackUint64().
*----------------------------------------------------------------------------*/
6626 
6627 uint64_t float128_to_uint64(float128 a, float_status *status)
6628 {
6629     flag aSign;
6630     int aExp;
6631     int shiftCount;
6632     uint64_t aSig0, aSig1;
6633 
6634     aSig0 = extractFloat128Frac0(a);
6635     aSig1 = extractFloat128Frac1(a);
6636     aExp = extractFloat128Exp(a);
6637     aSign = extractFloat128Sign(a);
6638     if (aSign && (aExp > 0x3FFE)) {
6639         float_raise(float_flag_invalid, status);
6640         if (float128_is_any_nan(a)) {
6641             return LIT64(0xFFFFFFFFFFFFFFFF);
6642         } else {
6643             return 0;
6644         }
6645     }
6646     if (aExp) {
6647         aSig0 |= LIT64(0x0001000000000000);
6648     }
6649     shiftCount = 0x402F - aExp;
6650     if (shiftCount <= 0) {
6651         if (0x403E < aExp) {
6652             float_raise(float_flag_invalid, status);
6653             return LIT64(0xFFFFFFFFFFFFFFFF);
6654         }
6655         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6656     } else {
6657         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6658     }
6659     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6660 }
6661 
6662 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6663 {
6664     uint64_t v;
6665     signed char current_rounding_mode = status->float_rounding_mode;
6666 
6667     set_float_rounding_mode(float_round_to_zero, status);
6668     v = float128_to_uint64(a, status);
6669     set_float_rounding_mode(current_rounding_mode, status);
6670 
6671     return v;
6672 }
6673 
/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 32-bit unsigned integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic except that the conversion is always rounded toward zero.
| If `a' is a NaN, the largest unsigned integer is returned.  Otherwise,
| if the conversion overflows, the largest unsigned integer is returned.
| If `a' is negative, the value is rounded and zero is returned; negative
| values that do not round to zero will raise the invalid exception.
*----------------------------------------------------------------------------*/
6684 
6685 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6686 {
6687     uint64_t v;
6688     uint32_t res;
6689     int old_exc_flags = get_float_exception_flags(status);
6690 
6691     v = float128_to_uint64_round_to_zero(a, status);
6692     if (v > 0xffffffff) {
6693         res = 0xffffffff;
6694     } else {
6695         return v;
6696     }
6697     set_float_exception_flags(old_exc_flags, status);
6698     float_raise(float_flag_invalid, status);
6699     return res;
6700 }
6701 
6702 /*----------------------------------------------------------------------------
6703 | Returns the result of converting the quadruple-precision floating-point
6704 | value `a' to the single-precision floating-point format.  The conversion
6705 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6706 | Arithmetic.
6707 *----------------------------------------------------------------------------*/
6708 
6709 float32 float128_to_float32(float128 a, float_status *status)
6710 {
6711     flag aSign;
6712     int32_t aExp;
6713     uint64_t aSig0, aSig1;
6714     uint32_t zSig;
6715 
6716     aSig1 = extractFloat128Frac1( a );
6717     aSig0 = extractFloat128Frac0( a );
6718     aExp = extractFloat128Exp( a );
6719     aSign = extractFloat128Sign( a );
6720     if ( aExp == 0x7FFF ) {
6721         if ( aSig0 | aSig1 ) {
6722             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6723         }
6724         return packFloat32( aSign, 0xFF, 0 );
6725     }
6726     aSig0 |= ( aSig1 != 0 );
6727     shift64RightJamming( aSig0, 18, &aSig0 );
6728     zSig = aSig0;
6729     if ( aExp || zSig ) {
6730         zSig |= 0x40000000;
6731         aExp -= 0x3F81;
6732     }
6733     return roundAndPackFloat32(aSign, aExp, zSig, status);
6734 
6735 }
6736 
6737 /*----------------------------------------------------------------------------
6738 | Returns the result of converting the quadruple-precision floating-point
6739 | value `a' to the double-precision floating-point format.  The conversion
6740 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6741 | Arithmetic.
6742 *----------------------------------------------------------------------------*/
6743 
6744 float64 float128_to_float64(float128 a, float_status *status)
6745 {
6746     flag aSign;
6747     int32_t aExp;
6748     uint64_t aSig0, aSig1;
6749 
6750     aSig1 = extractFloat128Frac1( a );
6751     aSig0 = extractFloat128Frac0( a );
6752     aExp = extractFloat128Exp( a );
6753     aSign = extractFloat128Sign( a );
6754     if ( aExp == 0x7FFF ) {
6755         if ( aSig0 | aSig1 ) {
6756             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6757         }
6758         return packFloat64( aSign, 0x7FF, 0 );
6759     }
6760     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6761     aSig0 |= ( aSig1 != 0 );
6762     if ( aExp || aSig0 ) {
6763         aSig0 |= LIT64( 0x4000000000000000 );
6764         aExp -= 0x3C01;
6765     }
6766     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6767 
6768 }
6769 
6770 /*----------------------------------------------------------------------------
6771 | Returns the result of converting the quadruple-precision floating-point
6772 | value `a' to the extended double-precision floating-point format.  The
6773 | conversion is performed according to the IEC/IEEE Standard for Binary
6774 | Floating-Point Arithmetic.
6775 *----------------------------------------------------------------------------*/
6776 
6777 floatx80 float128_to_floatx80(float128 a, float_status *status)
6778 {
6779     flag aSign;
6780     int32_t aExp;
6781     uint64_t aSig0, aSig1;
6782 
6783     aSig1 = extractFloat128Frac1( a );
6784     aSig0 = extractFloat128Frac0( a );
6785     aExp = extractFloat128Exp( a );
6786     aSign = extractFloat128Sign( a );
6787     if ( aExp == 0x7FFF ) {
6788         if ( aSig0 | aSig1 ) {
6789             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6790         }
6791         return packFloatx80(aSign, floatx80_infinity_high,
6792                                    floatx80_infinity_low);
6793     }
6794     if ( aExp == 0 ) {
6795         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6796         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6797     }
6798     else {
6799         aSig0 |= LIT64( 0x0001000000000000 );
6800     }
6801     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6802     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6803 
6804 }
6805 
6806 /*----------------------------------------------------------------------------
6807 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6808 | returns the result as a quadruple-precision floating-point value.  The
6809 | operation is performed according to the IEC/IEEE Standard for Binary
6810 | Floating-Point Arithmetic.
6811 *----------------------------------------------------------------------------*/
6812 
6813 float128 float128_round_to_int(float128 a, float_status *status)
6814 {
6815     flag aSign;
6816     int32_t aExp;
6817     uint64_t lastBitMask, roundBitsMask;
6818     float128 z;
6819 
6820     aExp = extractFloat128Exp( a );
6821     if ( 0x402F <= aExp ) {
6822         if ( 0x406F <= aExp ) {
6823             if (    ( aExp == 0x7FFF )
6824                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6825                ) {
6826                 return propagateFloat128NaN(a, a, status);
6827             }
6828             return a;
6829         }
6830         lastBitMask = 1;
6831         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6832         roundBitsMask = lastBitMask - 1;
6833         z = a;
6834         switch (status->float_rounding_mode) {
6835         case float_round_nearest_even:
6836             if ( lastBitMask ) {
6837                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6838                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6839             }
6840             else {
6841                 if ( (int64_t) z.low < 0 ) {
6842                     ++z.high;
6843                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6844                 }
6845             }
6846             break;
6847         case float_round_ties_away:
6848             if (lastBitMask) {
6849                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6850             } else {
6851                 if ((int64_t) z.low < 0) {
6852                     ++z.high;
6853                 }
6854             }
6855             break;
6856         case float_round_to_zero:
6857             break;
6858         case float_round_up:
6859             if (!extractFloat128Sign(z)) {
6860                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6861             }
6862             break;
6863         case float_round_down:
6864             if (extractFloat128Sign(z)) {
6865                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6866             }
6867             break;
6868         default:
6869             abort();
6870         }
6871         z.low &= ~ roundBitsMask;
6872     }
6873     else {
6874         if ( aExp < 0x3FFF ) {
6875             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6876             status->float_exception_flags |= float_flag_inexact;
6877             aSign = extractFloat128Sign( a );
6878             switch (status->float_rounding_mode) {
6879              case float_round_nearest_even:
6880                 if (    ( aExp == 0x3FFE )
6881                      && (   extractFloat128Frac0( a )
6882                           | extractFloat128Frac1( a ) )
6883                    ) {
6884                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6885                 }
6886                 break;
6887             case float_round_ties_away:
6888                 if (aExp == 0x3FFE) {
6889                     return packFloat128(aSign, 0x3FFF, 0, 0);
6890                 }
6891                 break;
6892              case float_round_down:
6893                 return
6894                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6895                     : packFloat128( 0, 0, 0, 0 );
6896              case float_round_up:
6897                 return
6898                       aSign ? packFloat128( 1, 0, 0, 0 )
6899                     : packFloat128( 0, 0x3FFF, 0, 0 );
6900             }
6901             return packFloat128( aSign, 0, 0, 0 );
6902         }
6903         lastBitMask = 1;
6904         lastBitMask <<= 0x402F - aExp;
6905         roundBitsMask = lastBitMask - 1;
6906         z.low = 0;
6907         z.high = a.high;
6908         switch (status->float_rounding_mode) {
6909         case float_round_nearest_even:
6910             z.high += lastBitMask>>1;
6911             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6912                 z.high &= ~ lastBitMask;
6913             }
6914             break;
6915         case float_round_ties_away:
6916             z.high += lastBitMask>>1;
6917             break;
6918         case float_round_to_zero:
6919             break;
6920         case float_round_up:
6921             if (!extractFloat128Sign(z)) {
6922                 z.high |= ( a.low != 0 );
6923                 z.high += roundBitsMask;
6924             }
6925             break;
6926         case float_round_down:
6927             if (extractFloat128Sign(z)) {
6928                 z.high |= (a.low != 0);
6929                 z.high += roundBitsMask;
6930             }
6931             break;
6932         default:
6933             abort();
6934         }
6935         z.high &= ~ roundBitsMask;
6936     }
6937     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6938         status->float_exception_flags |= float_flag_inexact;
6939     }
6940     return z;
6941 
6942 }
6943 
6944 /*----------------------------------------------------------------------------
6945 | Returns the result of adding the absolute values of the quadruple-precision
6946 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6947 | before being returned.  `zSign' is ignored if the result is a NaN.
6948 | The addition is performed according to the IEC/IEEE Standard for Binary
6949 | Floating-Point Arithmetic.
6950 *----------------------------------------------------------------------------*/
6951 
6952 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6953                                 float_status *status)
6954 {
6955     int32_t aExp, bExp, zExp;
6956     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6957     int32_t expDiff;
6958 
6959     aSig1 = extractFloat128Frac1( a );
6960     aSig0 = extractFloat128Frac0( a );
6961     aExp = extractFloat128Exp( a );
6962     bSig1 = extractFloat128Frac1( b );
6963     bSig0 = extractFloat128Frac0( b );
6964     bExp = extractFloat128Exp( b );
6965     expDiff = aExp - bExp;
6966     if ( 0 < expDiff ) {
6967         if ( aExp == 0x7FFF ) {
6968             if (aSig0 | aSig1) {
6969                 return propagateFloat128NaN(a, b, status);
6970             }
6971             return a;
6972         }
6973         if ( bExp == 0 ) {
6974             --expDiff;
6975         }
6976         else {
6977             bSig0 |= LIT64( 0x0001000000000000 );
6978         }
6979         shift128ExtraRightJamming(
6980             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6981         zExp = aExp;
6982     }
6983     else if ( expDiff < 0 ) {
6984         if ( bExp == 0x7FFF ) {
6985             if (bSig0 | bSig1) {
6986                 return propagateFloat128NaN(a, b, status);
6987             }
6988             return packFloat128( zSign, 0x7FFF, 0, 0 );
6989         }
6990         if ( aExp == 0 ) {
6991             ++expDiff;
6992         }
6993         else {
6994             aSig0 |= LIT64( 0x0001000000000000 );
6995         }
6996         shift128ExtraRightJamming(
6997             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6998         zExp = bExp;
6999     }
7000     else {
7001         if ( aExp == 0x7FFF ) {
7002             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7003                 return propagateFloat128NaN(a, b, status);
7004             }
7005             return a;
7006         }
7007         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7008         if ( aExp == 0 ) {
7009             if (status->flush_to_zero) {
7010                 if (zSig0 | zSig1) {
7011                     float_raise(float_flag_output_denormal, status);
7012                 }
7013                 return packFloat128(zSign, 0, 0, 0);
7014             }
7015             return packFloat128( zSign, 0, zSig0, zSig1 );
7016         }
7017         zSig2 = 0;
7018         zSig0 |= LIT64( 0x0002000000000000 );
7019         zExp = aExp;
7020         goto shiftRight1;
7021     }
7022     aSig0 |= LIT64( 0x0001000000000000 );
7023     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7024     --zExp;
7025     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
7026     ++zExp;
7027  shiftRight1:
7028     shift128ExtraRightJamming(
7029         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7030  roundAndPack:
7031     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7032 
7033 }
7034 
7035 /*----------------------------------------------------------------------------
7036 | Returns the result of subtracting the absolute values of the quadruple-
7037 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
7038 | difference is negated before being returned.  `zSign' is ignored if the
7039 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
7040 | Standard for Binary Floating-Point Arithmetic.
7041 *----------------------------------------------------------------------------*/
7042 
7043 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
7044                                 float_status *status)
7045 {
7046     int32_t aExp, bExp, zExp;
7047     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
7048     int32_t expDiff;
7049 
7050     aSig1 = extractFloat128Frac1( a );
7051     aSig0 = extractFloat128Frac0( a );
7052     aExp = extractFloat128Exp( a );
7053     bSig1 = extractFloat128Frac1( b );
7054     bSig0 = extractFloat128Frac0( b );
7055     bExp = extractFloat128Exp( b );
7056     expDiff = aExp - bExp;
7057     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
7058     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
7059     if ( 0 < expDiff ) goto aExpBigger;
7060     if ( expDiff < 0 ) goto bExpBigger;
7061     if ( aExp == 0x7FFF ) {
7062         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7063             return propagateFloat128NaN(a, b, status);
7064         }
7065         float_raise(float_flag_invalid, status);
7066         return float128_default_nan(status);
7067     }
7068     if ( aExp == 0 ) {
7069         aExp = 1;
7070         bExp = 1;
7071     }
7072     if ( bSig0 < aSig0 ) goto aBigger;
7073     if ( aSig0 < bSig0 ) goto bBigger;
7074     if ( bSig1 < aSig1 ) goto aBigger;
7075     if ( aSig1 < bSig1 ) goto bBigger;
7076     return packFloat128(status->float_rounding_mode == float_round_down,
7077                         0, 0, 0);
7078  bExpBigger:
7079     if ( bExp == 0x7FFF ) {
7080         if (bSig0 | bSig1) {
7081             return propagateFloat128NaN(a, b, status);
7082         }
7083         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
7084     }
7085     if ( aExp == 0 ) {
7086         ++expDiff;
7087     }
7088     else {
7089         aSig0 |= LIT64( 0x4000000000000000 );
7090     }
7091     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7092     bSig0 |= LIT64( 0x4000000000000000 );
7093  bBigger:
7094     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
7095     zExp = bExp;
7096     zSign ^= 1;
7097     goto normalizeRoundAndPack;
7098  aExpBigger:
7099     if ( aExp == 0x7FFF ) {
7100         if (aSig0 | aSig1) {
7101             return propagateFloat128NaN(a, b, status);
7102         }
7103         return a;
7104     }
7105     if ( bExp == 0 ) {
7106         --expDiff;
7107     }
7108     else {
7109         bSig0 |= LIT64( 0x4000000000000000 );
7110     }
7111     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
7112     aSig0 |= LIT64( 0x4000000000000000 );
7113  aBigger:
7114     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7115     zExp = aExp;
7116  normalizeRoundAndPack:
7117     --zExp;
7118     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
7119                                          status);
7120 
7121 }
7122 
7123 /*----------------------------------------------------------------------------
7124 | Returns the result of adding the quadruple-precision floating-point values
7125 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
7126 | for Binary Floating-Point Arithmetic.
7127 *----------------------------------------------------------------------------*/
7128 
7129 float128 float128_add(float128 a, float128 b, float_status *status)
7130 {
7131     flag aSign, bSign;
7132 
7133     aSign = extractFloat128Sign( a );
7134     bSign = extractFloat128Sign( b );
7135     if ( aSign == bSign ) {
7136         return addFloat128Sigs(a, b, aSign, status);
7137     }
7138     else {
7139         return subFloat128Sigs(a, b, aSign, status);
7140     }
7141 
7142 }
7143 
7144 /*----------------------------------------------------------------------------
7145 | Returns the result of subtracting the quadruple-precision floating-point
7146 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7147 | Standard for Binary Floating-Point Arithmetic.
7148 *----------------------------------------------------------------------------*/
7149 
7150 float128 float128_sub(float128 a, float128 b, float_status *status)
7151 {
7152     flag aSign, bSign;
7153 
7154     aSign = extractFloat128Sign( a );
7155     bSign = extractFloat128Sign( b );
7156     if ( aSign == bSign ) {
7157         return subFloat128Sigs(a, b, aSign, status);
7158     }
7159     else {
7160         return addFloat128Sigs(a, b, aSign, status);
7161     }
7162 
7163 }
7164 
7165 /*----------------------------------------------------------------------------
7166 | Returns the result of multiplying the quadruple-precision floating-point
7167 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7168 | Standard for Binary Floating-Point Arithmetic.
7169 *----------------------------------------------------------------------------*/
7170 
7171 float128 float128_mul(float128 a, float128 b, float_status *status)
7172 {
7173     flag aSign, bSign, zSign;
7174     int32_t aExp, bExp, zExp;
7175     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
7176 
7177     aSig1 = extractFloat128Frac1( a );
7178     aSig0 = extractFloat128Frac0( a );
7179     aExp = extractFloat128Exp( a );
7180     aSign = extractFloat128Sign( a );
7181     bSig1 = extractFloat128Frac1( b );
7182     bSig0 = extractFloat128Frac0( b );
7183     bExp = extractFloat128Exp( b );
7184     bSign = extractFloat128Sign( b );
7185     zSign = aSign ^ bSign;
7186     if ( aExp == 0x7FFF ) {
7187         if (    ( aSig0 | aSig1 )
7188              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7189             return propagateFloat128NaN(a, b, status);
7190         }
7191         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
7192         return packFloat128( zSign, 0x7FFF, 0, 0 );
7193     }
7194     if ( bExp == 0x7FFF ) {
7195         if (bSig0 | bSig1) {
7196             return propagateFloat128NaN(a, b, status);
7197         }
7198         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7199  invalid:
7200             float_raise(float_flag_invalid, status);
7201             return float128_default_nan(status);
7202         }
7203         return packFloat128( zSign, 0x7FFF, 0, 0 );
7204     }
7205     if ( aExp == 0 ) {
7206         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7207         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7208     }
7209     if ( bExp == 0 ) {
7210         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7211         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7212     }
7213     zExp = aExp + bExp - 0x4000;
7214     aSig0 |= LIT64( 0x0001000000000000 );
7215     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
7216     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
7217     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
7218     zSig2 |= ( zSig3 != 0 );
7219     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
7220         shift128ExtraRightJamming(
7221             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7222         ++zExp;
7223     }
7224     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7225 
7226 }
7227 
7228 /*----------------------------------------------------------------------------
7229 | Returns the result of dividing the quadruple-precision floating-point value
7230 | `a' by the corresponding value `b'.  The operation is performed according to
7231 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7232 *----------------------------------------------------------------------------*/
7233 
7234 float128 float128_div(float128 a, float128 b, float_status *status)
7235 {
7236     flag aSign, bSign, zSign;
7237     int32_t aExp, bExp, zExp;
7238     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7239     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7240 
7241     aSig1 = extractFloat128Frac1( a );
7242     aSig0 = extractFloat128Frac0( a );
7243     aExp = extractFloat128Exp( a );
7244     aSign = extractFloat128Sign( a );
7245     bSig1 = extractFloat128Frac1( b );
7246     bSig0 = extractFloat128Frac0( b );
7247     bExp = extractFloat128Exp( b );
7248     bSign = extractFloat128Sign( b );
7249     zSign = aSign ^ bSign;
7250     if ( aExp == 0x7FFF ) {
7251         if (aSig0 | aSig1) {
7252             return propagateFloat128NaN(a, b, status);
7253         }
7254         if ( bExp == 0x7FFF ) {
7255             if (bSig0 | bSig1) {
7256                 return propagateFloat128NaN(a, b, status);
7257             }
7258             goto invalid;
7259         }
7260         return packFloat128( zSign, 0x7FFF, 0, 0 );
7261     }
7262     if ( bExp == 0x7FFF ) {
7263         if (bSig0 | bSig1) {
7264             return propagateFloat128NaN(a, b, status);
7265         }
7266         return packFloat128( zSign, 0, 0, 0 );
7267     }
7268     if ( bExp == 0 ) {
7269         if ( ( bSig0 | bSig1 ) == 0 ) {
7270             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7271  invalid:
7272                 float_raise(float_flag_invalid, status);
7273                 return float128_default_nan(status);
7274             }
7275             float_raise(float_flag_divbyzero, status);
7276             return packFloat128( zSign, 0x7FFF, 0, 0 );
7277         }
7278         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7279     }
7280     if ( aExp == 0 ) {
7281         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7282         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7283     }
7284     zExp = aExp - bExp + 0x3FFD;
7285     shortShift128Left(
7286         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
7287     shortShift128Left(
7288         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
7289     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
7290         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
7291         ++zExp;
7292     }
7293     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
7294     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
7295     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
7296     while ( (int64_t) rem0 < 0 ) {
7297         --zSig0;
7298         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
7299     }
7300     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
7301     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
7302         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
7303         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
7304         while ( (int64_t) rem1 < 0 ) {
7305             --zSig1;
7306             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
7307         }
7308         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7309     }
7310     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
7311     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7312 
7313 }
7314 
7315 /*----------------------------------------------------------------------------
7316 | Returns the remainder of the quadruple-precision floating-point value `a'
7317 | with respect to the corresponding value `b'.  The operation is performed
7318 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7319 *----------------------------------------------------------------------------*/
7320 
/*
 * Implementation notes for float128_rem:
 *   - Only the sign of `a' is extracted; the sign of `b' is never read.
 *   - NaN operands are handed to propagateFloat128NaN().  An infinite `a',
 *     or a zero `b', raises float_flag_invalid and returns the default NaN.
 *   - When none of the above applies, an infinite `b', a zero `a', or an `a'
 *     whose exponent is at least two below that of `b' returns `a' unchanged.
 *   - Otherwise the significand of `a' is reduced against that of `b' by
 *     quotient-estimate/subtract steps and the result is handed to
 *     normalizeRoundAndPackFloat128().
 */
7321 float128 float128_rem(float128 a, float128 b, float_status *status)
7322 {
7323     flag aSign, zSign;
7324     int32_t aExp, bExp, expDiff;
7325     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
7326     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
7327     int64_t sigMean0;
7328 
         /* Split the operands into fraction words (0 = high, 1 = low),
          * exponent and, for `a' only, sign. */
7329     aSig1 = extractFloat128Frac1( a );
7330     aSig0 = extractFloat128Frac0( a );
7331     aExp = extractFloat128Exp( a );
7332     aSign = extractFloat128Sign( a );
7333     bSig1 = extractFloat128Frac1( b );
7334     bSig0 = extractFloat128Frac0( b );
7335     bExp = extractFloat128Exp( b );
         /* Maximum exponent in `a': a non-zero fraction means NaN, otherwise
          * `a' is an infinity. */
7336     if ( aExp == 0x7FFF ) {
7337         if (    ( aSig0 | aSig1 )
7338              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7339             return propagateFloat128NaN(a, b, status);
7340         }
             /* `a' is infinite and `b' is not a NaN. */
7341         goto invalid;
7342     }
         /* Maximum exponent in `b': propagate a NaN; for an infinite `b' the
          * (finite) `a' is returned as is. */
7343     if ( bExp == 0x7FFF ) {
7344         if (bSig0 | bSig1) {
7345             return propagateFloat128NaN(a, b, status);
7346         }
7347         return a;
7348     }
7349     if ( bExp == 0 ) {
7350         if ( ( bSig0 | bSig1 ) == 0 ) {
                 /* `b' is zero.  This label is also the target of the
                  * infinite-`a' case above. */
7351  invalid:
7352             float_raise(float_flag_invalid, status);
7353             return float128_default_nan(status);
7354         }
             /* `b' is subnormal; the helper rewrites bExp/bSig0/bSig1
              * (presumably into normalized form -- helper not visible here). */
7355         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7356     }
7357     if ( aExp == 0 ) {
             /* A zero `a' is returned unchanged; a subnormal `a' is normalized. */
7358         if ( ( aSig0 | aSig1 ) == 0 ) return a;
7359         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7360     }
7361     expDiff = aExp - bExp;
         /* With an exponent gap below -1, `a' is returned unchanged. */
7362     if ( expDiff < -1 ) return a;
         /* OR in bit 48 of the high word (the implicit leading bit) and shift
          * left by 15; `a' is shifted one place less when expDiff is -1. */
7363     shortShift128Left(
7364         aSig0 | LIT64( 0x0001000000000000 ),
7365         aSig1,
7366         15 - ( expDiff < 0 ),
7367         &aSig0,
7368         &aSig1
7369     );
7370     shortShift128Left(
7371         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
         /* First quotient bit: subtract b's significand once if it does not
          * exceed a's. */
7372     q = le128( bSig0, bSig1, aSig0, aSig1 );
7373     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7374     expDiff -= 64;
         /* Main reduction: each pass lowers expDiff by 61.  The quotient
          * estimate is backed off by 4 (floored at 0) before q * b is
          * subtracted -- presumably so the estimate never overshoots; confirm
          * against estimateDiv128To64().  The words shifted out into allZero
          * are discarded. */
7375     while ( 0 < expDiff ) {
7376         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7377         q = ( 4 < q ) ? q - 4 : 0;
7378         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7379         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
7380         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
7381         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
7382         expDiff -= 61;
7383     }
         /* Final partial step: only the top (64 + expDiff) bits of the
          * estimate are kept (q >>= -expDiff).  Both significands end up 12
          * bits lower than before, whichever branch is taken. */
7384     if ( -64 < expDiff ) {
7385         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7386         q = ( 4 < q ) ? q - 4 : 0;
7387         q >>= - expDiff;
7388         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7389         expDiff += 52;
7390         if ( expDiff < 0 ) {
7391             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7392         }
7393         else {
7394             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
7395         }
7396         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7397         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
7398     }
7399     else {
7400         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
7401         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7402     }
         /* Keep subtracting b until the running value turns negative.  The
          * last non-negative value is remembered in alternateASig0/1, and q
          * counts the subtractions. */
7403     do {
7404         alternateASig0 = aSig0;
7405         alternateASig1 = aSig1;
7406         ++q;
7407         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7408     } while ( 0 <= (int64_t) aSig0 );
         /* sigMean = (negative candidate) + (non-negative candidate).  A
          * negative sum means the non-negative candidate is smaller in
          * magnitude, so it is selected; on an exact tie it is selected only
          * when q is odd. */
7409     add128(
7410         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
7411     if (    ( sigMean0 < 0 )
7412          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
7413         aSig0 = alternateASig0;
7414         aSig1 = alternateASig1;
7415     }
         /* A negative result is negated and its sign folded into the sign of
          * `a' for the packed value. */
7416     zSign = ( (int64_t) aSig0 < 0 );
7417     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
7418     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
7419                                          status);
7420 }
7421 
7422 /*----------------------------------------------------------------------------
7423 | Returns the square root of the quadruple-precision floating-point value `a'.
7424 | The operation is performed according to the IEC/IEEE Standard for Binary
7425 | Floating-Point Arithmetic.
7426 *----------------------------------------------------------------------------*/
7427 
/*
 * Implementation notes for float128_sqrt:
 *   - A NaN input goes through propagateFloat128NaN(a, a, ...).
 *   - +infinity and -0 are returned unchanged; +0 is returned as a freshly
 *     packed +0.
 *   - Any other negative input (including -infinity) raises
 *     float_flag_invalid and returns the default NaN.
 *   - Otherwise a 64-bit root estimate is corrected against the remainder,
 *     a second 64-bit word is derived, and the result is rounded and packed
 *     with sign 0.
 */
7428 float128 float128_sqrt(float128 a, float_status *status)
7429 {
7430     flag aSign;
7431     int32_t aExp, zExp;
7432     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
7433     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7434 
7435     aSig1 = extractFloat128Frac1( a );
7436     aSig0 = extractFloat128Frac0( a );
7437     aExp = extractFloat128Exp( a );
7438     aSign = extractFloat128Sign( a );
         /* Maximum exponent: NaN when the fraction is non-zero, else infinity. */
7439     if ( aExp == 0x7FFF ) {
7440         if (aSig0 | aSig1) {
7441             return propagateFloat128NaN(a, a, status);
7442         }
7443         if ( ! aSign ) return a;
7444         goto invalid;
7445     }
7446     if ( aSign ) {
             /* Negative zero is returned unchanged; every other negative
              * value is invalid. */
7447         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
7448  invalid:
7449         float_raise(float_flag_invalid, status);
7450         return float128_default_nan(status);
7451     }
7452     if ( aExp == 0 ) {
7453         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7454         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7455     }
         /* Halve the exponent after removing the 0x3FFF offset, then add
          * 0x3FFE back.  NOTE(review): the >> acts on a possibly negative
          * int, which ISO C leaves implementation-defined. */
7456     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
         /* Set bit 48 of the high word (the implicit leading bit). */
7457     aSig0 |= LIT64( 0x0001000000000000 );
         /* First estimate from estimateSqrt32(), then refined with a
          * division estimate.  The significand is shifted left by 13, or by
          * 12 when aExp is odd. */
7458     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7459     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7460     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7461     doubleZSig0 = zSig0<<1;
         /* rem = a - zSig0^2.  While it is negative the estimate is too
          * large: step zSig0 down by one and add 2*zSig0 + 1 back to the
          * remainder (zSig0>>63 supplies the bit lost from doubleZSig0). */
7462     mul64To128( zSig0, zSig0, &term0, &term1 );
7463     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7464     while ( (int64_t) rem0 < 0 ) {
7465         --zSig0;
7466         doubleZSig0 -= 2;
7467         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7468     }
         /* Second result word, estimated from the remainder. */
7469     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
         /* The exact correction below runs only when the low 13 bits of the
          * estimate are <= 5 -- presumably the only cases where estimate
          * error could affect rounding; confirm against the helper's error
          * bound. */
7470     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7471         if ( zSig1 == 0 ) zSig1 = 1;
7472         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7473         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7474         mul64To128( zSig1, zSig1, &term2, &term3 );
7475         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
             /* Step zSig1 down until the remainder is non-negative again. */
7476         while ( (int64_t) rem1 < 0 ) {
7477             --zSig1;
7478             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7479             term3 |= 1;
7480             term2 |= doubleZSig0;
7481             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7482         }
             /* Fold any non-zero remainder into the lowest bit of zSig1. */
7483         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7484     }
         /* Shift the result right by 14, collecting the shifted-out bits in
          * zSig2, then round and pack with a positive sign. */
7485     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7486     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7487 
7488 }
7489 
7490 /*----------------------------------------------------------------------------
7491 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7492 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7493 | raised if either operand is a NaN.  Otherwise, the comparison is performed
7494 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7495 *----------------------------------------------------------------------------*/
7496 
7497 int float128_eq(float128 a, float128 b, float_status *status)
7498 {
7499 
7500     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7501               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7502          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7503               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7504        ) {
7505         float_raise(float_flag_invalid, status);
7506         return 0;
7507     }
7508     return
7509            ( a.low == b.low )
7510         && (    ( a.high == b.high )
7511              || (    ( a.low == 0 )
7512                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7513            );
7514 
7515 }
7516 
7517 /*----------------------------------------------------------------------------
7518 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7519 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
7520 | exception is raised if either operand is a NaN.  The comparison is performed
7521 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7522 *----------------------------------------------------------------------------*/
7523 
7524 int float128_le(float128 a, float128 b, float_status *status)
7525 {
7526     flag aSign, bSign;
7527 
7528     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7529               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7530          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7531               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7532        ) {
7533         float_raise(float_flag_invalid, status);
7534         return 0;
7535     }
7536     aSign = extractFloat128Sign( a );
7537     bSign = extractFloat128Sign( b );
7538     if ( aSign != bSign ) {
7539         return
7540                aSign
7541             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7542                  == 0 );
7543     }
7544     return
7545           aSign ? le128( b.high, b.low, a.high, a.low )
7546         : le128( a.high, a.low, b.high, b.low );
7547 
7548 }
7549 
7550 /*----------------------------------------------------------------------------
7551 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7552 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7553 | raised if either operand is a NaN.  The comparison is performed according
7554 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7555 *----------------------------------------------------------------------------*/
7556 
7557 int float128_lt(float128 a, float128 b, float_status *status)
7558 {
7559     flag aSign, bSign;
7560 
7561     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7562               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7563          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7564               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7565        ) {
7566         float_raise(float_flag_invalid, status);
7567         return 0;
7568     }
7569     aSign = extractFloat128Sign( a );
7570     bSign = extractFloat128Sign( b );
7571     if ( aSign != bSign ) {
7572         return
7573                aSign
7574             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7575                  != 0 );
7576     }
7577     return
7578           aSign ? lt128( b.high, b.low, a.high, a.low )
7579         : lt128( a.high, a.low, b.high, b.low );
7580 
7581 }
7582 
7583 /*----------------------------------------------------------------------------
7584 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7585 | be compared, and 0 otherwise.  The invalid exception is raised if either
7586 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7587 | Standard for Binary Floating-Point Arithmetic.
7588 *----------------------------------------------------------------------------*/
7589 
7590 int float128_unordered(float128 a, float128 b, float_status *status)
7591 {
7592     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7593               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7594          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7595               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7596        ) {
7597         float_raise(float_flag_invalid, status);
7598         return 1;
7599     }
7600     return 0;
7601 }
7602 
7603 /*----------------------------------------------------------------------------
7604 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7605 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7606 | exception.  The comparison is performed according to the IEC/IEEE Standard
7607 | for Binary Floating-Point Arithmetic.
7608 *----------------------------------------------------------------------------*/
7609 
7610 int float128_eq_quiet(float128 a, float128 b, float_status *status)
7611 {
7612 
7613     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7614               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7615          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7616               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7617        ) {
7618         if (float128_is_signaling_nan(a, status)
7619          || float128_is_signaling_nan(b, status)) {
7620             float_raise(float_flag_invalid, status);
7621         }
7622         return 0;
7623     }
7624     return
7625            ( a.low == b.low )
7626         && (    ( a.high == b.high )
7627              || (    ( a.low == 0 )
7628                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7629            );
7630 
7631 }
7632 
7633 /*----------------------------------------------------------------------------
7634 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7635 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
7636 | cause an exception.  Otherwise, the comparison is performed according to the
7637 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7638 *----------------------------------------------------------------------------*/
7639 
7640 int float128_le_quiet(float128 a, float128 b, float_status *status)
7641 {
7642     flag aSign, bSign;
7643 
7644     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7645               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7646          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7647               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7648        ) {
7649         if (float128_is_signaling_nan(a, status)
7650          || float128_is_signaling_nan(b, status)) {
7651             float_raise(float_flag_invalid, status);
7652         }
7653         return 0;
7654     }
7655     aSign = extractFloat128Sign( a );
7656     bSign = extractFloat128Sign( b );
7657     if ( aSign != bSign ) {
7658         return
7659                aSign
7660             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7661                  == 0 );
7662     }
7663     return
7664           aSign ? le128( b.high, b.low, a.high, a.low )
7665         : le128( a.high, a.low, b.high, b.low );
7666 
7667 }
7668 
7669 /*----------------------------------------------------------------------------
7670 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7671 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7672 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7673 | Standard for Binary Floating-Point Arithmetic.
7674 *----------------------------------------------------------------------------*/
7675 
7676 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7677 {
7678     flag aSign, bSign;
7679 
7680     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7681               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7682          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7683               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7684        ) {
7685         if (float128_is_signaling_nan(a, status)
7686          || float128_is_signaling_nan(b, status)) {
7687             float_raise(float_flag_invalid, status);
7688         }
7689         return 0;
7690     }
7691     aSign = extractFloat128Sign( a );
7692     bSign = extractFloat128Sign( b );
7693     if ( aSign != bSign ) {
7694         return
7695                aSign
7696             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7697                  != 0 );
7698     }
7699     return
7700           aSign ? lt128( b.high, b.low, a.high, a.low )
7701         : lt128( a.high, a.low, b.high, b.low );
7702 
7703 }
7704 
7705 /*----------------------------------------------------------------------------
7706 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7707 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7708 | comparison is performed according to the IEC/IEEE Standard for Binary
7709 | Floating-Point Arithmetic.
7710 *----------------------------------------------------------------------------*/
7711 
7712 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7713 {
7714     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7715               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7716          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7717               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7718        ) {
7719         if (float128_is_signaling_nan(a, status)
7720          || float128_is_signaling_nan(b, status)) {
7721             float_raise(float_flag_invalid, status);
7722         }
7723         return 1;
7724     }
7725     return 0;
7726 }
7727 
7728 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7729                                             int is_quiet, float_status *status)
7730 {
7731     flag aSign, bSign;
7732 
7733     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7734         float_raise(float_flag_invalid, status);
7735         return float_relation_unordered;
7736     }
7737     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7738           ( extractFloatx80Frac( a )<<1 ) ) ||
7739         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7740           ( extractFloatx80Frac( b )<<1 ) )) {
7741         if (!is_quiet ||
7742             floatx80_is_signaling_nan(a, status) ||
7743             floatx80_is_signaling_nan(b, status)) {
7744             float_raise(float_flag_invalid, status);
7745         }
7746         return float_relation_unordered;
7747     }
7748     aSign = extractFloatx80Sign( a );
7749     bSign = extractFloatx80Sign( b );
7750     if ( aSign != bSign ) {
7751 
7752         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7753              ( ( a.low | b.low ) == 0 ) ) {
7754             /* zero case */
7755             return float_relation_equal;
7756         } else {
7757             return 1 - (2 * aSign);
7758         }
7759     } else {
7760         if (a.low == b.low && a.high == b.high) {
7761             return float_relation_equal;
7762         } else {
7763             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7764         }
7765     }
7766 }
7767 
7768 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7769 {
7770     return floatx80_compare_internal(a, b, 0, status);
7771 }
7772 
7773 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7774 {
7775     return floatx80_compare_internal(a, b, 1, status);
7776 }
7777 
7778 static inline int float128_compare_internal(float128 a, float128 b,
7779                                             int is_quiet, float_status *status)
7780 {
7781     flag aSign, bSign;
7782 
7783     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7784           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7785         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7786           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7787         if (!is_quiet ||
7788             float128_is_signaling_nan(a, status) ||
7789             float128_is_signaling_nan(b, status)) {
7790             float_raise(float_flag_invalid, status);
7791         }
7792         return float_relation_unordered;
7793     }
7794     aSign = extractFloat128Sign( a );
7795     bSign = extractFloat128Sign( b );
7796     if ( aSign != bSign ) {
7797         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7798             /* zero case */
7799             return float_relation_equal;
7800         } else {
7801             return 1 - (2 * aSign);
7802         }
7803     } else {
7804         if (a.low == b.low && a.high == b.high) {
7805             return float_relation_equal;
7806         } else {
7807             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7808         }
7809     }
7810 }
7811 
7812 int float128_compare(float128 a, float128 b, float_status *status)
7813 {
7814     return float128_compare_internal(a, b, 0, status);
7815 }
7816 
7817 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7818 {
7819     return float128_compare_internal(a, b, 1, status);
7820 }
7821 
7822 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7823 {
7824     flag aSign;
7825     int32_t aExp;
7826     uint64_t aSig;
7827 
7828     if (floatx80_invalid_encoding(a)) {
7829         float_raise(float_flag_invalid, status);
7830         return floatx80_default_nan(status);
7831     }
7832     aSig = extractFloatx80Frac( a );
7833     aExp = extractFloatx80Exp( a );
7834     aSign = extractFloatx80Sign( a );
7835 
7836     if ( aExp == 0x7FFF ) {
7837         if ( aSig<<1 ) {
7838             return propagateFloatx80NaN(a, a, status);
7839         }
7840         return a;
7841     }
7842 
7843     if (aExp == 0) {
7844         if (aSig == 0) {
7845             return a;
7846         }
7847         aExp++;
7848     }
7849 
7850     if (n > 0x10000) {
7851         n = 0x10000;
7852     } else if (n < -0x10000) {
7853         n = -0x10000;
7854     }
7855 
7856     aExp += n;
7857     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7858                                          aSign, aExp, aSig, 0, status);
7859 }
7860 
7861 float128 float128_scalbn(float128 a, int n, float_status *status)
7862 {
7863     flag aSign;
7864     int32_t aExp;
7865     uint64_t aSig0, aSig1;
7866 
7867     aSig1 = extractFloat128Frac1( a );
7868     aSig0 = extractFloat128Frac0( a );
7869     aExp = extractFloat128Exp( a );
7870     aSign = extractFloat128Sign( a );
7871     if ( aExp == 0x7FFF ) {
7872         if ( aSig0 | aSig1 ) {
7873             return propagateFloat128NaN(a, a, status);
7874         }
7875         return a;
7876     }
7877     if (aExp != 0) {
7878         aSig0 |= LIT64( 0x0001000000000000 );
7879     } else if (aSig0 == 0 && aSig1 == 0) {
7880         return a;
7881     } else {
7882         aExp++;
7883     }
7884 
7885     if (n > 0x10000) {
7886         n = 0x10000;
7887     } else if (n < -0x10000) {
7888         n = -0x10000;
7889     }
7890 
7891     aExp += n - 1;
7892     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7893                                          , status);
7894 
7895 }
7896